aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/arm_conv
diff options
context:
space:
mode:
Diffstat (limited to 'src/core/NEON/kernels/arm_conv')
-rw-r--r--src/core/NEON/kernels/arm_conv/addressing.cpp333
-rw-r--r--src/core/NEON/kernels/arm_conv/addressing.hpp263
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp308
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp66
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp700
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp315
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp604
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp356
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp539
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp165
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp408
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp351
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp97
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp362
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp234
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp244
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp244
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp152
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp82
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp161
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp53
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp59
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp135
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp135
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp723
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp697
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp1158
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp1291
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp1736
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp2007
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp895
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp897
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp1387
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp1427
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp520
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp52
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp1044
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp527
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp513
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp828
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp905
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp1232
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp1397
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp615
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp629
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp991
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp1043
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp376
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp533
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp917
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp52
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp850
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp1658
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1166
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1397
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp73
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp2187
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp618
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp519
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp640
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp52
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp1480
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp1484
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp1658
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1166
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1397
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp2187
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp618
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp519
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp640
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp52
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp1480
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1164
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1395
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp2185
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1166
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp73
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1397
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp2187
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp618
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp52
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp1480
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp67
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp336
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp277
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp67
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp483
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp444
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp67
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp672
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp653
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp67
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp374
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp318
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp67
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp586
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp537
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp336
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp277
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp483
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp444
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp672
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp653
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp374
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp318
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp455
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp650
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp883
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp1172
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp560
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp763
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp1151
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp1246
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp664
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp881
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp1204
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp1354
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp664
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp881
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp1204
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp1354
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp664
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp881
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp1204
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp1354
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp316
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp296
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp460
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp477
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp656
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp714
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp337
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp337
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp523
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp551
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp316
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp296
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp460
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp477
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp656
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp714
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp337
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp337
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp71
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp523
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp551
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp166
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp259
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp392
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp52
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp454
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp497
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp410
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp451
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp652
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp339
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp402
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp436
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp497
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp410
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp451
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp652
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp339
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp61
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp402
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp410
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp451
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp62
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp652
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp84
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp461
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp299
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp295
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp409
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp179
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp399
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp295
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp361
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp179
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp351
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp428
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp179
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp491
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp434
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp734
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp424
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp179
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp491
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp493
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp916
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp15
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp16
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp209
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp233
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp225
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp209
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp233
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp225
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp419
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp225
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp460
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp388
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp419
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp148
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp225
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp489
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp46
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp418
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp208
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp264
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp132
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp276
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp208
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp264
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp132
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp276
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp365
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp132
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp276
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp449
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp560
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp365
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp132
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp276
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp461
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp658
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp454
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp312
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp412
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp256
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp126
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp126
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp23
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp107
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp74
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp113
-rw-r--r--src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp74
379 files changed, 130509 insertions, 8063 deletions
diff --git a/src/core/NEON/kernels/arm_conv/addressing.cpp b/src/core/NEON/kernels/arm_conv/addressing.cpp
new file mode 100644
index 0000000000..2460398880
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/addressing.cpp
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "addressing.hpp"
+#include "utils.hpp"
+#include <algorithm>
+#include <cstring>
+
+namespace arm_conv {
+namespace addressing {
+
+void fill_pointer_array(
+ size_t element_size,
+ void **dest_raw, const unsigned int array_rows, const unsigned int array_cols,
+ void *base_ptr_raw, size_t ld_row, size_t ld_col,
+ void *pad_buffer_raw,
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols
+)
+{
+ auto dest = reinterpret_cast<char **>(dest_raw);
+ auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
+ auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);
+ ld_row *= element_size;
+ ld_col *= element_size;
+
+ const auto last_valid_row = std::min(pad_top + valid_rows, array_rows);
+ const auto last_valid_col = std::min(pad_left + valid_cols, array_cols);
+
+ unsigned int i = 0;
+ for (; i < pad_top; i++)
+ {
+ for (unsigned int j = 0; j < array_cols; j++)
+ {
+ *(dest++) = pad_buffer;
+ }
+ }
+ for (; i < last_valid_row; i++)
+ {
+ unsigned int j = 0;
+ auto colptr = base_ptr;
+ base_ptr += ld_row;
+
+ for (; j < pad_left; j++)
+ {
+ *(dest++) = pad_buffer;
+ }
+ for (; j < last_valid_col; j++)
+ {
+ *(dest++) = colptr;
+ colptr += ld_col;
+ }
+ for (; j < array_cols; j++)
+ {
+ *(dest++) = pad_buffer;
+ }
+ }
+ for (; i < array_rows; i++)
+ {
+ for (unsigned int j = 0; j < array_cols; j++)
+ {
+ *(dest++) = pad_buffer;
+ }
+ }
+}
+
+
+void fill_pointer_array_generic_kernel(
+ const size_t element_size,
+ void **dest_raw,
+ const unsigned int output_rows, const unsigned int output_cols,
+ const unsigned int kernel_rows, const unsigned int kernel_cols,
+ const unsigned int stride_rows, const unsigned int stride_cols,
+ void *base_ptr_raw, size_t ld_row, size_t ld_col,
+ void *pad_buffer_raw,
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols
+)
+{
+ auto dest = reinterpret_cast<char **>(dest_raw);
+ auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
+ auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);
+ ld_row *= element_size;
+ ld_col *= element_size;
+
+ const auto last_valid_row = pad_top + valid_rows;
+ const auto last_valid_col = pad_left + valid_cols;
+ const auto point_stride = output_rows * output_cols;
+
+ // Iterate over the output points, after every point increment the pointer
+ // into the address array.
+ for (unsigned int oi = 0; oi < output_rows; oi++)
+ {
+ for (unsigned int oj = 0; oj < output_cols; oj++)
+ {
+ auto point_dest = dest;
+ dest++;
+
+ // Iterate over kernel points and fill in the pointer array.
+ unsigned int ki = 0, ii = oi*stride_rows;
+ for (; ii < pad_top && ki < kernel_rows; ii++, ki++)
+ {
+ // Fill with padding
+ for (unsigned int j = 0; j < kernel_cols; j++)
+ {
+ *point_dest = pad_buffer;
+ point_dest += point_stride;
+ }
+ }
+ for (; ii < last_valid_row && ki < kernel_rows; ii++, ki++)
+ {
+ unsigned int kj = 0, ij = oj*stride_cols;
+ for (; ij < pad_left && kj < kernel_cols; ij++, kj++)
+ {
+ // Padding
+ *point_dest = pad_buffer;
+ point_dest += point_stride;
+ }
+ for (; ij < last_valid_col && kj < kernel_cols; ij++, kj++)
+ {
+ *point_dest = base_ptr + (ii - pad_top)*ld_row + (ij - pad_left)*ld_col;
+ point_dest += point_stride;
+ }
+ for (; kj < kernel_cols; kj++)
+ {
+ // Padding
+ *point_dest = pad_buffer;
+ point_dest += point_stride;
+ }
+ }
+ for (; ki < kernel_rows; ki++)
+ {
+ // Fill with padding
+ for (unsigned int j = 0; j < kernel_cols; j++)
+ {
+ *point_dest = pad_buffer;
+ point_dest += point_stride;
+ }
+ }
+ }
+ }
+}
+
+/* Patch array constructor
+ *
+ * Some depthwise kernels require an NCHW-ordered patch of input. Here we
+ * construct such a patch, and fill in an array of pointers to the rows of the
+ * patch.
+ */
+void fill_nchw_patch_array(
+ size_t element_size,
+ const void **dest_row_pointers_raw, // Array of pointers to each row of the patch
+ void *dest_patch_raw, // Pointer to space which can be used to construct the patch
+ const unsigned int patch_rows, unsigned int patch_cols, // Patch size
+ const void *src_ptr_raw, size_t ld_row, size_t ld_col, // Source tensor
+ const void *pad_row, // Pointer to a row of padding values
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols
+)
+{
+ // Convert into more useful types
+ auto row_pointers = reinterpret_cast<const char **>(dest_row_pointers_raw);
+ auto dest_patch = reinterpret_cast<char *>(dest_patch_raw);
+ auto src = reinterpret_cast<const char *>(src_ptr_raw);
+ ld_row *= element_size;
+ ld_col *= element_size;
+
+ // Round up the patch columns to be a full quad
+ patch_cols = arm_gemm::roundup<unsigned int>(patch_cols, 16 / element_size);
+
+ const auto last_valid_row = std::min(pad_top + valid_rows, patch_rows);
+ const auto last_valid_col = std::min(pad_left + valid_cols, patch_cols);
+
+ // Construct the patch and row pointer array together
+ unsigned int i = 0;
+ for (; i < pad_top; i++)
+ {
+ // Insert pointers into the padding row
+ *(row_pointers++) = reinterpret_cast<const char *>(pad_row);
+ }
+ for (; i < last_valid_row; i++)
+ {
+ // Get a copy of the pointer for this row
+ auto colptr = src;
+ src += ld_row;
+
+ // If the input is already in NCHW format (ld_col == element_size) AND
+ // there is no padding, then we just use a pointer to the source tensor;
+ // otherwise we need to construct a patch and provide a pointer to it.
+ if (ld_col == element_size && pad_left == 0 && last_valid_col == patch_cols)
+ {
+ *(row_pointers++) = colptr;
+ }
+ else
+ {
+ auto patch_col = dest_patch;
+ *(row_pointers++) = dest_patch;
+ dest_patch += element_size * patch_cols; // Move the patch pointer on
+
+ // Construct the patch; fill the entirety with padding and then copy in
+ // the valid elements.
+ memcpy(patch_col, pad_row, element_size * patch_cols);
+ patch_col += pad_left * element_size; // Move over the left padding
+
+ if (ld_col == element_size)
+ {
+ // If the input is NCHW then copy across as many columns as we can.
+ memcpy(patch_col, colptr, (last_valid_col - pad_left) * element_size);
+ }
+ else
+ {
+ // If the input is NHWC then copy columns across in turn.
+ for (auto j = pad_left; j < last_valid_col; j++)
+ {
+ memcpy(patch_col, colptr, element_size); // Copy the valid element
+ patch_col += element_size; // Progress the patch destination
+ colptr += ld_col; // Progress the patch source
+ }
+ }
+ }
+ }
+ for (; i < patch_rows; i++)
+ {
+ // Insert pointers into the padding row
+ *(row_pointers++) = reinterpret_cast<const char *>(pad_row);
+ }
+}
+
+
+/* Patch array constructor (generic kernels)
+ *
+ * Construct an array of pointers; one pointer for each output row for each
+ * kernel point. Pointers should point at a whole number of QUADS containing an
+ * input point for each output point. If the kernel column stride is 1 and the
+ * data is NCHW then the input tensor might be addressed directly, otherwise a
+ * new patch sample might need to be constructed.
+ */
+void fill_patch_array_generic_kernel(
+ size_t element_size,
+ const void **dest_pointers_raw, // Pointers: one per output row per kernel point
+ void *patch_raw, // Pointer to space which can be used to construct the patch
+ const unsigned int output_rows, const unsigned int output_cols,
+ const unsigned int kernel_rows, const unsigned int kernel_cols,
+ const unsigned int stride_rows, const unsigned int stride_cols,
+ const void *src_ptr_raw, size_t ld_row, size_t ld_col, // Source tensor
+ const void *pad_row, // Pointer to a row of padding values
+ const unsigned int pad_top, const unsigned int valid_rows,
+ const unsigned int pad_left, const unsigned int valid_cols
+)
+{
+ auto dest = reinterpret_cast<const char **>(dest_pointers_raw);
+ auto patch = reinterpret_cast<char *>(patch_raw);
+ auto src_ptr = reinterpret_cast<const char *>(src_ptr_raw);
+ ld_row *= element_size;
+ ld_col *= element_size;
+
+ // Round up the patch columns to a multiple of quad-length
+ const auto patch_cols = arm_gemm::roundup<unsigned int>(output_cols, 16 / element_size);
+
+ const auto input_rows = kernel_rows + (output_rows - 1) * stride_rows;
+ const auto last_valid_row = std::min(pad_top + valid_rows, input_rows);
+
+ const auto input_cols = kernel_cols + (output_cols - 1) * stride_cols;
+ const auto last_valid_col = std::min(pad_left + valid_cols, input_cols);
+
+ for (auto ki = 0u; ki < kernel_rows; ki++)
+ {
+ for (auto kj = 0u; kj < kernel_cols; kj++)
+ {
+ auto oi = 0u, ii = ki;
+ for (; oi < output_rows && ii < pad_top; oi++, ii += stride_rows)
+ {
+ // Insert a pointer to the padding row
+ *(dest++) = reinterpret_cast<const char *>(pad_row);
+ }
+ for (; oi < output_rows && ii < last_valid_row; oi++, ii += stride_rows)
+ {
+ auto rowptr = src_ptr + (ii - pad_top) * ld_row;
+
+ // Construct a sample of the input here
+ auto patch_pos = patch;
+ *(dest++) = patch;
+ patch += patch_cols * element_size;
+
+ // Fill with padding
+ memcpy(patch_pos, pad_row, patch_cols * element_size);
+
+ // Fill in the valid elements
+ auto oj = 0u, ij = kj;
+ for (; oj < patch_cols && ij < pad_left; oj++, ij += stride_cols)
+ {
+ // Do nothing for padding
+ patch_pos += element_size;
+ }
+ for (; oj < patch_cols && ij < last_valid_col; oj++, ij += stride_cols)
+ {
+ // Copy from the source tensor
+ memcpy(patch_pos, rowptr + (ij - pad_left)*ld_col, element_size);
+ patch_pos += element_size;
+ }
+ // No action required for right-hand padding
+ }
+ for (; oi < output_rows; oi++)
+ {
+ *(dest++) = reinterpret_cast<const char *>(pad_row);
+ }
+ }
+ }
+}
+
+} // namespace addressing
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/addressing.hpp b/src/core/NEON/kernels/arm_conv/addressing.hpp
new file mode 100644
index 0000000000..35715a3764
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/addressing.hpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* arm_conv kernels share a lot of similarities in how they address input and
+ * output tensors. Consequently, this file contains common approaches to
+ * preparing these tensor descriptions. Generic (i.e., untyped) methods are
+ * contained within the `arm_conv::addressing` namespace, and typed wrappers
+ * are provided within an anonymous namespace within `arm_conv`. The various
+ * methods are described below.
+ */
+
+#include <cstddef>
+
+namespace arm_conv {
+namespace addressing {
+
+/* Pointer array
+ * -------------
+ *
+ * Constructs an array of pointers which point to a `array_rows` x `array_cols`
+ * chunk of a tensor. The array of pointers will be written into `dest`.
+ *
+ * `base_ptr` should point at the first VALID element of the chunk of tensor
+ * (i.e., if there's one padded row, and one padded column, then `base_ptr`
+ * should point at the element which will be at position (1, 1) in the array).
+ * `ld_row` and `ld_col` are in bytes, and describe the strides over rows and
+ * columns (respectively) of the NHWC-ordered tensor. `pad_buffer` should point
+ * at a suitably sized (and initialised) area of memory which can be addressed
+ * by elements of the array which represent padding.
+ *
+ * `pad_top` and `pad_left` describe the padding on the top and left of the
+ * array, respectively, and `valid_rows` and `valid_cols` describe the number
+ * of rows and columns between the element pointed to by `base_ptr` and the
+ * edge of the image (that is `valid_rows` may be greater than `array_rows` and
+ * likewise for the columns).
+ */
+void fill_pointer_array(
+ size_t element_size,
+ void **dest, unsigned int array_rows, unsigned int array_cols,
+ void *base_ptr, size_t ld_row, size_t ld_col,
+ void *pad_buffer,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+);
+
+/* Interleaved multi-point pointer array
+ * -------------------------------------
+ *
+ * For each point in a `output_rows` x `output_cols` array, constructs
+ * `kernel_rows` x `kernel_cols` array of pointers. The pointers are
+ * interleaved thusly:
+ *
+ * for ki in kernel_rows:
+ * for kj in kernel_cols:
+ * for oi in output_rows:
+ * for oj in output_cols:
+ * get pointer for point (oi*stride_rows + ki, oj*stride_cols + kj)
+ *
+ * Other arguments are as for `fill_pointer_array`.
+ *
+ * The name reflects that this is the form of addressing mode used by "generic"
+ * depthwise and pooling kernels.
+ */
+void fill_pointer_array_generic_kernel(
+ size_t element_size,
+ void **dest,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ void *base_ptr, size_t ld_row, size_t ld_col,
+ void *pad_buffer,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+);
+
+/* NCHW-patch addressed by row
+ * ---------------------------
+ *
+ * Construct an array of pointers, each of which points at a row of an
+ * NCHW-ordered patch of a tensor. Memory addressed by the pointers may be
+ * outside of the original tensor, and should therefore not be written to
+ * (modifications will be lost).
+ *
+ * `dest_row_pointers` should point at a `patch_rows` list of pointers; each of
+ * which will point at a 1 x `patch_cols` NCHW-ordered sample of the source
+ * tensor.
+ *
+ * `dest_patch` should point to a `element_size * patch_rows * patch_cols` area
+ * of memory which can be written to by this function to form samples of the
+ * source tensor.
+ *
+ * `src_ptr` should point at the first VALID element of the chunk of tensor
+ * (i.e., if there's one padded row, and one padded column, then `src_ptr`
+ * should point at the element which will be at position (1, 1) in the array).
+ * `ld_row` and `ld_col` are in bytes, and describe the strides over rows and
+ * columns (respectively) of the NHWC-ordered tensor. If `ld_col` ==
+ * `element_size` then copies from the source tensor will be elided and source
+ * data may be addressed directly.
+ *
+ * `pad_row` should point to a `patch_cols` array of (appropriately
+ * initialised) padding values.
+ *
+ * Other arguments are as for `fill_pointer_array`.
+ */
+void fill_nchw_patch_array(
+ size_t element_size,
+ const void **dest_row_pointers, // Array of pointers to each row of the patch
+ void *dest_patch, // Pointer to space which can be used to construct the patch
+ unsigned int patch_rows, unsigned int patch_cols, // Patch size
+ const void *src_ptr, size_t ld_row, size_t ld_col, // Source tensor
+ const void *pad_row, // Pointer to a row of padding values
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+);
+
+void fill_patch_array_generic_kernel(
+ size_t element_size,
+ const void **dest_pointers, // Pointers: one per output row per kernel point
+ void *dest_patch, // Pointer to space which can be used to construct the patch
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ const void *src_ptr, size_t ld_row, size_t ld_col, // Source tensor
+ const void *pad_row, // Pointer to a row of padding values
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+);
+
+} // namespace addressing
+
+namespace {
+
+/* Pointer array
+ * -------------
+ *
+ * See `addressing::fill_pointer_array`. No copies are made by this method,
+ * memory pointed to by the pointer array is contained within the base tensor
+ * and the padding buffer.
+ */
+template <typename T>
+inline void fill_pointer_array(
+ T **dest, unsigned int array_rows, unsigned int array_cols,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ T *pad_buffer,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+)
+{
+ addressing::fill_pointer_array(
+ sizeof(T), (void **) dest, array_rows, array_cols,
+ (void *) base_ptr, ld_row, ld_col,
+ (void *) pad_buffer,
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+}
+
+
+/* Interleaved multi-point pointer array
+ * -------------------------------------
+ *
+ * See `addressing::fill_pointer_array_generic_kernel`. No copies are made by
+ * this method, memory pointed to by the pointer array is contained within the
+ * base tensor and the padding buffer.
+ */
+template <typename T>
+inline void fill_pointer_array_generic_kernel(
+ T **dest,
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ T *pad_buffer,
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+)
+{
+ addressing::fill_pointer_array_generic_kernel(
+ sizeof(T),
+ (void **) dest,
+ output_rows, output_cols,
+ kernel_rows, kernel_cols,
+ stride_rows, stride_cols,
+ (void *) base_ptr, ld_row, ld_col,
+ (void *) pad_buffer,
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+}
+
+template <typename T>
+inline void fill_nchw_patch_array(
+ const T **dest_row_pointers, // Array of pointers to each row of the patch
+ T *dest_patch, // Pointer to space which can be used to construct the patch
+ unsigned int patch_rows, unsigned int patch_cols, // Patch size
+ const T *src_ptr, size_t ld_row, size_t ld_col, // Source tensor
+ const T *pad_row, // Pointer to a row of padding values
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+)
+{
+ addressing::fill_nchw_patch_array(
+ sizeof(T),
+ reinterpret_cast<const void **>(dest_row_pointers),
+ reinterpret_cast<void *>(dest_patch),
+ patch_rows, patch_cols,
+ reinterpret_cast<const void *>(src_ptr), ld_row, ld_col,
+ reinterpret_cast<const void *>(pad_row),
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+}
+
+template <typename T>
+inline void fill_patch_array_generic_kernel(
+ const T **dest_pointers, // Pointers: one per output row per kernel point
+ T *dest_patch, // Pointer to space which can be used to construct the patch
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ const T *src_ptr, size_t ld_row, size_t ld_col, // Source tensor
+ const T *pad_row, // Pointer to a row of padding values
+ unsigned int pad_top, unsigned int valid_rows,
+ unsigned int pad_left, unsigned int valid_cols
+)
+{
+ addressing::fill_patch_array_generic_kernel(
+ sizeof(T),
+ reinterpret_cast<const void **>(dest_pointers),
+ reinterpret_cast<void *>(dest_patch),
+ output_rows, output_cols,
+ kernel_rows, kernel_cols,
+ stride_rows, stride_cols,
+ reinterpret_cast<const void *>(src_ptr), ld_row, ld_col,
+ reinterpret_cast<const void *>(pad_row),
+ pad_top, valid_rows,
+ pad_left, valid_cols
+ );
+}
+
+} // namespace {anonymous}
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
new file mode 100644
index 0000000000..95ece8cdc8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise.hpp"
+#include "utils.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Trait: default accumulator type for a given input element type.
+// Quantized (u)int8 inputs accumulate into int32_t; everything else
+// accumulates in its own type.
+template <typename T> struct DefaultTAccum { using Type = T; };
+template <> struct DefaultTAccum<int8_t> { using Type = int32_t; };
+template <> struct DefaultTAccum<uint8_t> { using Type = int32_t; };
+
+// Trait: default output stage for a given output element type.
+// Quantized (u)int8 outputs use arm_gemm::Requantize32; everything else
+// uses the empty `Nothing` stage.
+template <typename T> struct DefaultOutputStage { using Type = Nothing; };
+template <> struct DefaultOutputStage<int8_t> { using Type = arm_gemm::Requantize32; };
+template <> struct DefaultOutputStage<uint8_t> { using Type = arm_gemm::Requantize32; };
+
+// Minimal interface a depthfirst strategy must expose to the driver: the
+// fixed input/output tile geometry (in rows and columns) of its kernel.
+class IDepthfirstStrategy
+{
+ public:
+ virtual ~IDepthfirstStrategy() = default;
+
+ // Number of input rows/cols consumed per tile.
+ virtual unsigned int get_input_rows() const = 0;
+ virtual unsigned int get_input_cols() const = 0;
+
+ // Number of output rows/cols produced per tile.
+ virtual unsigned int get_output_rows() const = 0;
+ virtual unsigned int get_output_cols() const = 0;
+};
+
+
+// Lightweight view of a tensor: a base pointer (T is the pointer type)
+// plus row and column strides. Does not own the underlying storage.
+template <typename T>
+struct TensorSpec
+{
+ T base;
+ size_t ld_row, ld_col;
+
+ TensorSpec(T ptr, size_t ld_row, size_t ld_col)
+ : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
+};
+
+
+// Driver for depthfirst depthwise convolution: tiles the output plane using
+// the strategy's fixed tile geometry and dispatches each tile to the virtual
+// compute_* hooks implemented by derived classes. Threading is striped over
+// output-tile rows (or over batches when there is only one output row).
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
+{
+ protected:
+ using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
+
+ // The strategy which we're applying to solve the depthwise convolution.
+ // Owned by the driver (transferred in via the constructor's raw pointer).
+ std::unique_ptr<const IDepthfirstStrategy> m_strat;
+
+ /* Compute the amount of working space required for a single thread. */
+ virtual size_t get_working_size_per_thread() const = 0;
+
+ /* Initialise the working space for a thread. */
+ virtual void initialise_working_space(void *) const = 0;
+
+ /* Compute a portion of the output tensor with padding. */
+ virtual void compute_tile_padded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const = 0;
+
+ /* Compute a portion of the work with only top/bottom padding.
+ *
+ * The default implementation of this repeatedly calls into the padded tile
+ * variant.
+ */
+ virtual void compute_row_padded_tile_row(
+ const DepthwiseArgs &args,
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int output_channel_start, const unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const
+ {
+ // Walk one tile-row left to right; each tile advances output_j by the
+ // strategy's output-column count.
+ for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
+ {
+ this->compute_tile_padded(
+ args,
+ output_i, output_j, output_channel_start, output_channel_end,
+ input, output, parameters, working_space
+ );
+ }
+ }
+
+ /* Compute a portion of the output tensor with no padding.
+ *
+ * The default implementation of this repeatedly calls into the padded
+ * variant.
+ */
+ virtual void compute_tiles_unpadded(
+ const DepthwiseArgs &args,
+ unsigned int start_output_i, unsigned int start_output_j,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const
+ {
+ for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
+ {
+ unsigned int row_start_output_j = start_output_j;
+ for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
+ {
+ this->compute_tile_padded(
+ args,
+ start_output_i, row_start_output_j,
+ output_channel_start, output_channel_end,
+ input, output, parameters, working_space
+ );
+ row_start_output_j += m_strat->get_output_cols();
+ }
+ start_output_i += m_strat->get_output_rows();
+ }
+ }
+
+ // Main entry point (DepthwiseCommon override): partitions the work for
+ // this thread and walks batches, tile-rows and tile-columns, choosing the
+ // cheapest compute path (unpadded / row-padded / fully padded) per region.
+ void execute_internal(
+ const DepthwiseArgs &args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads
+ ) const override
+ {
+ // Get and initialise the working space for this thread.
+ void *thread_working_space =
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+ this->initialise_working_space(thread_working_space);
+
+ // Construct convenient representations of the input/output tensors.
+ TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
+ TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);
+
+ const auto n_output_channels = args.input_channels * args.channel_multiplier;
+
+ // By default we parallelize over the rows, but if there's only 1 row, we
+ // try to parallelize over batches
+ auto thread_id_for_rows = thread_id;
+ auto n_threads_for_rows = n_threads;
+ auto thread_id_for_batches = 0;
+ auto n_threads_for_batches = 1;
+ if (args.output_rows == 1) {
+ thread_id_for_rows = 0;
+ n_threads_for_rows = 1;
+ thread_id_for_batches = thread_id;
+ n_threads_for_batches = n_threads;
+ }
+
+ // Progress the pointers for the first batch.
+ input_tensor.base += ld_input_batch*thread_id_for_batches;
+ output_tensor.base += ld_output_batch*thread_id_for_batches;
+ for (unsigned int batch = thread_id_for_batches;
+ batch < args.n_batches;
+ batch += n_threads_for_batches)
+ {
+ // Iterate over rows of the output tensor; we stripe over the tiles.
+ for (unsigned int start_output_i = thread_id_for_rows * m_strat->get_output_rows();
+ start_output_i < args.output_rows;
+ start_output_i += n_threads_for_rows * m_strat->get_output_rows())
+ {
+ // Determine what (if any padding) is required on the top/bottom of
+ // this row of the convolution.
+ const auto end_output_i = start_output_i + m_strat->get_output_rows();
+ const bool pad_output_bottom = args.output_rows < end_output_i;
+
+ const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
+ const bool pad_input_top = start_input_i < 0;
+ const int end_input_i = start_input_i + m_strat->get_input_rows();
+ const bool pad_input_bottom = static_cast<int>(args.input_rows) < end_input_i;
+ // We only need to account for input padding if direct padding is not supported.
+ const bool pad_row = ((pad_input_top || pad_input_bottom) && !this->supports_direct_padding())
+ || pad_output_bottom;
+
+ // Iterate over the columns of the output tensor; we attempt to grab as
+ // much as possible of the unpadded regions, so the loop structure is a
+ // bit odd.
+ unsigned int start_output_j = 0;
+ while (start_output_j < args.output_cols)
+ {
+ const int start_in_j = start_output_j * args.stride_cols - args.padding.left;
+ const bool pad_input_left = start_in_j < 0;
+
+ // Determine if we can process a number of unpadded tiles in one go.
+ int n_unpadded_tiles = 0;
+ if ((!pad_input_left) || this->supports_direct_padding())
+ {
+ // Determine the maximum number of tiles we could handle.
+ n_unpadded_tiles = (args.output_cols - start_output_j) / m_strat->get_output_cols();
+
+ // Handle padding on the right hand edge
+ const int tile_stride = m_strat->get_output_cols() * args.stride_cols;
+ int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
+ int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;
+
+ // Shrink the run until neither the output nor the input overruns
+ // the right-hand edge of its tensor.
+ while (n_unpadded_tiles > 0 &&
+ (static_cast<int>(args.output_cols) < end_output_j ||
+ static_cast<int>(args.input_cols) < end_input_j))
+ {
+ n_unpadded_tiles--;
+ end_output_j -= m_strat->get_output_cols();
+ end_input_j -= tile_stride;
+ }
+ }
+
+ // Process unpadded tiles, if possible, otherwise process a padded tile.
+ if (n_unpadded_tiles)
+ {
+ if (!pad_row)
+ {
+ // Completely unpadded execution
+ this->compute_tiles_unpadded(
+ args,
+ start_output_i, start_output_j,
+ 1, n_unpadded_tiles, // Compute a row of unpadded tiles
+ 0, n_output_channels, // Compute all channels
+ input_tensor, output_tensor, parameters, thread_working_space
+ );
+ }
+ else
+ {
+ // Top/bottom padding only
+ this->compute_row_padded_tile_row(
+ args,
+ start_output_i, start_output_j, n_unpadded_tiles,
+ 0, n_output_channels, // Compute all channels
+ input_tensor, output_tensor, parameters, thread_working_space
+ );
+ }
+ start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
+ }
+ else
+ {
+ this->compute_tile_padded(
+ args,
+ start_output_i, start_output_j,
+ 0, n_output_channels, // Compute all channels
+ input_tensor, output_tensor, parameters, thread_working_space
+ );
+ start_output_j += m_strat->get_output_cols();
+ }
+ }
+ }
+
+ // Progress the pointers for the next batch.
+ input_tensor.base += ld_input_batch*n_threads_for_batches;
+ output_tensor.base += ld_output_batch*n_threads_for_batches;
+ }
+ }
+
+ public:
+ // Takes ownership of `strategy`.
+ DepthfirstDriver(IDepthfirstStrategy *strategy, const DepthwiseArgs &args)
+ : Parent(args), m_strat(strategy)
+ {
+ }
+
+ // Total working space: one per-thread slice for each thread.
+ size_t get_working_size(unsigned int n_threads) const override final
+ {
+ return n_threads * this->get_working_size_per_thread();
+ }
+
+ // Whether the implementation can consume tiles whose input would need
+ // padding without the driver accounting for it. Defaults to false;
+ // derived classes may override.
+ virtual bool supports_direct_padding() const
+ {
+ return false;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
new file mode 100644
index 0000000000..2950d5e957
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_common.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthwise_common.hpp"
+
+#include "utils.hpp"
+
+using arm_gemm::iceildiv;
+
+namespace arm_conv {
+namespace depthwise {
+
+// For the d-th slice of a dilated convolution (0 <= d < dilation_factor),
+// compute the reduced (non-dilated) view of the problem. Returns a tuple of
+// (reduced output size, reduced input size, input start offset, padding
+// before, padding after) so that the slice can be executed as an ordinary
+// convolution with the given kernel size and stride.
+std::tuple<size_t, size_t, size_t, size_t, size_t>
+get_reduced_view_for_dilation(size_t out_size, size_t in_size, const size_t d,
+ const size_t dilation_factor,
+ const size_t kernel_size, const size_t stride,
+ const size_t orig_pad_before) {
+ // Get the valid output range
+ out_size = iceildiv(out_size - d, dilation_factor);
+
+ // Compute the start offset and the amount of padding which applies to this
+ // portion of the work.
+ size_t start_pos = d * stride, pad_before = 0;
+ if (start_pos < orig_pad_before) {
+ pad_before = iceildiv(orig_pad_before - start_pos, dilation_factor);
+ }
+ // NOTE(review): relies on pad_before*dilation_factor >= orig_pad_before -
+ // start_pos (guaranteed by the iceildiv above) so this cannot underflow.
+ start_pos += pad_before * dilation_factor - orig_pad_before;
+
+ // Hence compute the valid input range
+ in_size = start_pos < in_size
+ ? iceildiv(in_size - start_pos, dilation_factor)
+ : 0;
+
+ // Finally, compute the "after" padding
+ const size_t reqd_input = (out_size - 1) * stride + kernel_size;
+ size_t pad_after = 0;
+ if (reqd_input > (pad_before + in_size)) {
+ pad_after = reqd_input - (pad_before + in_size);
+ }
+
+ return std::make_tuple(out_size, in_size, start_pos, pad_before, pad_after);
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
new file mode 100644
index 0000000000..7b00c9a7af
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
@@ -0,0 +1,700 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
+#include "depthwise_strategies_common.hpp"
+#include "working_space.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+#include <limits>
+
+namespace arm_conv {
+namespace depthwise {
+
+// Strategy base that simply stores and reports the tile geometry (output
+// size, kernel size, stride); input geometry is derived by the parent.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
+ typename OutputStage>
+class DepthwiseDepthfirstStrategyCommon
+ : public DepthfirstStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ protected:
+ unsigned int m_output_rows, m_output_cols;
+ unsigned int m_kernel_rows, m_kernel_cols;
+ unsigned int m_stride_rows, m_stride_cols;
+
+ public:
+ DepthwiseDepthfirstStrategyCommon(
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows=1, unsigned int stride_cols=1
+ ) : m_output_rows(output_rows), m_output_cols(output_cols),
+ m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
+ m_stride_rows(stride_rows), m_stride_cols(stride_cols)
+ {
+ }
+
+ // Square-tile convenience constructor: same size/stride in both dimensions.
+ DepthwiseDepthfirstStrategyCommon(unsigned int output_size, unsigned int kernel_size, unsigned int stride=1)
+ : DepthwiseDepthfirstStrategyCommon(output_size, output_size, kernel_size, kernel_size, stride, stride)
+ {
+ }
+
+ virtual ~DepthwiseDepthfirstStrategyCommon() {}
+
+ unsigned int get_output_rows() const override { return m_output_rows; }
+ unsigned int get_output_cols() const override { return m_output_cols; }
+
+ unsigned int get_kernel_rows() const override { return m_kernel_rows; }
+ unsigned int get_kernel_cols() const override { return m_kernel_cols; }
+
+ unsigned int get_stride_rows() const override { return m_stride_rows; }
+ unsigned int get_stride_cols() const override { return m_stride_cols; }
+};
+
+// Strategy interface for non-quantized execution: exposes two kernel entry
+// points - an "indirect" kernel driven through pointer arrays, and a
+// "direct" kernel driven by base pointers plus strides over whole tiles.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+
+ public:
+ using Parent::Parent;
+
+ // Kernel consuming per-element input/output pointer arrays.
+ typedef void (*IndirectKernelType)(
+ const TInput *const *input_ptrs,
+ TOutput *const *output_ptrs,
+ const void *params,
+ unsigned int n_channels,
+ const TAccum activation_min,
+ const TAccum activation_max
+ );
+ virtual IndirectKernelType get_indirect_kernel(void) const = 0;
+
+ // Kernel consuming base pointers + strides for a grid of unpadded tiles.
+ typedef void (*DirectKernelType)(
+ const unsigned int n_tile_rows, const unsigned int n_tile_cols,
+ const TInput *inptr_base, int64_t ld_input_row, int64_t ld_input_col,
+ TOutput *outptr_base, int64_t ld_output_row, int64_t ld_output_col,
+ const void *params, unsigned int n_channels,
+ const TAccum activation_min,
+ const TAccum activation_max
+ );
+ virtual DirectKernelType get_direct_kernel(void) const = 0;
+};
+
+// Partial specialisation for int32_t accumulators (quantized execution with
+// arm_gemm::Requantize32): a single requantizing kernel replaces the
+// indirect/direct pair, and parameter packing goes through the generic
+// interleave helpers.
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthwiseDepthfirstStrategy<TInput, TWeight, TOutput, int32_t>
+: public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
+
+ protected:
+ // Describe how this strategy's weights are to be interleaved/packed.
+ interleaves::PackingArguments get_packing_args(void) const
+ {
+ return interleaves::PackingArguments(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ false, sizeof(int32_t), this->uses_premultiply(), // Don't pack the bias
+ this->get_vl_type(), sizeof(int32_t), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ }
+
+ public:
+ using Parent::Parent;
+
+ // Requantizing kernel: bias, quantization parameters and per-channel
+ // multipliers/shifts are passed alongside the pointer arrays.
+ typedef void (*KernelType)(
+ unsigned int, // n_channels,
+ const TInput *const *, // inptrs
+ const TWeight *, // weights
+ const int32_t *, // bias,
+ const arm_gemm::Requantize32 &,
+ const int32_t *, const int32_t *, // requant_muls and requant_shifts
+ TOutput *const * // outptrs
+ );
+ virtual KernelType get_kernel() const = 0;
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(get_packing_args(), args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const arm_gemm::Requantize32 &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleaves::pack_parameters_generic(
+ get_packing_args(), args, buffer, biases, weights, ld_weight_col, ld_weight_row);
+ }
+};
+
+// Driver base shared by the depthfirst implementations: owns the output
+// stage and implements input-pointer-array setup, including the optional
+// channel-premultiplication path through an intermediate buffer.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+class DepthwiseDepthfirstCommon : public DepthfirstDriver<TInput, TWeight, TOutput>
+{
+ using StratType = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ OutputStage m_os;
+
+ protected:
+ inline OutputStage &get_output_stage(void) { return m_os; }
+ inline const OutputStage &get_output_stage(void) const { return m_os; }
+
+ // True when the input must first be expanded channel-wise (channel
+ // multiplier > 1 and the strategy premultiplies) into an intermediate
+ // array before the kernel runs.
+ bool uses_intermediate_array() const
+ {
+ return this->m_args.channel_multiplier != 1 && this->uses_premultiply();
+ }
+
+ // Hook for derived classes: populate `inptr_array` from the given input
+ // view, substituting `input_buffer` for padded elements.
+ virtual void fill_inptr_array(const DepthwiseArgs &args,
+ const TensorSpec<const TInput *> &input,
+ const TInput **inptr_array, TInput *input_buffer,
+ const unsigned int input_i, const unsigned int input_j,
+ const unsigned int input_pad_top, const unsigned int input_pad_left) const = 0;
+
+ // Build the input view for one tile (either a premultiplied copy in
+ // `intermediate_buffer`, or a strided view straight into the source
+ // tensor) and then delegate to fill_inptr_array.
+ void initialise_inptr_array(const DepthwiseArgs &args,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TInput **inptr_array, TInput *input_buffer, TInput *intermediate_buffer,
+ const unsigned int input_i, const unsigned int input_j,
+ const unsigned int input_pad_top, const unsigned int input_pad_left,
+ Tile<TInput> &multiplied_input
+ ) const
+ {
+ // Compute the input pointer array
+ const auto input_channel_start = output_channel_start / args.channel_multiplier;
+
+ // Clamp the tile extent to the valid (non-padded) region of the input.
+ const auto last_valid_row = std::min(input_pad_top + args.input_rows - input_i, this->m_strat->get_input_rows());
+ const auto last_valid_col = std::min(input_pad_left + args.input_cols - input_j, this->m_strat->get_input_cols());
+
+ const auto tile_rows = last_valid_row - input_pad_top;
+ const auto tile_cols = last_valid_col - input_pad_left;
+
+ const auto tile_channels = output_channel_end - output_channel_start;
+
+ TensorSpec<const TInput *> tile_tensor(0, 0, 0);
+ if (this->uses_intermediate_array()) {
+ // Premultiply the input channels into the intermediate buffer and
+ // view that buffer as a densely-packed tile.
+ multiplied_input = Tile<TInput>(intermediate_buffer, tile_rows, tile_cols, tile_channels);
+ multiplied_input.load_from(input.base, input.ld_row, input.ld_col,
+ args.input_rows, args.input_cols,
+ input_i, input_j, args.channel_multiplier);
+
+ tile_tensor = TensorSpec<const TInput *>(
+ multiplied_input.array,
+ tile_cols * tile_channels, tile_channels
+ );
+ } else {
+ // View directly into the source tensor at this tile's origin.
+ tile_tensor = TensorSpec<const TInput *>(
+ input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
+ input.ld_row, input.ld_col
+ );
+ }
+
+ fill_inptr_array(args,
+ tile_tensor,
+ inptr_array, input_buffer,
+ input_i, input_j,
+ input_pad_top,
+ input_pad_left
+ );
+ }
+
+ public:
+ // Takes ownership of `strat` (via the DepthfirstDriver base).
+ DepthwiseDepthfirstCommon(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os)
+ : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
+ {
+ }
+
+ DepthwiseDepthfirstCommon(DepthwiseDepthfirstCommon &) = delete;
+ DepthwiseDepthfirstCommon &operator=(DepthwiseDepthfirstCommon &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ return reinterpret_cast<const StratType *>(this->m_strat.get())->
+ get_storage_size(this->m_args);
+ }
+
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ reinterpret_cast<const StratType *>(this->m_strat.get())->
+ pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
+ }
+};
+
+namespace depthwise_depthfirst {
+
+/* Workspace Element for an array of input pointers as consumed by the
+ * specialised depthwise kernels.
+ */
+template <typename T>
+class InputArrayElement
+{
+ public:
+ struct Workspace
+ {
+ const T **inptr_array;
+ };
+
+ // Size of the pointer array: one pointer per input-tile element.
+ template <class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ // NOTE(review): sizeof(T **) is used where the array actually stores
+ // `const T *`; both are object-pointer types of identical size, so the
+ // computed size is correct either way.
+ return sizeof(T **) * args.strategy->get_input_rows() * args.strategy->get_input_cols();
+ }
+
+ // Carve this element's pointer array out of `buffer`; returns the pointer
+ // just past the space consumed.
+ template <class WorkspaceType, class OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ ws->inptr_array = reinterpret_cast<const T**>(buffer);
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+// Selects the final workspace element: activation bounds by default, or the
+// requantization parameters for non-dot-product int32/Requantize32 execution.
+template <typename TAccum, typename OutputStage, bool IsDot=false>
+struct WorkspaceFinalElement
+{
+ using Element = ActivationsElement<TAccum, OutputStage>;
+};
+
+template <>
+struct WorkspaceFinalElement<int32_t, arm_gemm::Requantize32, false>
+{
+ using Element = RequantizationParametersElement;
+};
+
+// Adapter which marshals workspace contents into the strategy's kernel
+// signature. The primary template covers non-requantized execution and
+// supports both the indirect and direct kernel forms.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+struct Invoke
+{
+ constexpr static bool supports_direct_kernel = true;
+
+ // Call the pointer-array kernel; activation bounds come from the workspace.
+ template <typename Strat, typename Workspace>
+ static inline void indirect(const Strat *strat, const Workspace *ws, const OutputStage &, const void *params, const TAccum *, unsigned int n_channels)
+ {
+ strat->get_indirect_kernel()(
+ ws->inptr_array,
+ ws->outptr_array,
+ params, n_channels,
+ ws->activation_min, ws->activation_max
+ );
+ }
+
+ // Call the strided direct kernel over a grid of unpadded tiles.
+ template <typename Strat, typename Workspace>
+ static void direct(
+ const Strat *strat, const Workspace *ws, const OutputStage &,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col,
+ TOutput *outptr, size_t ld_out_row, size_t ld_out_col,
+ const void *params, unsigned int n_channels
+ )
+ {
+ strat->get_direct_kernel()(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_in_row, ld_in_col,
+ outptr, ld_out_row, ld_out_col,
+ params, n_channels, ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+// Specialisation for requantized execution: only the indirect form exists;
+// `direct` is a stub kept so callers can compile a single code path guarded
+// by `supports_direct_kernel`.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+struct Invoke<TInput, TWeight, TOutput, TAccum, arm_gemm::Requantize32>
+{
+ constexpr static bool supports_direct_kernel = false;
+
+ template <typename Strat, typename Workspace>
+ static inline void indirect(const Strat *strat, const Workspace *ws, const arm_gemm::Requantize32 &qp, const void *params, const TAccum *, unsigned int n_channels)
+ {
+ strat->get_kernel()(
+ n_channels, ws->inptr_array,
+ reinterpret_cast<const TWeight *>(params), ws->bias,
+ qp, ws->requant_muls, ws->requant_shifts,
+ ws->outptr_array
+ );
+ }
+
+ template <typename Strat, typename Workspace>
+ static inline void direct(
+ const Strat *, const Workspace *, const arm_gemm::Requantize32 &,
+ unsigned int, unsigned int, // n_tile_rows, n_tile_cols
+ const TInput *, size_t, size_t, // Input pointer, row stride, column stride
+ TOutput *, size_t, size_t, // Output pointer, row stride, column stride
+ const void *, unsigned int // Parameters, number of channels
+ )
+ {
+ // Do nothing - this should never be reached because entry to it is guarded
+ // by an `if` on a `constexpr static bool`.
+ }
+};
+
+namespace
+{
+
+// Record the bias pointer inside the output stage when it is a
+// Requantize32; a no-op for every other output stage.
+template <typename OutputStage>
+inline void stash_bias(OutputStage &, const void *) {}
+
+// Forward declaration of the specialisation, marked unused to silence
+// warnings in translation units which never requantize.
+template <>
+inline void stash_bias(arm_gemm::Requantize32 &qp, const void *bias) __attribute__ ((unused));
+
+template <>
+inline void stash_bias(arm_gemm::Requantize32 &qp, const void *bias)
+{
+ qp.bias = reinterpret_cast<const int32_t *>(bias);
+}
+
+}
+
+} // namespace depthwise_depthfirst
+
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirst
+: public DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using StratType = DepthwiseDepthfirstStrategy<TInput, TWeight, TOutput, TAccum>;
+ using Parent = DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ using WorkspaceManager = Workspace<
+ OutputArrayElement<TOutput>,
+ depthwise_depthfirst::InputArrayElement<TInput>,
+ InputBufferElement<TInput>,
+ IntermediateBufferElement<TInput>,
+ typename depthwise_depthfirst::WorkspaceFinalElement<TAccum, OutputStage>::Element
+ >;
+ using WorkingSpace = typename WorkspaceManager::WorkspaceType;
+
+ // We keep a copy of the bias and output stage
+ const TAccum *m_bias;
+
+ public:
+ DepthwiseDepthfirst(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : Parent(strat, args, os), m_bias(nullptr)
+ {
+ }
+
+ DepthwiseDepthfirst(DepthwiseDepthfirst &) = delete;
+ DepthwiseDepthfirst &operator=(DepthwiseDepthfirst &) = delete;
+
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ reinterpret_cast<const StratType *>(this->m_strat.get())->pack_parameters(
+ this->m_args, buffer, biases, this->get_output_stage(),
+ weights, ld_weight_col, ld_weight_row
+ );
+ m_bias = reinterpret_cast<const TAccum *>(biases);
+ depthwise_depthfirst::stash_bias(this->get_output_stage(), biases);
+ }
+
+ size_t get_working_size_per_thread() const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::get_sizeof_workspace(
+ WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage())
+ );
+ }
+
+ void initialise_working_space(void *buffer) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ WorkspaceManager::initialise(
+ buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage())
+ );
+ }
+
+ virtual bool supports_direct_padding() const override
+ {
+ using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ return Invoker::supports_direct_kernel && this->uses_intermediate_array();
+ }
+
+ protected:
+
+ void fill_inptr_array(const DepthwiseArgs &args,
+ const TensorSpec<const TInput *> &input,
+ const TInput **inptr_array, TInput *input_buffer,
+ const unsigned int input_i, const unsigned int input_j,
+ const unsigned int input_pad_top, const unsigned int input_pad_left) const override
+ {
+ fill_pointer_array<const TInput>(
+ inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+ input.base,
+ input.ld_row, input.ld_col,
+ input_buffer,
+ input_pad_top, args.input_rows - input_i,
+ input_pad_left, args.input_cols - input_j
+ );
+ }
+
+ void compute_tile_padded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ // Get the working space
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+
+ // Compute the input pointer array
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ Tile<TInput> multiplied_input;
+ this->initialise_inptr_array(args, output_channel_start, output_channel_end, input,
+ ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+ input_i, input_j, input_pad_top, input_pad_left, multiplied_input);
+
+ // Compute the output pointer array
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Execute the kernel
+ depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>::indirect(
+ reinterpret_cast<const StratType *>(this->m_strat.get()),
+ ws, this->get_output_stage(), parameters, m_bias, output_channel_end - output_channel_start
+ );
+ }
+
+ // Compute a full row of output tiles for which only top/bottom (row)
+ // padding can occur; there is no left/right padding within this row.
+ // The input/output pointer arrays are filled once for the first tile
+ // and then advanced by fixed point-strides for each subsequent tile,
+ // avoiding recomputing the arrays per tile.
+ void compute_row_padded_tile_row(
+ const DepthwiseArgs &args,
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int output_channel_start, const unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const override
+ {
+ using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space);
+ const auto strat = reinterpret_cast<const StratType *>(this->m_strat.get());
+ const auto os = this->get_output_stage();
+
+ // Compute top and bottom padding; hence fill in the initial pointer arrays.
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+ auto input_j = output_j * args.stride_cols - args.padding.left;
+
+ // Valid input rows is the smallest of the input rows that aren't padding for this tile, and the number of rows
+ // available.
+ const auto valid_input_rows = std::min(strat->get_input_rows() - input_pad_top, args.input_rows - input_i);
+ const auto valid_output_rows = std::min(strat->get_output_rows(), args.output_rows - output_i);
+
+ // Byte/element distance each pointer moves when advancing one tile to
+ // the right (input moves by stride_cols input columns per output column).
+ const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols;
+ const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
+
+ Tile<TInput> multiplied_input;
+ this->initialise_inptr_array(args, output_channel_start, output_channel_end, input,
+ ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+ input_i, input_j, input_pad_top, 0, multiplied_input);
+
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ for (; n_tile_cols; n_tile_cols--)
+ {
+ // Execute the kernel
+ Invoker::indirect(
+ strat, ws, os, parameters, m_bias, output_channel_end - output_channel_start
+ );
+
+ // Update all unpadded pointers
+ if (this->uses_intermediate_array()) {
+ // When the input is pre-multiplied into the intermediate buffer the
+ // pointer array targets that buffer, so reload the buffer for the
+ // next tile position instead of bumping the pointers.
+ input_j += input_point_stride / input.ld_col;
+ multiplied_input.load_from(input.base,
+ input.ld_row, input.ld_col,
+ args.input_rows, args.input_cols,
+ input_i, input_j, args.channel_multiplier);
+ } else {
+ {
+ // Advance only the pointers for non-padding rows; padded rows keep
+ // pointing at the (zero-filled) input buffer.
+ auto ptr = ws->inptr_array + strat->get_input_cols() * input_pad_top;
+ for (auto n = input_pad_top; n < (valid_input_rows + input_pad_top); n++)
+ {
+ for (auto m = 0u; m < strat->get_input_cols(); m++)
+ {
+ *(ptr++) += input_point_stride;
+ }
+ }
+ }
+ }
+
+ {
+ // Advance the output pointers for the valid (non-clipped) rows only.
+ auto ptr = ws->outptr_array;
+ for (auto n = 0u; n < valid_output_rows * strat->get_output_cols(); n++)
+ {
+ *(ptr++) += output_point_stride;
+ }
+ }
+ }
+ }
+
+ // Compute a rectangle of tiles which require no input or output padding.
+ // If the strategy supplies a "direct" kernel, the whole rectangle is
+ // dispatched in a single call; otherwise the padded (indirect) kernel is
+ // invoked per tile, with pointer arrays updated in-place rather than
+ // rebuilt for each tile.
+ void compute_tiles_unpadded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, const unsigned int output_j,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+ const auto strat = reinterpret_cast<const StratType *>(this->m_strat.get());
+ const auto os = this->get_output_stage();
+
+ if (Invoker::supports_direct_kernel)
+ {
+ // NOTE(review): this assumes "same"-style padding of kernel_size/2 on
+ // each edge when sizing the intermediate tile — confirm against the
+ // planner that only such cases reach the direct path.
+ PaddingValues tile_padding = {
+ args.kernel_cols / 2,
+ args.kernel_rows / 2,
+ args.kernel_cols / 2,
+ args.kernel_rows / 2
+ };
+
+ // If the direct kernel is supported, then use it.
+ // Compute the base pointers we'll use in the tile.
+ auto outptr = output.base + output_channel_start + output_i * output.ld_row + output_j * output.ld_col;
+ const int start_input_i = output_i * args.stride_rows - args.padding.top;
+ const int start_input_j = output_j * args.stride_cols - args.padding.left;
+ auto inptr = input.base + output_channel_start + start_input_i * input.ld_row + start_input_j * input.ld_col;
+
+ auto ld_row = input.ld_row;
+ auto ld_col = input.ld_col;
+
+ const auto tile_rows = this->m_strat->get_output_rows() * args.stride_rows * n_tile_rows + tile_padding.top + tile_padding.bottom;
+ const auto tile_cols = this->m_strat->get_output_cols() * args.stride_cols * n_tile_cols + tile_padding.left + tile_padding.right;
+ const auto tile_channels = output_channel_end - output_channel_start;
+
+ Tile<TInput> multiplied_input;
+ if (this->uses_intermediate_array()) {
+ // Pre-multiply the input channels into the intermediate buffer and
+ // redirect the kernel at that densely-packed tile.
+ multiplied_input = Tile<TInput>(ws->intermediate_buffer, tile_rows, tile_cols, tile_channels);
+ multiplied_input.load_from(input.base,
+ input.ld_row, input.ld_col,
+ args.input_rows, args.input_cols,
+ start_input_i, start_input_j, args.channel_multiplier);
+
+ // Leading dimensions of the packed intermediate tile (NHWC, dense).
+ ld_row = tile_cols * tile_channels;
+ ld_col = tile_channels;
+ inptr = multiplied_input.array;
+ }
+
+ // Execute the kernel
+ Invoker::direct(
+ strat, ws, os,
+ n_tile_rows, n_tile_cols,
+ inptr, ld_row, ld_col,
+ outptr, output.ld_row, output.ld_col,
+ parameters, output_channel_end - output_channel_start
+ );
+ }
+ else
+ {
+ // Otherwise, we repeatedly call the padded kernel but use our knowledge
+ // of the tensor structure to avoid recomputing the pointer array.
+
+ const auto n_input_pointers = this->m_strat->get_input_rows() * this->m_strat->get_input_cols();
+ const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * args.stride_cols;
+ const auto n_output_pointers = this->m_strat->get_output_rows() * this->m_strat->get_output_cols();
+ const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
+
+ // For each tile row, initialise the input and output pointer arrays. For
+ // each subsequent tile we simply update the pointers.
+ for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
+ {
+ const int input_i = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ int input_j = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+
+ Tile<TInput> multiplied_input;
+ this->initialise_inptr_array(args, output_channel_start, output_channel_end, input,
+ ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+ input_i, input_j, 0, 0, multiplied_input);
+
+ // Compute the output pointer array
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows,
+ 0, args.output_cols
+ );
+
+ for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
+ {
+ // Invoke the indirect kernel for this tile
+ depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>::indirect(
+ strat, ws, os, parameters, m_bias, output_channel_end - output_channel_start
+ );
+
+ // Progress the pointers
+ if (this->uses_intermediate_array()) {
+ // Reload the pre-multiplied tile for the next column position;
+ // the pointer array still targets the intermediate buffer.
+ input_j += input_point_stride / input.ld_col;
+ multiplied_input.load_from(input.base,
+ input.ld_row, input.ld_col,
+ args.input_rows, args.input_cols, input_i, input_j, args.channel_multiplier);
+ } else {
+ for (auto i = 0u; i < n_input_pointers; i++)
+ {
+ ws->inptr_array[i] += input_point_stride;
+ }
+ }
+
+ for (auto i = 0u; i < n_output_pointers; i++)
+ {
+ ws->outptr_array[i] += output_point_stride;
+ }
+ }
+
+ output_i += this->m_strat->get_output_rows();
+ }
+ }
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
new file mode 100644
index 0000000000..e2d05560a1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise_depthfirst.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Selects the signature of the "generic" depthwise kernel based on the
+// accumulator type.  The primary template (float-like accumulators) passes
+// activation min/max clamps; the int32_t specialisation below instead
+// passes requantisation parameters.
+template <typename TInput, typename TOutput, typename TAccum>
+struct GenericDepthfirstKernelStrategyFunctionType
+{
+ using KernelType = std::function<void(const TInput *const *const, TOutput *const *const, const void *, const void *, const unsigned int, const unsigned int, const TAccum, const TAccum)>;
+};
+
+// Quantized variant: the kernel receives a Requantize32 descriptor rather
+// than activation clamps.
+template <typename TInput, typename TOutput>
+struct GenericDepthfirstKernelStrategyFunctionType<TInput, TOutput, int32_t>
+{
+ using KernelType = std::function<void(const TInput *const *const, TOutput *const *const, const void *, const arm_gemm::Requantize32 &, unsigned int, unsigned int)>;
+};
+
+// Abstract description of a "generic" depthwise kernel: how many output
+// points it produces per call, which vector length type it targets, and
+// how many vector registers deep its accumulators are.  Concrete
+// strategies implement get_kernel() to return the callable kernel.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+class GenericDepthfirstKernelStrategy
+{
+ unsigned int m_n_output_points;
+ arm_gemm::VLType m_vl_type;
+ unsigned int m_accumulator_depth_vl;
+
+ public:
+ GenericDepthfirstKernelStrategy(unsigned int n_output_points, arm_gemm::VLType vl_type, unsigned int accumulator_depth_vl=1)
+ : m_n_output_points(n_output_points), m_vl_type(vl_type), m_accumulator_depth_vl(accumulator_depth_vl)
+ {
+ }
+
+ virtual ~GenericDepthfirstKernelStrategy() = default;
+
+ virtual arm_gemm::VLType get_vl_type() const { return m_vl_type; }
+ virtual unsigned int get_accumulator_depth_vl() const { return m_accumulator_depth_vl; }
+ virtual unsigned int get_n_output_points() const { return m_n_output_points; }
+
+ // Kernel signature varies with the accumulator type; see
+ // GenericDepthfirstKernelStrategyFunctionType.
+ using KernelType = typename GenericDepthfirstKernelStrategyFunctionType<TInput, TOutput, TAccum>::KernelType;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+// Depthfirst strategy wrapping a GenericDepthfirstKernelStrategy.  Owns the
+// kernel strategy and delegates parameter packing to the generic
+// interleaving routines (bias is NOT packed into the buffer here).
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class GenericDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ protected:
+ using KernelStrategyType = GenericDepthfirstKernelStrategy<TInput, TWeight, TOutput, TAccum>;
+ std::unique_ptr<KernelStrategyType> m_strategy;
+
+ public:
+ // Takes ownership of `strat` (stored in a unique_ptr).
+ GenericDepthfirstStrategy(
+ KernelStrategyType *strat, unsigned int n_output_rows, unsigned int n_output_cols,
+ const DepthwiseArgs &args
+ )
+ : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
+ n_output_rows, n_output_cols,
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols
+ ),
+ m_strategy(strat)
+ {
+ }
+
+ // Non-copyable (owns the kernel strategy).
+ // NOTE(review): copy-assignment is declared to return by value rather than
+ // by reference; harmless since it is deleted, but unconventional.
+ GenericDepthfirstStrategy(GenericDepthfirstStrategy &) = delete;
+ GenericDepthfirstStrategy operator=(GenericDepthfirstStrategy &) = delete;
+
+ arm_gemm::VLType get_vl_type(void) const override { return m_strategy->get_vl_type(); }
+ unsigned int get_accumulator_depth_vl(void) const override { return m_strategy->get_accumulator_depth_vl(); }
+
+ // Size (bytes) of the packed-parameter buffer required for `args`.
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ false, sizeof(TAccum), this->uses_premultiply(), // Don't pack the bias
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ return interleaves::get_storage_size_generic(packing_args, args);
+ }
+
+ // Interleave weights into `buffer`; the bias and output stage are unused
+ // here because the generic kernels take the bias separately at run time.
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ false, sizeof(TAccum), this->uses_premultiply(), // Don't pack the bias
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ interleaves::pack_parameters_generic(
+ packing_args, args, buffer, biases, weights, ld_weight_col, ld_weight_row);
+ }
+
+ const typename KernelStrategyType::KernelType get_kernel() const { return m_strategy->get_kernel(); }
+};
+
+// Use a templated function to marshal arguments when executing the kernel.
+template <typename OutputStage> struct DepthwiseDepthfirstGenericKernelCall;
+
+// No output stage: pass the bias pointer and the activation clamps held in
+// the working space straight through to the kernel.
+template <>
+struct DepthwiseDepthfirstGenericKernelCall<Nothing>
+{
+ template <typename StratType, typename WorkspaceType, typename TAccum>
+ static void execute(
+ const StratType *strat, const WorkspaceType *ws, const Nothing &,
+ const TAccum *bias, const void *params,
+ const unsigned int n_kernel_points, const unsigned int n_output_channels
+ )
+ {
+ strat->get_kernel()(
+ ws->inptr_array,
+ ws->outptr_array,
+ params, bias,
+ n_kernel_points, n_output_channels,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+// Requantized path: the bias argument is ignored (it is folded into the
+// quantisation parameters) and the Requantize32 descriptor is forwarded.
+template <>
+struct DepthwiseDepthfirstGenericKernelCall<arm_gemm::Requantize32>
+{
+ template <typename StratType, typename WorkspaceType>
+ static void execute(
+ const StratType *strat, const WorkspaceType *ws, const arm_gemm::Requantize32 &qp,
+ const int32_t *, const void *params,
+ const unsigned int n_kernel_points, const unsigned int n_output_channels
+ )
+ {
+ strat->get_kernel()(
+ ws->inptr_array,
+ ws->outptr_array,
+ params, qp,
+ n_kernel_points, n_output_channels
+ );
+ }
+};
+
+
+/* Workspace Element for an array of input pointers as consumed by the
+ * "Generic" depthwise kernels.
+ */
+template <typename T>
+class GenericInputArrayElement
+{
+ public:
+ struct Workspace
+ {
+ // One input pointer per (output point, kernel point) pair.
+ const T **inptr_array;
+ };
+
+ // Bytes needed for the pointer array: output points x kernel points.
+ template <class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
+ // NOTE(review): sizeof(T **) is used where sizeof(T *) is meant; the two
+ // are the same size on all supported targets, so this is benign.
+ return sizeof(T **) * args.strategy->get_output_rows() * args.strategy->get_output_cols() * kernel_points;
+ }
+
+ // Carve this element out of `buffer` and return the advanced pointer.
+ template <class WorkspaceType, class OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ ws->inptr_array = reinterpret_cast<const T**>(buffer);
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+// Depthwise depthfirst driver using the "generic" (any kernel size) kernels.
+// Differs from the specialised driver in that the input pointer array holds
+// one pointer per (output point, kernel point) and the bias is kept aside
+// and passed to the kernel at execution time.
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using StratType = GenericDepthfirstStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ using Parent = DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ // Layout of the per-thread working space: output pointers, generic input
+ // pointer array, padding input buffer, premultiply buffer, activations.
+ using WorkspaceManager = Workspace<
+ OutputArrayElement<TOutput>,
+ GenericInputArrayElement<TInput>,
+ InputBufferElement<TInput>,
+ IntermediateBufferElement<TInput>,
+ ActivationsElement<TAccum, OutputStage>
+ >;
+ using WorkingSpace = typename WorkspaceManager::WorkspaceType;
+ // Raw bias pointer captured in pack_parameters; nullptr until then.
+ const TAccum *m_bias = nullptr;
+
+ public:
+ DepthwiseDepthfirstGeneric(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os={})
+ : Parent(strat, args, os)
+ {
+ }
+
+ DepthwiseDepthfirstGeneric(DepthwiseDepthfirstGeneric &) = delete;
+ DepthwiseDepthfirstGeneric &operator=(DepthwiseDepthfirstGeneric &) = delete;
+
+ // Pack weights via the parent, then stash the bias pointer for run time
+ // (the generic kernels take the bias separately from the packed buffer).
+ void pack_parameters(
+ void *buffer, const void *biases,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) override
+ {
+ Parent::pack_parameters(buffer, biases, weights, ld_weight_col, ld_weight_row);
+ m_bias = reinterpret_cast<const TAccum *>(biases); // Get a copy of the biases
+ depthwise_depthfirst::stash_bias(this->get_output_stage(), m_bias);
+ }
+
+ size_t get_working_size_per_thread() const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage()));
+ }
+
+ void initialise_working_space(void *buffer) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage()));
+ }
+
+ protected:
+ // Fill the generic-kernel input pointer array: one entry per
+ // (output point, kernel point), with padded points redirected into
+ // `input_buffer`.
+ void fill_inptr_array(const DepthwiseArgs &args,
+ const TensorSpec<const TInput *> &input,
+ const TInput **inptr_array, TInput *input_buffer,
+ const unsigned int input_i, const unsigned int input_j,
+ const unsigned int input_pad_top, const unsigned int input_pad_left) const override
+ {
+ fill_pointer_array_generic_kernel<const TInput>(
+ inptr_array,
+ this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols,
+ input.base,
+ input.ld_row, input.ld_col,
+ input_buffer,
+ input_pad_top, args.input_rows - input_i,
+ input_pad_left, args.input_cols - input_j
+ );
+ }
+
+ // Compute one (possibly padded) output tile for a channel range.
+ void compute_tile_padded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, unsigned int output_j,
+ unsigned int channel_start, unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ // Get the working space
+ WorkingSpace *ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+
+ // Clamp the top-left input coordinate to the tensor and record how much
+ // implicit (padding) input precedes it.
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ Tile<TInput> multiplied_input;
+ this->initialise_inptr_array(args, channel_start, channel_end, input,
+ ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
+ input_i, input_j, input_pad_top, input_pad_left, multiplied_input);
+
+ // Compute the output pointer array
+ fill_pointer_array<TOutput>(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Execute the kernel
+ DepthwiseDepthfirstGenericKernelCall<OutputStage>::execute(
+ reinterpret_cast<const StratType *>(this->m_strat.get()), ws,
+ this->get_output_stage(), m_bias, parameters,
+ args.kernel_rows * args.kernel_cols,
+ channel_end - channel_start
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
new file mode 100644
index 0000000000..b93caa2aaa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise_depthfirst.hpp"
+#include "interleaves/generic_quantized_dot_product.hpp"
+
+#include <limits>
+
+namespace arm_conv {
+namespace depthwise {
+
+// Strategy base for channel-multiplier depthwise kernels (non-quantized).
+// Parameter packing ravels bias and weights together (note `true` below),
+// walking the kernel in row-major order via the packing-point lambda.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+class DepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>;
+
+ protected:
+ virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
+ {
+ return interleaves::PackingArguments(
+ args.kernel_rows, args.kernel_cols, sizeof(TWeight),
+ true, sizeof(TAccum), this->uses_premultiply(),
+ this->get_vl_type(),
+ sizeof(TAccum), 1,
+ // Enumerate kernel points row-major; x is the row, y the column.
+ [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
+ {
+ if (pos < args.kernel_rows * args.kernel_cols)
+ {
+ y = pos % args.kernel_cols;
+ x = pos / args.kernel_cols;
+ return true;
+ }
+ return false;
+ }
+ );
+ }
+
+ // Multiplier kernels consume the raw input directly; no premultiply pass.
+ bool uses_premultiply() const override {
+ return false;
+ }
+
+ public:
+ using Parent::Parent;
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
+ }
+
+ void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const Nothing &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
+ {
+ interleaves::pack_parameters_generic(
+ this->get_packing_args(args), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
+ }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const void *, // Ravelled bias, weights, and quantization parameters
+ unsigned int, // # output channels
+ TAccum, TAccum // Min and max activation clamps
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+// Quantized (int32 accumulator) specialisation: packing goes through the
+// quantized dot-product interleaver and the kernel takes a Requantize32
+// descriptor instead of activation clamps.
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t> : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
+
+ public:
+ using Parent::Parent;
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::quantized::get_storage_size(args, this->get_vl_type(), this->get_accumulator_depth_vl());
+ }
+
+ void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
+ {
+ interleaves::quantized::pack_parameters<TWeight>(
+ buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row,
+ args, qp, this->get_vl_type(), this->get_accumulator_depth_vl()
+ );
+ }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const void *, // Ravelled bias, weights, and quantization parameters
+ unsigned int, // # output channels
+ const arm_gemm::Requantize32 &
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+// Kernel description for the generic (any kernel size) multiplier kernels:
+// records output tile shape and vector length; concrete strategies supply
+// the callable via get_kernel().
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+class GenericDepthfirstMultiplierKernelStrategy
+{
+ const arm_gemm::VLType m_vl_type;
+ const unsigned int m_output_rows, m_output_cols;
+
+ public:
+ GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
+ : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
+ {
+ }
+
+ virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;
+
+ arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
+ unsigned int get_output_rows(void) const { return m_output_rows; }
+ unsigned int get_output_cols(void) const { return m_output_cols; }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const TWeight *, // Ravelled weight parameters
+ const TAccum *, // Bias,
+ unsigned int, unsigned int, // Number of kernel points, number of output channels
+ TAccum, TAccum // Activation minimum and maximum
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+// Quantized specialisation: kernel additionally receives per-channel
+// left-shifts, multipliers and right-shifts plus the Requantize32
+// descriptor, in place of activation clamps.
+template <typename TInput, typename TWeight, typename TOutput>
+class GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, int32_t>
+{
+ const arm_gemm::VLType m_vl_type;
+ const unsigned int m_output_rows, m_output_cols;
+
+ public:
+ GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
+ : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
+ {
+ }
+
+ virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;
+
+ arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
+ unsigned int get_output_rows(void) const { return m_output_rows; }
+ unsigned int get_output_cols(void) const { return m_output_cols; }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const TWeight *, // Ravelled weight parameters
+ const int32_t *, // Bias,
+ unsigned int, unsigned int, // Number of kernel points, number of output channels
+ const int32_t *, const int32_t *, const int32_t *, // Per-channel left-shifts, multipliers, right-shifts (need to account for start channel)
+ const arm_gemm::Requantize32 &
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+// Strategy wrapping a GenericDepthfirstMultiplierKernelStrategy; output tile
+// shape is taken from the kernel strategy.  Packing does NOT ravel the bias
+// into the buffer (note `false` below) — the kernel takes it separately.
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class GenericDepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using KernelStrategyType = GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, TAccum>;
+ std::unique_ptr<KernelStrategyType> m_kern;
+
+ protected:
+ virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
+ {
+ return interleaves::PackingArguments(
+ args.kernel_rows, args.kernel_cols, sizeof(TWeight),
+ false, sizeof(TAccum), this->uses_premultiply(),
+ this->get_vl_type(),
+ sizeof(TAccum), 1,
+ // Enumerate kernel points row-major; x is the row, y the column.
+ [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
+ {
+ if (pos < args.kernel_rows * args.kernel_cols)
+ {
+ y = pos % args.kernel_cols;
+ x = pos / args.kernel_cols;
+ return true;
+ }
+ return false;
+ }
+ );
+ }
+
+ bool uses_premultiply() const override {
+ return false;
+ }
+
+ public:
+ // Takes ownership of `kern`.
+ GenericDepthfirstMultiplierStrategy(KernelStrategyType *kern, const DepthwiseArgs &args)
+ : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
+ kern->get_output_rows(), kern->get_output_cols(),
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols
+ ),
+ m_kern(kern)
+ {
+ }; // NOTE(review): stray ';' after the constructor body — harmless.
+
+ arm_gemm::VLType get_vl_type(void) const override { return m_kern->get_vl_type(); }
+ const typename KernelStrategyType::KernelType get_kernel(void) const { return m_kern->get_kernel(); }
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
+ }
+
+ void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
+ {
+ interleaves::pack_parameters_generic(
+ this->get_packing_args(args), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+// Specialise elements of the wrapper based on the type of kernel.
+namespace depthfirst_multiplier {
+
+/* Working space element which contains a pointer for each row of input, a row
+ * of padding, and a space which can be used to construct an NCHW-ordered patch
+ * of input.
+ */
+template <typename T, bool IsGeneric=false, typename OutputStage=Nothing>
+class InputPatchElement
+{
+ public:
+ struct Workspace
+ {
+ constexpr static bool InputPatchIsGeneric = IsGeneric;
+ const T **input_rows; // Pointer per input row (or per output/kernel point when generic)
+ T *input_padding; // One padding row, filled with the quantized-zero value
+ T *input_patch; // Scratch for the NCHW-ordered input patch
+ };
+
+ // Total bytes for this element: row pointers + padding row + patch.
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof_input_rows(args) + sizeof_input_padding(args) + sizeof_input_patch(args);
+ }
+
+ // Carve the three sub-buffers out of `buffer`, initialise the padding row
+ // with the output-stage's fill value, and return the advanced pointer.
+ template <class WorkspaceType>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ auto buffer_bytes = reinterpret_cast<char *>(buffer);
+
+ ws->input_rows = reinterpret_cast<const T **>(buffer_bytes);
+ buffer_bytes += sizeof_input_rows(args);
+
+ ws->input_padding = reinterpret_cast<T*>(buffer_bytes);
+ buffer_bytes += sizeof_input_padding(args);
+
+ ws->input_patch = reinterpret_cast<T*>(buffer_bytes);
+ buffer_bytes += sizeof_input_patch(args);
+
+ // Initialise the padding
+ memset(ws->input_padding,
+ get_input_buffer_fill_value(args.output_stage),
+ sizeof_input_padding(args));
+
+ return buffer_bytes;
+ }
+
+ protected:
+ static size_t sizeof_input_rows(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ // Generic kernels index by (output row, kernel point); specialised
+ // kernels index by input row only.
+ if (IsGeneric)
+ {
+ return sizeof(T *) * args.strategy->get_output_rows() * args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
+ }
+ else
+ {
+ return sizeof(T *) * args.strategy->get_input_rows();
+ }
+ }
+
+ static size_t sizeof_input_padding(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ // Round-up the number of columns to be a whole number of QUADS
+ auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
+ return sizeof(T) * input_cols;
+ }
+
+ static size_t sizeof_input_patch(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ if (IsGeneric)
+ {
+ // Round-up the number of columns to be a whole number of QUADS
+ auto output_cols = arm_gemm::roundup<size_t>(args.strategy->get_output_cols(), 16 / sizeof(T));
+ const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
+ return sizeof(T) * kernel_points * args.strategy->get_output_rows() * output_cols;
+ }
+ else
+ {
+ // Round-up the number of columns to be a whole number of QUADS
+ auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
+ return sizeof(T) * args.strategy->get_input_rows() * input_cols;
+ }
+ }
+};
+
+// Compile-time dispatcher that selects the strategy type and kernel-call
+// convention for the channel-multiplier drivers. Primary template: a
+// specialised (non-generic) kernel with activation min/max clamping taken
+// from the initialised workspace.
+template <bool IsGeneric, typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+struct StrategyType
+{
+ using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum>;
+
+ // Invokes the kernel with pre-gathered input row pointers, the output
+ // pointer array and the packed parameters. The output-stage and
+ // start-channel arguments are unused by this (non-quantized) variant.
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const OutputStage &, const unsigned int,
+ const void *parameters, const void *
+ )
+ {
+ strat->get_kernel()(
+ ws->input_rows,
+ ws->outptr_array,
+ parameters, args.channel_multiplier,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+// Generic (any kernel size) variant: passes the raw weights, the bias
+// (offset to the first channel handled by this call) and the number of
+// kernel points explicitly, since the kernel is not shape-specialised.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+struct StrategyType<true, TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const OutputStage &, const unsigned int start_output_channel,
+ const void *parameters, const void *bias
+ )
+ {
+ strat->get_kernel()(
+ ws->input_rows, ws->outptr_array,
+ reinterpret_cast<const TWeight *>(parameters),
+ // Bias may legitimately be absent; otherwise offset it to the
+ // first output channel covered by this invocation.
+ bias == nullptr ? nullptr : reinterpret_cast<const TAccum *>(bias) + start_output_channel,
+ strat->get_kernel_rows() * strat->get_kernel_cols(),
+ args.channel_multiplier,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+// Quantized (Requantize32) specialised variant: the requantization
+// parameters are passed straight through to the kernel in place of the
+// floating-point activation bounds.
+template <typename TInput, typename TWeight, typename TOutput>
+struct StrategyType<false, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const arm_gemm::Requantize32 &qp, const unsigned int,
+ const void *parameters, const void *
+ )
+ {
+ strat->get_kernel()(
+ ws->input_rows,
+ ws->outptr_array,
+ parameters, args.channel_multiplier,
+ qp
+ );
+ }
+};
+
+// Quantized generic variant: bias and per-channel quantization arrays are
+// read from the Requantize32 parameters (the separate bias argument is
+// ignored), each offset to the first channel handled by this call.
+template <typename TInput, typename TWeight, typename TOutput>
+struct StrategyType<true, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const arm_gemm::Requantize32 &qp, const unsigned int start_output_channel,
+ const void *parameters, const void *
+ )
+ {
+ // Offsets an optional per-channel array to this call's start channel;
+ // nullptr (per-layer quantization) is propagated unchanged.
+ auto get_ptr = [start_output_channel] (const int32_t *ptr) -> const int32_t *
+ {
+ return ptr == nullptr ? nullptr : ptr + start_output_channel;
+ };
+
+ strat->get_kernel()(
+ ws->input_rows, ws->outptr_array,
+ reinterpret_cast<const TWeight *>(parameters),
+ get_ptr(qp.bias),
+ strat->get_kernel_rows() * strat->get_kernel_cols(),
+ args.channel_multiplier,
+ get_ptr(qp.per_channel_left_shifts),
+ get_ptr(qp.per_channel_muls),
+ get_ptr(qp.per_channel_right_shifts),
+ qp
+ );
+ }
+};
+
+// Builds the staged input patch for one input channel before a kernel call;
+// specialised on whether the kernel is generic.
+template <bool IsGeneric> struct PrepareInputSample;
+
+// Non-generic kernels: stage a dense NCHW-style patch covering the whole
+// input window, substituting the shared padding row outside the valid area.
+template <> struct PrepareInputSample<false>
+{
+ template <typename WorkspaceType, typename StrategyType, typename T>
+ static void execute(
+ const DepthwiseArgs &, WorkspaceType *ws, const StrategyType *strat,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ const unsigned int input_pad_top, const unsigned int valid_rows,
+ const unsigned int input_pad_left, const unsigned int valid_cols
+ )
+ {
+ fill_nchw_patch_array(
+ ws->input_rows, ws->input_patch, strat->get_input_rows(), strat->get_input_cols(),
+ base_ptr, ld_row, ld_col,
+ ws->input_padding,
+ input_pad_top, valid_rows,
+ input_pad_left, valid_cols
+ );
+ }
+};
+
+// Generic kernels: stage one patch row per kernel point and output row,
+// so the kernel shape and strides must be passed explicitly.
+template <> struct PrepareInputSample<true>
+{
+ template <typename WorkspaceType, typename StrategyType, typename T>
+ static void execute(
+ const DepthwiseArgs &args, WorkspaceType *ws, const StrategyType *strat,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ const unsigned int input_pad_top, const unsigned int valid_rows,
+ const unsigned int input_pad_left, const unsigned int valid_cols
+ )
+ {
+ fill_patch_array_generic_kernel(
+ ws->input_rows, ws->input_patch,
+ strat->get_output_rows(), strat->get_output_cols(),
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols,
+ base_ptr, ld_row, ld_col,
+ ws->input_padding,
+ input_pad_top, valid_rows,
+ input_pad_left, valid_cols
+ );
+ }
+};
+
+} // namespace depthfirst_multiplier
+
+// Driver for depthwise convolutions with a channel multiplier (each input
+// channel produces `channel_multiplier` output channels). Tiles the output
+// depth-first; for every input channel it stages an input patch and calls
+// either a specialised or a generic multiplier kernel, selected at compile
+// time via depthfirst_multiplier::StrategyType<is_generic, ...>.
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ bool is_generic=false,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirstMultiplier : public DepthfirstDriver<TInput, TWeight, TOutput>
+{
+ protected:
+ using StratType = typename depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
+ // Per-thread workspace layout: output pointer array, then the staged
+ // input-patch element, then the activation values.
+ using WorkspaceManager = Workspace<
+ OutputArrayElement<TOutput>,
+ depthfirst_multiplier::InputPatchElement<TInput, is_generic, OutputStage>,
+ ActivationsElement<TOutput, OutputStage>
+ >;
+ using WorkingSpace = typename WorkspaceManager::WorkspaceType;
+
+ OutputStage m_os; // Copy of the output parameters
+ const void *m_bias = nullptr; // Copy of the bias (should we need it)
+
+ // Input premultiplication is never used; the multiplier kernels handle
+ // the channel multiplier themselves.
+ bool uses_premultiply() const override {
+ return false;
+ }
+
+ public:
+ // Takes ownership of `strat` (held by the DepthfirstDriver base).
+ DepthwiseDepthfirstMultiplier(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
+ {
+ }
+
+ // Non-copyable: owns the strategy and cached output-stage state.
+ DepthwiseDepthfirstMultiplier(DepthwiseDepthfirstMultiplier &) = delete;
+ DepthwiseDepthfirstMultiplier &operator=(DepthwiseDepthfirstMultiplier &) = delete;
+
+ // Bytes required for the packed parameter (weights/bias) buffer.
+ size_t get_storage_size(void) const override
+ {
+ return reinterpret_cast<const StratType *>(this->m_strat.get())
+ ->get_storage_size(this->m_args);
+ }
+
+ // Packs weights and biases into `buffer`, and retains the raw bias
+ // pointer for kernels which take it separately from the output stage.
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ reinterpret_cast<const StratType *>(this->m_strat.get())
+ ->pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
+ m_bias = biases;
+ depthwise_depthfirst::stash_bias(m_os, biases);
+ }
+
+ // Bytes of scratch space each worker thread needs.
+ size_t get_working_size_per_thread() const override
+ {
+ // NOTE(review): the local copy of m_args is not modified before use —
+ // presumably kept for symmetry with related drivers; confirm.
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
+ }
+
+ // Lays out and initialises (e.g. fills padding) a thread's workspace.
+ void initialise_working_space(void *buffer) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
+ }
+
+ // Computes one output tile, handling padding at the tensor borders.
+ // Iterates over [output_channel_start, output_channel_end) in steps of
+ // the channel multiplier: one input channel, one packed-parameter slice
+ // and one kernel call per step.
+ void compute_tile_padded(
+ const DepthwiseArgs &args,
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ // Get the working space
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+
+ // Map the output tile origin back to input coordinates; a negative
+ // coordinate becomes a top/left padding amount instead.
+ const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ // Compute the output pointer array. We'll update this array after every
+ // invocation of the kernel.
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, args.output_rows - output_i, // Top padding, # valid rows
+ 0, args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Compute the parameter stride
+ // (the packed storage consumed by a single input channel).
+ DepthwiseArgs single_iter(args);
+ single_iter.input_channels = 1;
+ const size_t parameter_stride = reinterpret_cast<const StratType *>(this->m_strat.get())
+ ->get_storage_size(single_iter);
+
+ for (; output_channel_start < output_channel_end;
+ output_channel_start += args.channel_multiplier)
+ {
+ // Compute the input pointer array
+ const auto input_channel = output_channel_start / args.channel_multiplier;
+
+ // Construct the input patch
+ depthfirst_multiplier::PrepareInputSample<is_generic>::execute(
+ args, ws, this->m_strat.get(),
+ input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col,
+ input_pad_top, args.input_rows - input_i,
+ input_pad_left, args.input_cols - input_j
+ );
+
+ // Execute the kernel
+ depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::execute(
+ args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
+ parameters, m_bias
+ );
+
+ // Update the output pointers
+ // (advance every pointer past the channels just written).
+ for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++)
+ {
+ ws->outptr_array[n] += args.channel_multiplier;
+ }
+
+ // Progress the parameters
+ parameters = reinterpret_cast<const char *>(parameters) + parameter_stride;
+ }
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
new file mode 100644
index 0000000000..8fef6f8ae0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+// This can only be built if the target/compiler supports FP16 arguments.
+#if defined(__ARM_FP16_ARGS)
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#include "kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// File-local heuristics used to rank the fp16 kernel table below.
+namespace
+{
+#if defined(__aarch64__)
+#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ // For square kernels/strides, decide whether premultiplying the input and
+ // using a non-multiplier kernel is expected to beat a dedicated
+ // channel-multiplier kernel. Thresholds are presumably empirically tuned
+ // for fp16 — confirm against benchmarks before changing.
+ bool prefer_premultiply(const DepthwiseArgs &args) {
+ if ((args.stride_rows != args.stride_cols) || (args.kernel_rows != args.kernel_cols))
+ {
+ return false;
+ }
+
+ unsigned int threshold;
+
+ if (args.stride_rows == 1 && args.kernel_rows == 3)
+ {
+ threshold = 30;
+ }
+ else if (args.stride_rows == 1 && args.kernel_rows == 5)
+ {
+ threshold = 31;
+ }
+ else if (args.stride_rows == 2 && args.kernel_rows == 3)
+ {
+ threshold = 11;
+ }
+ else if (args.stride_rows == 2 && args.kernel_rows == 5)
+ {
+ threshold = 19;
+ } else
+ {
+ return false;
+ }
+
+ return args.channel_multiplier <= threshold;
+ }
+
+ // Ranks a depthfirst strategy by the number of (rounded-up) output tiles
+ // times vector-width channel blocks; returns UINT_MAX to reject the
+ // strategy when a multiplier kernel is preferable.
+ template <class Strategy>
+ unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ if (args.channel_multiplier > 1 && !prefer_premultiply(args))
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+ // As cycle_estimate, but planar kernels process full-width rows so only
+ // the row count is rounded up. NOTE(review): no planar kernels appear in
+ // this file's table — presumably kept for symmetry with depthwise_fp32;
+ // confirm whether it should carry __attribute__((unused)).
+ template <class Strategy>
+ unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ args.output_cols *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+ // Multiplier kernels cost "0" (always chosen) unless premultiplication
+ // is preferred, in which case they are rejected outright.
+ unsigned int multiplier_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ return prefer_premultiply(args)? std::numeric_limits<unsigned int>::max() : 0;
+ }
+
+ // Sentinel estimate for last-resort kernels; UINT_MAX ranks them below
+ // every other candidate. Marked unused as some builds compile out all
+ // call sites.
+ unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) __attribute__ ((unused));
+ unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__)
+}
+
+// Candidate fp16 depthwise implementations, in priority order; the selector
+// picks the first applicable entry with the best cycle estimate. Each entry
+// is {method, name, constraint predicate, cycle estimate, factory}.
+static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = {
+#if defined(__aarch64__)
+#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ // SME2 kernels — preferred when the CPU supports SME2.
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ cpu_has_sme2),
+ cycle_estimate<sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ // SVE kernels.
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ // NEON (A64) kernels, plus the generic and channel-multiplier fallbacks.
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ cpu_has_fp16),
+ cycle_estimate<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto strat = new a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
+ },
+ },
+ // Catch-all kernel for unsupported shapes (never preferred on cost).
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_generic_output3x3_mla_depthfirst",
+ constraint(cpu_has_fp16),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto kern = new a64_fp16_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<__fp16>(kern, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<__fp16>(strat, args);
+ },
+ },
+ // Channel-multiplier kernel; selected when a multiplier is requested
+ // and premultiplication is not preferred.
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint(cpu_has_fp16, has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ auto kern = new a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<__fp16>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<__fp16, __fp16, __fp16, __fp16, true>(strat, args);
+ },
+ },
+#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+// Registers the table above as the implementation list for __fp16.
+template <>
+const DepthwiseImplementation<__fp16> *depthwise_implementation_list()
+{
+ return depthwise_fp16_methods;
+}
+
+// Explicit instantiations of the public entry points for __fp16.
+template UniqueDepthwiseCommon<__fp16> depthwise(const DepthwiseArgs &, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<__fp16>(const DepthwiseArgs &, const Nothing &);
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
new file mode 100644
index 0000000000..760328f3ba
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
@@ -0,0 +1,539 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#include "interleaves/list.hpp"
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp"
+
+#include "kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp"
+#include "kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp"
+#include "kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp"
+#include "kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp"
+
+#include "kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// File-local heuristics used to rank the fp32 kernel table below.
+namespace
+{
+#if defined(__aarch64__)
+ // For square kernels/strides, decide whether premultiplying the input and
+ // using a non-multiplier kernel is expected to beat a dedicated
+ // channel-multiplier kernel. Thresholds are presumably empirically tuned
+ // for fp32 (they differ from the fp16 variant) — confirm with benchmarks
+ // before changing.
+ bool prefer_premultiply(const DepthwiseArgs &args) {
+ if ((args.stride_rows != args.stride_cols) || (args.kernel_rows != args.kernel_cols))
+ {
+ return false;
+ }
+
+ unsigned int threshold;
+
+ if (args.stride_rows == 1 && args.kernel_rows == 3)
+ {
+ threshold = 18;
+ }
+ else if (args.stride_rows == 1 && args.kernel_rows == 5)
+ {
+ threshold = 5;
+ }
+ else if (args.stride_rows == 2 && args.kernel_rows == 3)
+ {
+ threshold = 5;
+ }
+ else if (args.stride_rows == 2 && args.kernel_rows == 5)
+ {
+ threshold = 12;
+ } else
+ {
+ return false;
+ }
+
+ return args.channel_multiplier <= threshold;
+ }
+
+ // Ranks a depthfirst strategy by the number of (rounded-up) output tiles
+ // times vector-width channel blocks; returns UINT_MAX to reject the
+ // strategy when a multiplier kernel is preferable.
+ template <class Strategy>
+ unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ if (args.channel_multiplier > 1 && !prefer_premultiply(args))
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+ // As cycle_estimate, but planar kernels process full-width rows so only
+ // the row count is rounded up.
+ template <class Strategy>
+ unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ args.output_cols *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+ // As cycle_estimate, but discounted by 2/3 for kernels that trade
+ // precision for speed (e.g. bf16 accumulation) in fast mode.
+ template <class Strategy>
+ unsigned int fast_mode_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ // First-pass: compute the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ ) * 2 / 3;
+ }
+
+ // Multiplier kernels cost "0" (always chosen) unless premultiplication
+ // is preferred, in which case they are rejected outright.
+ unsigned int multiplier_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+ return prefer_premultiply(args)? std::numeric_limits<unsigned int>::max() : 0;
+ }
+
+ // Sentinel estimate for last-resort kernels; UINT_MAX ranks them below
+ // every other candidate.
+ unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+
+ // Constraint predicate: true when the caller allows reduced-precision
+ // "fast mode" kernels. Marked unused as some configurations compile out
+ // all call sites.
+ bool fast_mode_enabled(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+ bool fast_mode_enabled(const DepthwiseArgs &args, const void *)
+ {
+ return args.fast_mode;
+ }
+#endif // defined(__aarch64__)
+}
+
+// Table of FP32 depthwise implementations, grouped by architecture tier
+// (SME2 planar, SME2 depth-first, SVE, then plain AArch64/NEON). Selection
+// walks the list and picks the supported entry with the lowest cycle
+// estimate; each entry supplies a method, a name, an optional constraint,
+// an optional cycle estimate (nullptr => cost 0, i.e. select immediately)
+// and a factory lambda. The DEFAULT sentinel terminates the list.
+static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ // SME2 planar kernels: bf16 "fast mode" variants first, then fp32.
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za",
+ constraint(fast_mode_enabled,
+ cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za",
+ constraint(fast_mode_enabled,
+ cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za",
+ constraint(fast_mode_enabled,
+ cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za",
+ constraint(fast_mode_enabled,
+ cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32_planar_3x3_s1_4rows_mla_za",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_planar_3x3_s1_4rows_mla_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ [] (const DepthwiseArgs &args, const Nothing &os) -> unsigned int {
+ // Heuristic, don't prefer this kernel unless the input plane is greater
+ // than the number of channels.
+ if (args.input_rows * args.input_cols < args.input_channels)
+ return UINT32_MAX;
+
+ return planar_cycle_estimate<sme2_fp32_planar_3x3_s1_4rows_mla_za>(args, os);
+ },
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_planar_3x3_s1_4rows_mla_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32_planar_3x3_s2_4rows_mla_za",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ planar_cycle_estimate<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_planar_3x3_s2_4rows_mla_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32_planar_5x5_s1_4rows_mla_za",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_planar_5x5_s1_4rows_mla_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_planar_5x5_s1_4rows_mla_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_fp32_planar_5x5_s2_4rows_mla_za",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_planar_5x5_s2_4rows_mla_za>,
+ has_no_channel_multiplier, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_planar_5x5_s2_4rows_mla_za(args.cpu_info);
+ return new DepthwisePlanar<float>(strat, args);
+ },
+ },
+
+ // SME2 depth-first kernels (largest output tile first).
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>),
+ cycle_estimate<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>),
+ cycle_estimate<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>),
+ cycle_estimate<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>),
+ cycle_estimate<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ // SVE depth-first kernels.
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ cpu_has_sve),
+ cycle_estimate<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_generic_output3x3_mla_depthfirst",
+ constraint(cpu_has_sve),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto kern = new sve_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
+ constraint(is_supported<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
+ cpu_has_sve, has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
+ constraint(is_supported<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
+ cpu_has_sve, has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint(cpu_has_sve, has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto kern = new sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ // Plain AArch64 (NEON) depth-first kernels.
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>),
+ cycle_estimate<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_generic_output3x3_mla_depthfirst",
+ nullptr,
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto kern = new a64_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
+ constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
+ has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
+ constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
+ has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto strat = new a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint(has_channel_multiplier),
+ multiplier_cycle_estimate,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ auto kern = new a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+// Specialisation of the kernel-list accessor for FP32: exposes the table
+// above to the generic implementation-selection machinery.
+template <>
+const DepthwiseImplementation<float> *depthwise_implementation_list()
+{
+ return depthwise_fp32_methods;
+}
+
+// Explicit instantiations of the public FP32 entry points.
+template UniqueDepthwiseCommon<float> depthwise(const DepthwiseArgs &, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<float>(const DepthwiseArgs &, const Nothing &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
new file mode 100644
index 0000000000..82821af1e6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise.hpp"
+
+#include <cstddef>
+#include <functional>
+
+using arm_gemm::Nothing;
+
+namespace arm_conv {
+namespace depthwise {
+
+// One entry in a kernel-selection table: identifies a depthwise method by
+// name and carries optional predicates for support checking, cost
+// estimation, and construction of the kernel object.
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+struct DepthwiseImplementation
+{
+ const DepthwiseMethod method;
+ const char *name;
+ std::function<bool(const DepthwiseArgs &, const OutputStage &)> is_supported;
+ std::function<uint64_t(const DepthwiseArgs &, const OutputStage &)> cycle_estimate;
+ std::function<DepthwiseCommon<TInput, TWeight, TOutput> *(const DepthwiseArgs &, const OutputStage &)> initialise;
+
+ // A null is_supported predicate means "always supported".
+ bool get_is_supported(const DepthwiseArgs &args, const OutputStage &os) const
+ {
+ return (is_supported == nullptr) ? true : is_supported(args, os);
+ }
+
+ // A null cycle_estimate means "zero cost", i.e. select immediately.
+ uint64_t get_cycle_estimate(const DepthwiseArgs &args, const OutputStage &os) const
+ {
+ return (cycle_estimate == nullptr) ? 0 : cycle_estimate(args, os);
+ }
+
+ // Construct the kernel (caller owns the returned pointer) and label it
+ // with this entry's name so it can be identified later.
+ DepthwiseCommon<TInput, TWeight, TOutput> *get_instance(const DepthwiseArgs &args, const OutputStage &os) const
+ {
+ auto impl = initialise(args, os);
+ impl->set_name(std::string(name));
+ return impl;
+ }
+};
+
+/**
+ * \relates DepthwiseImplementation
+ *
+ * Returns the sentinel-terminated list of implementations available for the
+ * given type combination; specialised per data type in the depthwise_*.cpp
+ * translation units.
+ */
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *depthwise_implementation_list();
+
+// Walk the implementation list for this type combination and select the
+// supported entry with the lowest cycle estimate, honouring any method /
+// name-substring filter supplied via args.config. Returns true iff a
+// usable implementation was found; on failure `selected` is left null.
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+bool find_implementation(
+ const DepthwiseArgs &args,
+ const OutputStage &os,
+ const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> * &selected
+)
+{
+ selected = nullptr;
+ uint64_t best_cycle_estimate = UINT64_MAX;
+
+ const auto *impl = depthwise_implementation_list<TInput, TWeight, TOutput, OutputStage>();
+ for (; impl->method != DepthwiseMethod::DEFAULT; impl++)
+ {
+ const bool has_cfg = (args.config != nullptr);
+ const auto &cfg = args.config;
+
+ if (
+ !impl->get_is_supported(args, os) || // Problem is unsupported
+ (has_cfg && cfg->method != DepthwiseMethod::DEFAULT && cfg->method != impl->method) ||
+ (has_cfg && cfg->filter != "" && !std::strstr(impl->name, cfg->filter.c_str()))
+ )
+ {
+ continue;
+ }
+
+ const auto cycle_estimate = impl->get_cycle_estimate(args, os);
+
+ // A zero estimate means "select this implementation immediately";
+ // entries earlier in the list therefore take priority.
+ if (cycle_estimate == 0)
+ {
+ selected = impl;
+ break;
+ }
+
+ if (selected == nullptr || cycle_estimate < best_cycle_estimate)
+ {
+ selected = impl;
+ best_cycle_estimate = cycle_estimate;
+ }
+ }
+
+ return (selected != nullptr);
+}
+
+// Enumerate every implementation that supports the given problem, recording
+// each entry's method, name, cycle estimate, and whether it is the one the
+// default selection heuristic would choose.
+template <typename TInput, typename TWeight, typename TOutput, class OutputStage>
+std::vector<KernelDescription> get_compatible_kernels(const DepthwiseArgs &args, const OutputStage &os)
+{
+ std::vector<KernelDescription> kerns;
+
+ // Find the default implementation so we can flag it accordingly
+ const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *default_impl;
+ find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, default_impl);
+
+ for (auto impl = depthwise_implementation_list<TInput, TWeight, TOutput, OutputStage>();
+ impl->method != DepthwiseMethod::DEFAULT; impl++)
+ {
+ if (!impl->get_is_supported(args, os))
+ {
+ continue;
+ }
+
+ kerns.emplace_back(
+ impl->method, impl->name, impl == default_impl,
+ impl->get_cycle_estimate(args, os)
+ );
+ }
+
+ return kerns;
+}
+
+// Factory entry point: select the best implementation for the given
+// arguments and output stage, and return an owning pointer to a freshly
+// constructed kernel (null if no implementation is usable).
+template <typename TInput, typename TWeight, typename TOutput, class OutputStage>
+UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &args, const OutputStage &os)
+{
+ const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr;
+ const bool success = find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, impl);
+ return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr);
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
new file mode 100644
index 0000000000..15064aeedc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Utilities for constructing functions which constrain which kernels are
+ * selected for a given depthwise problem.
+ *
+ * It is expected that this will be included in the files which list the
+ * available kernels. To avoid multiple definitions, an anonymous namespace is
+ * used.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "depthwise.hpp"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+namespace
+{
+
+// Typed and type-erased signatures for a kernel-selection constraint.
+template <class OutputStage>
+using ConstraintFn = std::function<bool(const DepthwiseArgs &, const OutputStage &)>;
+
+using GenericConstraintFn = std::function<bool(const DepthwiseArgs &, const void *)>;
+
+// Base case: a single constraint is returned unchanged.
+GenericConstraintFn make_constraint(const GenericConstraintFn &f) __attribute__ ((unused));
+GenericConstraintFn make_constraint(const GenericConstraintFn &f)
+{
+ return f;
+}
+
+// Recursive case: fold a list of constraints into one predicate that is the
+// logical AND of all of them, evaluated left-to-right with short-circuiting.
+template <typename ... Fs>
+GenericConstraintFn make_constraint(const GenericConstraintFn &f, Fs ... fs)
+{
+ return [f, fs...] (const DepthwiseArgs &args, const void *os) -> bool {
+ return f(args, os) && make_constraint(fs...)(args, os);
+ };
+}
+
+// Adapt a set of type-erased constraints to a typed output stage; the stage
+// is forwarded to the generic constraints as an opaque pointer.
+template <typename OutputStage=Nothing, typename ... Fs>
+ConstraintFn<OutputStage> constraint(Fs ... fs)
+{
+ return [fs...] (const DepthwiseArgs &args, const OutputStage &os) -> bool {
+ return make_constraint(fs...)(args, &os);
+ };
+}
+
+// Some useful constraints
+// True iff the problem's kernel size and stride exactly match those the
+// strategy was written for.
+template <class Strategy>
+bool is_supported(const DepthwiseArgs &args, const void *)
+{
+ return ((args.kernel_rows == Strategy::kernel_rows) &&
+ (args.kernel_cols == Strategy::kernel_cols) &&
+ (args.stride_rows == Strategy::stride_rows) &&
+ (args.stride_cols == Strategy::stride_cols));
+}
+
+// CPU feature gates: each reports whether the CPU described by the args
+// supports the relevant ISA extension. All are marked unused because any
+// one kernel list references only a subset of these helpers.
+bool cpu_has_dot_product(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_dot_product(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_dotprod();
+}
+
+bool cpu_has_sme(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_sme(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_sme();
+}
+
+bool cpu_has_sme2(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_sme2(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_sme2();
+}
+
+bool cpu_has_sve(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_sve(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_sve();
+}
+
+bool cpu_has_sve2(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_sve2(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_sve2();
+}
+
+bool cpu_has_fp16(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_fp16(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_fp16();
+}
+
+// True iff the depthwise channel multiplier is exactly one.
+bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *)
+{
+ return args.channel_multiplier == 1;
+}
+
+// True iff a channel multiplier greater than one is requested.
+bool has_channel_multiplier(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool has_channel_multiplier(const DepthwiseArgs &args, const void *)
+{
+ return args.channel_multiplier > 1;
+}
+
+// Planar kernels require a "priming" step before the main processing loop. The kernels can prime with left padding
+// or input data, but not right padding - which could be needed in some extreme cases such as a 5x5 kernel, width 1
+// padding 2. These are rare enough and can be handled with other kernels anyway, so filter them out with this.
+bool no_prime_right_pad(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool no_prime_right_pad(const DepthwiseArgs &args, const void *)
+{
+ return (args.input_cols + args.padding.left) >= (args.kernel_cols - 1);
+}
+
+// Requantisation-parameter constraints: for quantised kernels the opaque
+// output-stage pointer is an arm_gemm::Requantize32.
+
+// True iff requantisation never applies a left shift (per-channel or
+// per-layer, depending on the quantisation mode).
+bool qp_has_no_left_shift(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+bool qp_has_no_left_shift(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return qp->per_channel_requant ?
+ (qp->per_channel_left_shifts == nullptr) :
+ (qp->per_layer_left_shift == 0);
+}
+
+// True iff the activation (input) zero point is zero.
+bool qp_zero_a_offset(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+bool qp_zero_a_offset(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return qp->a_offset == 0;
+}
+
+// True iff the clamp range covers the whole of T, so clamping is a no-op
+// and may be skipped by the kernel.
+template <typename T> bool qp_skip_clamp(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+template <typename T> bool qp_skip_clamp(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return (qp->minval == std::numeric_limits<T>::min() &&
+ qp->maxval == std::numeric_limits<T>::max());
+}
+
+} // namespace
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
new file mode 100644
index 0000000000..c3daaf04fe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthfirst_driver.hpp"
+#include "interleaves/generic.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Interface to a planar depthwise strategy, templated on the output stage
+// type (Nothing for non-quantised paths, arm_gemm::Requantize32 for
+// quantised paths).
+template <typename OutputStage>
+class IPlanarStrategy
+{
+ public:
+ virtual ~IPlanarStrategy() = default;
+ // Number of output rows produced by a single kernel invocation.
+ virtual unsigned int get_output_rows(void) const = 0;
+ // Vector length type used by the kernel.
+ virtual arm_gemm::VLType get_vl_type(void) const = 0;
+
+ // Bytes required for the packed parameter buffer for these arguments.
+ virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;
+ // Pack weights (and biases) into `buffer` in the kernel's expected layout.
+ virtual void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const = 0;
+};
+
+
+// Traits mapping the (input, weight, output, accumulator) type tuple and the
+// output stage onto the planar kernel's function-pointer type; specialised
+// below for each supported output stage.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
+ typename OutputStage>
+struct PlanarKernelType;
+
+// Kernel signature for strategies with no output stage: the activation bounds
+// (act_min/act_max) are passed directly to the kernel.
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
+{
+ typedef void (*Type)(
+ const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *, const TAccum *,
+ TOutput **, const size_t *, const size_t *, unsigned int output_cols,
+ unsigned int start_channels, unsigned int valid_channels,
+ TAccum act_min, TAccum act_max
+ );
+
+ // Forward to the kernel, pulling the activation bounds from the workspace.
+ template <typename WorkspaceType>
+ static inline void execute(
+ const Type fn,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const TAccum *bias,
+ TOutput **outptrs, const size_t *outlds, const size_t *outvllds, unsigned int output_cols,
+ unsigned int start_channel, unsigned int valid_channels,
+ const Nothing &, const WorkspaceType *ws
+ )
+ {
+ fn(
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows,
+ pad_left, valid_input_cols,
+ weights, bias,
+ outptrs, outlds, outvllds, output_cols,
+ start_channel, valid_channels,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+// Kernel signature for quantised (Requantize32) strategies: the quantisation
+// parameters are passed straight to the kernel, and the separate bias pointer
+// is ignored here (note the unnamed int32_t argument in execute()).
+template <typename TInput, typename TWeight, typename TOutput>
+struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ typedef void (*Type)(
+ const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *,
+ TOutput **, const size_t *, const size_t *, unsigned int output_cols,
+ unsigned int start_channel, unsigned int valid_channels,
+ const arm_gemm::Requantize32 &
+ );
+
+ // Forward to the kernel; the workspace is not needed on this path.
+ template <typename WorkspaceType>
+ static inline void execute(
+ const Type fn,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const int32_t *,
+ TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
+ unsigned int first_channel, unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp, const WorkspaceType *
+ )
+ {
+ fn(
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows,
+ pad_left, valid_input_cols,
+ weights,
+ outptrs, outlds, outldvls, output_cols,
+ first_channel, valid_channels,
+ qp
+ );
+ }
+};
+
+
+// Common base for planar strategies: stores kernel/stride geometry and
+// implements parameter packing via the generic interleave helpers; concrete
+// strategies provide the kernel function through get_kernel().
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TOutput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class PlanarStrategy : public IPlanarStrategy<OutputStage>
+{
+ unsigned int m_kernel_rows, m_kernel_cols;
+ unsigned int m_stride_rows, m_stride_cols;
+ unsigned int m_output_rows;
+ arm_gemm::VLType m_vl_type;
+
+ protected:
+ virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
+ {
+ // Get the kernel point to pack at the given index; return false to
+ // indicate that this index (and all greater indices) is out of range.
+ if (m_kernel_rows * m_kernel_cols <= index)
+ return false;
+
+ y = index % m_kernel_cols;
+ x = index / m_kernel_cols;
+ return true;
+ }
+
+ // Describe how the generic interleave code should lay out the weights.
+ virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const
+ {
+ return interleaves::PackingArguments(
+ m_kernel_rows, m_kernel_cols, sizeof(TWeight),
+ false, sizeof(TAccum), true, // Don't pack the bias
+ m_vl_type, sizeof(TAccum), 1, // Accumulator depth of 1 TODO
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ }
+
+ public:
+ PlanarStrategy(
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ unsigned int output_rows,
+ arm_gemm::VLType vl_type
+ ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
+ m_stride_rows(stride_rows), m_stride_cols(stride_cols),
+ m_output_rows(output_rows), m_vl_type(vl_type)
+ {
+ }
+
+ unsigned int get_output_rows(void) const override { return m_output_rows; }
+ arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleaves::pack_parameters_generic(
+ this->get_kernel_packing_arguments(), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
+ }
+
+ // Concrete strategies return their kernel function pointer here.
+ using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+namespace {
+
+// Workspace element providing, per output row, a pointer and two strides,
+// plus a small buffer into which out-of-range (padded) stores can be
+// redirected.
+template <typename T>
+struct OutputRowPtrsElement
+{
+ struct Workspace
+ {
+ T **output_row_ptrs;
+ size_t *output_ld_cols;
+ size_t *output_ld_vls; // Stride between vectors of channels
+ T *output_padding_buffer;
+ };
+
+ template <typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
+ {
+ // We need one pointer and stride for each row of output, and an additional
+ // blob of memory into which padded stores can go.
+ return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
+ get_vector_length<char>(args.strategy->get_vl_type());
+ }
+
+ // Carve the pointer/stride arrays and the padding buffer out of `buffer`;
+ // returns the first byte following this element.
+ template <typename WorkspaceType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer,
+ const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
+ {
+ const auto n_rows = args.strategy->get_output_rows();
+ ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
+ ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
+ ws->output_ld_vls = ws->output_ld_cols + n_rows;
+ ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
+ return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
+ }
+};
+
+} // namespace {anonymous}
+
+
+// Depthwise driver for planar strategies: each kernel invocation computes a
+// full-width strip of output rows; batches are iterated sequentially and the
+// row strips within a batch are striped across threads.
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TOutput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
+{
+ using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
+ using StrategyType = IPlanarStrategy<OutputStage>;
+ using WorkspaceManager = Workspace<
+ OutputRowPtrsElement<TOutput>,
+ ActivationsElement<TAccum, OutputStage>
+ >;
+ using WorkspaceType = typename WorkspaceManager::WorkspaceType;
+
+ std::unique_ptr<StrategyType> m_strat;
+ const TAccum *m_bias;
+ OutputStage m_os;
+
+ public:
+ // Takes ownership of `strat`.
+ DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
+ {
+ }
+
+ DepthwisePlanar(DepthwisePlanar &) = delete;
+ DepthwisePlanar &operator=(DepthwisePlanar &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ return m_strat->get_storage_size(this->m_args);
+ }
+
+ // Pack parameters and retain the bias pointer for later kernel calls.
+ void pack_parameters(
+ void *buffer, const void *biases,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) override
+ {
+ m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
+ this->m_bias = reinterpret_cast<const TAccum *>(biases);
+ depthwise_depthfirst::stash_bias(this->m_os, biases);
+ }
+
+ size_t get_working_size(unsigned int n_threads) const override
+ {
+ return this->get_working_size_per_thread() * n_threads;
+ }
+
+ protected:
+ /* Compute the amount of working space required for a single thread. */
+ virtual size_t get_working_size_per_thread(void) const
+ {
+ return WorkspaceManager::get_sizeof_workspace(
+ WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
+ }
+
+ /* Initialise the working space for a thread. */
+ virtual void initialise_working_space(void *buffer) const
+ {
+ WorkspaceManager::initialise(
+ buffer,
+ WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
+ );
+ }
+
+ /* Execute the kernel for a given chunk of work. */
+ virtual void execute_kernel(
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const TAccum *bias,
+ TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
+ unsigned int valid_output_rows, unsigned int valid_output_cols,
+ unsigned int first_channel, unsigned int valid_channels,
+ WorkspaceType *ws
+ ) const
+ {
+ // Initialise the output pointers
+ for (auto i = 0u; i < m_strat->get_output_rows(); i++)
+ {
+ // Point at the output tensor for all valid rows; otherwise point at the
+ // padding buffer.
+ ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
+ ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
+ ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
+ outptr += ld_out_row;
+ }
+
+ // Execute the kernel
+ PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
+ reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows, pad_left, valid_input_cols,
+ weights, bias,
+ ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
+ valid_output_cols, first_channel, valid_channels,
+ this->m_os, ws
+ );
+ }
+
+ void execute_internal(
+ const DepthwiseArgs &args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads
+ ) const override
+ {
+ // Get and initialise the working space for this thread.
+ void *thread_working_space =
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+ this->initialise_working_space(thread_working_space);
+ auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);
+
+ const auto n_output_channels = args.input_channels * args.channel_multiplier;
+ const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());
+
+ // Get typed pointers
+ auto input_batch = reinterpret_cast<const TInput *>(input);
+ auto output_batch = reinterpret_cast<TOutput *>(output);
+ auto weights = reinterpret_cast<const TWeight *>(parameters);
+
+ // Iterate over batches
+ for (auto batches = args.n_batches; batches; batches--)
+ {
+ // NOTE: Other loop orderings are possible and it would be worth
+ // investigating them.
+
+ // Within a batch, stripe threads across rows.
+ for (auto start_output_i = thread_id * m_strat->get_output_rows();
+ start_output_i < args.output_rows;
+ start_output_i += n_threads * m_strat->get_output_rows())
+ {
+ // Determine what (if any padding) is required on the top/bottom of
+ // this row of the convolution.
+ const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
+ const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
+ const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
+ const unsigned int valid_input_rows = input_i > args.input_rows ? 0 : args.input_rows - input_i;
+ const unsigned int valid_output_rows = args.output_rows - start_output_i;
+
+ auto inptr_row = input_batch + input_i*ld_input_row;
+ auto outptr_row = output_batch + start_output_i * ld_output_row;
+
+ // Execute the kernel
+ this->execute_kernel(
+ inptr_row, ld_input_row, ld_input_col, vl,
+ input_pad_top, valid_input_rows, args.padding.left, args.input_cols,
+ weights, this->m_bias,
+ outptr_row, ld_output_row, ld_output_col, vl,
+ valid_output_rows, args.output_cols,
+ 0 /* first channel */, n_output_channels,
+ ws
+ );
+ }
+
+ // Update the input and output pointers to account for batch
+ input_batch += ld_input_batch;
+ output_batch += ld_output_batch;
+ }
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
new file mode 100644
index 0000000000..6ecdc36bf0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+#if defined(__aarch64__)
+// Constraint: the weight zero point is zero (symmetric weight quantisation).
+bool qp_weights_are_symmetric(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return qp->b_offset == 0;
+}
+
+// Cycle estimate which makes a method the last-resort choice: it is only
+// selected when no better-scoring candidate satisfies its constraints.
+uint64_t not_preferred(const DepthwiseArgs &, const Requantize32 &)
+{
+ return std::numeric_limits<uint64_t>::max();
+}
+#endif // defined(__aarch64__)
+}
+
+// Table of candidate s8q depthwise implementations, listed in decreasing
+// order of preference; the selection code walks the table and picks the
+// first entry whose constraints are satisfied (subject to cycle estimates).
+static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depthwise_s8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_s8q_planar_3x3_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_s8q_planar_3x3_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sme2_s8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_s8q_planar_3x3_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_s8q_planar_3x3_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sme2_s8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_s8q_planar_5x5_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_s8q_planar_5x5_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sme2_s8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_s8q_planar_5x5_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_s8q_planar_5x5_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sme2_s8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<int8_t>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ qp_weights_are_symmetric,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_sve2),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_sve2),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_weights_are_symmetric,
+ qp_has_no_left_shift,
+ cpu_has_dot_product),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_dot_product),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_generic_output3x3_mla_depthfirst",
+ nullptr,
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto kernel = new a64_s8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<int8_t>(kernel, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_dot_product),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_dot_product),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto strat = new a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint<Requantize32>(has_channel_multiplier),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ auto kern = new a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<int8_t>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, true>(strat, args, qp);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+// Provide the s8q method table to the generic implementation-selection code.
+template <>
+const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> *depthwise_implementation_list()
+{
+ return depthwise_s8q_methods;
+}
+
+// Explicit instantiations for the s8q type combination.
+template UniqueDepthwiseCommon<int8_t, int8_t, int8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, int8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp
new file mode 100644
index 0000000000..37892b6963
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthwise_strategies_common.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Number of input rows consumed to produce one tile of output rows.
+unsigned int DepthfirstStrategyUntyped::get_input_rows() const
+{
+ return this->get_kernel_rows() + (this->get_output_rows() - 1) * this->get_stride_rows();
+}
+
+// Number of input columns consumed to produce one tile of output columns.
+unsigned int DepthfirstStrategyUntyped::get_input_cols() const
+{
+ return this->get_kernel_cols() + (this->get_output_cols() - 1) * this->get_stride_cols();
+}
+
+// Convenience products of the row/column counts above.
+unsigned int DepthfirstStrategyUntyped::get_n_input_points() const { return this->get_input_rows() * this->get_input_cols(); }
+unsigned int DepthfirstStrategyUntyped::get_n_output_points() const { return this->get_output_rows() * this->get_output_cols(); }
+unsigned int DepthfirstStrategyUntyped::get_n_kernel_points() const { return this->get_kernel_rows() * this->get_kernel_cols(); }
+
+// By default, depthfirst strategies take the premultiplied-input path.
+bool DepthfirstStrategyUntyped::uses_premultiply() const { return true; }
+
+// Default accumulator depth: one vector's worth.
+unsigned int DepthfirstStrategyUntyped::get_accumulator_depth_vl() const { return 1; }
+
+bool DepthfirstStrategyUntyped::get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
+{
+ // Get the kernel point to pack at the given index; return false to
+ // indicate that this index, and all greater indices, is out of range.
+ if (index < (this->get_kernel_cols() * this->get_kernel_rows()))
+ {
+ y = index % this->get_kernel_cols();
+ x = index / this->get_kernel_cols();
+ return true;
+ }
+ return false;
+}
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
new file mode 100644
index 0000000000..19cf26dd2f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+#include "interleaves/generic.hpp"
+#include "depthfirst_driver.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+class DepthfirstStrategyUntyped : public IDepthfirstStrategy
+{
+ public:
+ virtual arm_gemm::VLType get_vl_type() const = 0;
+
+ virtual unsigned int get_kernel_rows() const = 0;
+ virtual unsigned int get_kernel_cols() const = 0;
+
+ virtual unsigned int get_stride_rows() const = 0;
+ virtual unsigned int get_stride_cols() const = 0;
+
+ virtual unsigned int get_input_rows() const override;
+ virtual unsigned int get_input_cols() const override;
+
+ virtual unsigned int get_n_input_points() const;
+ virtual unsigned int get_n_output_points() const;
+ virtual unsigned int get_n_kernel_points() const;
+
+ virtual bool uses_premultiply() const;
+
+ // Get the number of VLs used in the accumulator, this defaults to 1.
+ virtual unsigned int get_accumulator_depth_vl() const;
+
+ // Get the order in which to pack the weights, this defaults to a row-major
+ // sweep over the weight tensor.
+ virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const;
+};
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+class DepthfirstStrategy : public DepthfirstStrategyUntyped
+{
+ public:
+ virtual size_t get_storage_size(const DepthwiseArgs &args) const
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ true, sizeof(TAccum), this->uses_premultiply(),
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ return interleaves::get_storage_size_generic(packing_args, args);
+ }
+
+ virtual void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ true, sizeof(TAccum), this->uses_premultiply(),
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ interleaves::pack_parameters_generic(
+ packing_args, args, buffer, biases, weights, ld_weight_col, ld_weight_row);
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
new file mode 100644
index 0000000000..236930ee26
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+
+#include "kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+
+#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+
+#endif // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+#if defined(__aarch64__)
+uint64_t not_preferred(const DepthwiseArgs &, const Requantize32 &)
+{
+ return std::numeric_limits<uint64_t>::max();
+}
+#endif // defined(__aarch64__)
+}
+
+static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8q_planar_3x3_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8q_planar_3x3_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sme2_u8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8q_planar_3x3_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8q_planar_3x3_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sme2_u8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8q_planar_5x5_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8q_planar_5x5_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sme2_u8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8q_planar_5x5_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8q_planar_5x5_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift, no_prime_right_pad),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sme2_u8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_sve2),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ has_channel_multiplier,
+ cpu_has_sve2),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ cpu_has_dot_product,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_zero_a_offset,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_zero_a_offset,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_zero_a_offset,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_generic_output3x3_mla_depthfirst",
+ nullptr,
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto kernel = new a64_u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<uint8_t>(kernel, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ cpu_has_dot_product,
+ has_channel_multiplier,
+ qp_has_no_left_shift),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ cpu_has_dot_product,
+ has_channel_multiplier,
+ qp_has_no_left_shift),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint<Requantize32>(has_channel_multiplier),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto kern = new a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<uint8_t>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, true>(strat, args, qp);
+ },
+ },
+
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> *depthwise_implementation_list()
+{
+ return depthwise_u8q_methods;
+}
+
+template UniqueDepthwiseCommon<uint8_t, uint8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
new file mode 100644
index 0000000000..a888958b76
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_planar.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp"
+#include "kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp"
+#include "kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+#include "kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#include "kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+#if defined(__aarch64__)
+uint64_t not_preferred(const DepthwiseArgs &, const Requantize32 &)
+{
+ return std::numeric_limits<uint64_t>::max();
+}
+#endif // defined(__aarch64__)
+}
+
+static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::PLANAR,
+ "sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za",
+ constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
+ is_supported<sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za(args.cpu_info);
+ return new DepthwisePlanar<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_sve2),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto strat = new a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_generic_output3x3_mla_depthfirst",
+ nullptr,
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto kernel = new a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<uint8_t, int8_t>(kernel, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<uint8_t, int8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint<Requantize32>(has_channel_multiplier),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ auto kern = new a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<uint8_t, int8_t>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, int8_t, uint8_t, int32_t, true>(strat, args, qp);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> *depthwise_implementation_list()
+{
+ return depthwise_u8q_methods;
+}
+
+template UniqueDepthwiseCommon<uint8_t, int8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, int8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
new file mode 100644
index 0000000000..3de4bdc1fb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+// Parameter-packing routines for the AArch64 s8q 3x3 dot-product depthwise
+// kernel: compute the packed-buffer size and interleave bias, weights and
+// requantisation parameters into it.
+struct interleave_a64_s8q_3x3_dot
+{
+ // Bytes of packed storage required for the given depthwise problem.
+ static size_t get_packed_size(const DepthwiseArgs &);
+ // Pack bias (may be null), weights and requantisation parameters from `qp`
+ // into `outptr`; stride arguments of zero select dense defaults.
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_a64_s8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+ // Seven vectors of storage are reserved for every <vector_of_ints>-sized
+ // block of output channels; the block count is rounded up to a multiple
+ // of four.
+ const auto n_output_channels = (long unsigned int) args.input_channels * args.channel_multiplier;
+ const auto vl_accum = get_vector_length<int32_t>(arm_gemm::VLType::None);
+ const unsigned int n_blocks = arm_gemm::roundup(arm_gemm::iceildiv(n_output_channels, vl_accum), 4lu);
+ return n_blocks * 7 * get_vector_length<int8_t>(arm_gemm::VLType::None);
+}
+
+// Pack the bias, 3x3 weights and requantisation parameters for the AArch64
+// s8q dot-product depthwise kernel.
+//
+// Stride arguments equal to zero are resolved to dense defaults:
+// ld_weight_col := n_channels and ld_weight_row := 3 * ld_weight_col
+// (see the leading CMP/CSEL pairs).
+//
+// For each block of four channels the main loop stores:
+//  * one vector of corrected accumulators,
+//      bias - input_offset * sum(weights) + 9 * input_offset * weights_offset
+//    (the bias is treated as zero when the pointer is null);
+//  * three vectors of weights, one per kernel row, zipped so that each
+//    row's three s8 values are padded with a zero byte to suit SDOT;
+//  * the per-channel requantise multiplier and right-shift vectors, or
+//    broadcasts of qp.per_layer_mul / qp.per_layer_right_shift when
+//    qp.per_channel_muls is null.
+// The "Oddments" path handles a trailing block of 1-3 channels.
+void interleave_a64_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ "cmp %x[ld_weight_col], XZR\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "movi v16.4s, #0x9\n"
+ "movi v31.16b, #0x0\n"
+ "mov x21, #0x3\n"
+ "mul x21, %x[ld_weight_col], x21\n"
+ "add x20, %x[qp], %[offsetof_input_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_weights_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x21, NE\n"
+ "lsr x21, %x[n_channels], #0x2\n"
+ "movi v28.16b, #0x1\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
+ "add x25, %x[weights], %x[ld_weight_row]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_mul]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x24, x25, %x[ld_weight_row]\n"
+ "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "mov x22, #0x0\n"
+ "cbz x21, 4f\n"
+ "1:" // Loop
+ "movi v25.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q25, [%x[bias], x22]\n"
+ "2:" // Loop: Skip bias load
+ "ldr s19, [%x[weights], #0x0]\n"
+ "ldr s16, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 v17.16b, v16.16b, v31.16b\n"
+ "movi v21.4s, #0x0\n"
+ "ldr s16, [%x[weights], x23]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v20.16b, v16.16b, v17.16b\n"
+ "ldr s17, [x25, %x[ld_weight_col]]\n"
+ "ldr s16, [x25, x23]\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v31.16b\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s19, [x24, %x[ld_weight_col]]\n"
+ ".inst 0x4e949795 // sdot v21.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x24, x23]\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ ".inst 0x4e929795 // sdot v21.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ ".inst 0x4e909795 // sdot v21.4s, v28.16b, v16.16b\n"
+ "add %x[weights], %x[weights], #0x4\n"
+ "add x25, x25, #0x4\n"
+ "mls v25.4s, v21.4s, v30.4s\n"
+ "add x24, x24, #0x4\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ldr q27, [%x[rq_mul_perchannel], x22]\n"
+ "ldr q26, [%x[rq_shift_perchannel], x22]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "subs x21, x21, #0x1\n"
+ "str q27, [%x[outptr], #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "str q26, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "bgt 1b\n"
+ "tst %x[n_channels], #0x3\n"
+ "beq 13f\n"
+ "4:" // Oddments
+ "movi v25.4s, #0x0\n"
+ "cbz %x[bias], 7f\n"
+ "add %x[bias], %x[bias], x22\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v25.d }[0], [%x[bias]], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v25.s }[2], [%x[bias]], #0x4\n"
+ "b 6f\n"
+ "5:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v25.s }[0], [%x[bias]], #0x4\n"
+ "6:" // Oddments: Load bias: Bit 1: End
+ "7:" // Oddments: Skip bias load
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v17.h }[0], [%x[weights]]\n"
+ "ld1 { v24.h }[0], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.h }[0], [x21]\n"
+ "ld1 { v16.h }[0], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.h }[0], [x21]\n"
+ "ld1 { v18.h }[0], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.h }[0], [x24]\n"
+ "ld1 { v22.h }[0], [x21]\n"
+ "add %x[weights], %x[weights], #0x2\n"
+ "add x25, x25, #0x2\n"
+ "ld1 { v21.h }[0], [x20]\n"
+ "add x24, x24, #0x2\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v17.b }[2], [%x[weights]]\n"
+ "ld1 { v24.b }[2], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x21]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 9f\n"
+ "8:" // Oddments: Load weights: Bit 1: Unset
+ "ld1 { v17.b }[0], [%x[weights]]\n"
+ "ld1 { v24.b }[0], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.b }[0], [x21]\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v22.b }[0], [x21]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "9:" // Oddments: Load weights: Bit 1: End
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v31.16b\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v24.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x4e949793 // sdot v19.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v23.16b, v21.16b\n"
+ ".inst 0x4e929793 // sdot v19.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v31.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ ".inst 0x4e909793 // sdot v19.4s, v28.16b, v16.16b\n"
+ "mls v25.4s, v19.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 12f\n"
+ "add x21, %x[rq_mul_perchannel], x22\n"
+ "add x20, %x[rq_shift_perchannel], x22\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v27.d }[0], [x21], #0x8\n"
+ "ld1 { v26.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v26.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
+ "12:" // Oddments: Quantisation parameters: Store
+ "str q27, [%x[outptr], #0x0]\n"
+ "str q26, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "13:" // End
+ : [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
new file mode 100644
index 0000000000..19264c9fce
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+// Parameter-packing routines for the AArch64 u8q 3x3 dot-product depthwise
+// kernel: compute the packed-buffer size and interleave bias, weights and
+// requantisation parameters into it.
+struct interleave_a64_u8q_3x3_dot
+{
+ // Bytes of packed storage required for the given depthwise problem.
+ static size_t get_packed_size(const DepthwiseArgs &);
+ // Pack bias (may be null), weights and requantisation parameters from `qp`
+ // into `outptr`; stride arguments of zero select dense defaults.
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_a64_u8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+ // Seven vectors of storage are reserved for every <vector_of_ints>-sized
+ // block of output channels; the block count is rounded up to a multiple
+ // of four.
+ const auto n_output_channels = (long unsigned int) args.input_channels * args.channel_multiplier;
+ const auto vl_accum = get_vector_length<int32_t>(arm_gemm::VLType::None);
+ const unsigned int n_blocks = arm_gemm::roundup(arm_gemm::iceildiv(n_output_channels, vl_accum), 4lu);
+ return n_blocks * 7 * get_vector_length<uint8_t>(arm_gemm::VLType::None);
+}
+
+// Pack the bias, 3x3 weights and requantisation parameters for the AArch64
+// u8q dot-product depthwise kernel.
+//
+// Stride arguments equal to zero are resolved to dense defaults:
+// ld_weight_col := n_channels and ld_weight_row := 3 * ld_weight_col
+// (see the leading CMP/CSEL pairs).
+//
+// For each block of four channels the main loop stores:
+//  * one vector of corrected accumulators,
+//      bias - input_offset * sum(weights) + 9 * input_offset * weights_offset
+//    (the bias is treated as zero when the pointer is null);
+//  * three vectors of weights, one per kernel row, zipped so that each
+//    row's three u8 values are padded with a zero byte to suit UDOT;
+//  * the per-channel requantise multiplier and right-shift vectors, or
+//    broadcasts of qp.per_layer_mul / qp.per_layer_right_shift when
+//    qp.per_channel_muls is null.
+// The "Oddments" path handles a trailing block of 1-3 channels.
+void interleave_a64_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ "cmp %x[ld_weight_col], XZR\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "movi v16.4s, #0x9\n"
+ "movi v31.16b, #0x0\n"
+ "mov x21, #0x3\n"
+ "mul x21, %x[ld_weight_col], x21\n"
+ "add x20, %x[qp], %[offsetof_input_offset]\n"
+ "ld1r { v30.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_weights_offset]\n"
+ "ld1r { v29.4s }, [x20]\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x21, NE\n"
+ "lsr x21, %x[n_channels], #0x2\n"
+ "movi v28.16b, #0x1\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
+ "add x25, %x[weights], %x[ld_weight_row]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_mul]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_per_layer_right_shift]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x24, x25, %x[ld_weight_row]\n"
+ "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "mov x22, #0x0\n"
+ "cbz x21, 4f\n"
+ "1:" // Loop
+ "movi v25.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q25, [%x[bias], x22]\n"
+ "2:" // Loop: Skip bias load
+ "ldr s19, [%x[weights], #0x0]\n"
+ "ldr s16, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 v17.16b, v16.16b, v31.16b\n"
+ "movi v21.4s, #0x0\n"
+ "ldr s16, [%x[weights], x23]\n"
+ "ldr s18, [x25, #0x0]\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v20.16b, v16.16b, v17.16b\n"
+ "ldr s17, [x25, %x[ld_weight_col]]\n"
+ "ldr s16, [x25, x23]\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "zip1 v16.16b, v17.16b, v31.16b\n"
+ "ldr s17, [x24, #0x0]\n"
+ "ldr s19, [x24, %x[ld_weight_col]]\n"
+ ".inst 0x6e949795 // udot v21.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v18.16b, v16.16b\n"
+ "ldr s16, [x24, x23]\n"
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ ".inst 0x6e929795 // udot v21.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ ".inst 0x6e909795 // udot v21.4s, v28.16b, v16.16b\n"
+ "add %x[weights], %x[weights], #0x4\n"
+ "add x25, x25, #0x4\n"
+ "mls v25.4s, v21.4s, v30.4s\n"
+ "add x24, x24, #0x4\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ldr q27, [%x[rq_mul_perchannel], x22]\n"
+ "ldr q26, [%x[rq_shift_perchannel], x22]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "subs x21, x21, #0x1\n"
+ "str q27, [%x[outptr], #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "str q26, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "bgt 1b\n"
+ "tst %x[n_channels], #0x3\n"
+ "beq 13f\n"
+ "4:" // Oddments
+ "movi v25.4s, #0x0\n"
+ "cbz %x[bias], 7f\n"
+ "add %x[bias], %x[bias], x22\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v25.d }[0], [%x[bias]], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v25.s }[2], [%x[bias]], #0x4\n"
+ "b 6f\n"
+ "5:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v25.s }[0], [%x[bias]], #0x4\n"
+ "6:" // Oddments: Load bias: Bit 1: End
+ "7:" // Oddments: Skip bias load
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v17.h }[0], [%x[weights]]\n"
+ "ld1 { v24.h }[0], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.h }[0], [x21]\n"
+ "ld1 { v16.h }[0], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.h }[0], [x21]\n"
+ "ld1 { v18.h }[0], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.h }[0], [x24]\n"
+ "ld1 { v22.h }[0], [x21]\n"
+ "add %x[weights], %x[weights], #0x2\n"
+ "add x25, x25, #0x2\n"
+ "ld1 { v21.h }[0], [x20]\n"
+ "add x24, x24, #0x2\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v17.b }[2], [%x[weights]]\n"
+ "ld1 { v24.b }[2], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x21]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 9f\n"
+ "8:" // Oddments: Load weights: Bit 1: Unset
+ "ld1 { v17.b }[0], [%x[weights]]\n"
+ "ld1 { v24.b }[0], [x25]\n"
+ "add x21, %x[weights], %x[ld_weight_col]\n"
+ "add x20, %x[weights], x23\n"
+ "ld1 { v20.b }[0], [x21]\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "add x21, x25, %x[ld_weight_col]\n"
+ "add x20, x25, x23\n"
+ "ld1 { v19.b }[0], [x21]\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "add x21, x24, %x[ld_weight_col]\n"
+ "add x20, x24, x23\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v22.b }[0], [x21]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "9:" // Oddments: Load weights: Bit 1: End
+ "zip1 v17.16b, v17.16b, v16.16b\n"
+ "zip1 v16.16b, v20.16b, v31.16b\n"
+ "zip1 v20.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v24.16b, v18.16b\n"
+ "zip1 v16.16b, v19.16b, v31.16b\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6e949793 // udot v19.4s, v28.16b, v20.16b\n"
+ "zip1 v18.16b, v17.16b, v16.16b\n"
+ "zip1 v17.16b, v23.16b, v21.16b\n"
+ ".inst 0x6e929793 // udot v19.4s, v28.16b, v18.16b\n"
+ "zip1 v16.16b, v22.16b, v31.16b\n"
+ "zip1 v16.16b, v17.16b, v16.16b\n"
+ ".inst 0x6e909793 // udot v19.4s, v28.16b, v16.16b\n"
+ "mls v25.4s, v19.4s, v30.4s\n"
+ "add v25.4s, v25.4s, v29.4s\n"
+ "str q25, [%x[outptr], #0x0]\n"
+ "str q20, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 12f\n"
+ "add x21, %x[rq_mul_perchannel], x22\n"
+ "add x20, %x[rq_shift_perchannel], x22\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v27.d }[0], [x21], #0x8\n"
+ "ld1 { v26.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "ld1 { v26.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
+ "12:" // Oddments: Quantisation parameters: Store
+ "str q27, [%x[outptr], #0x0]\n"
+ "str q26, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "13:" // End
+ : [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp
new file mode 100644
index 0000000000..dc505a013d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "generic.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <utility>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+
+// Capture the geometry and element sizes needed by the generic parameter
+// packing routines (get_storage_size_generic / pack_parameters_generic).
+//
+// @param kernel_rows, kernel_cols    Kernel spatial extent.
+// @param weight_element_size         Bytes per packed weight element.
+// @param include_bias                Whether a bias vector is packed per channel group.
+// @param bias_element_size           Bytes per bias element.
+// @param premultiply                 When set, disables the per-input-channel repeated
+//                                    packing otherwise used for channel multipliers > 1.
+// @param vl_type                     Vector-length type used to size channel groups.
+// @param accumulator_element_size    Bytes per accumulator element.
+// @param accumulator_depth_vl        Accumulator vectors per channel group.
+// @param get_weight_pos              Maps a kernel-point index to (row, col) out-params;
+//                                    returns false once the index is exhausted.
+PackingArguments::PackingArguments(
+  unsigned int kernel_rows, unsigned int kernel_cols, size_t weight_element_size,
+  bool include_bias, size_t bias_element_size, bool premultiply,
+  arm_gemm::VLType vl_type, size_t accumulator_element_size, unsigned int accumulator_depth_vl,
+  std::function<bool(unsigned int, unsigned int &, unsigned int &)> get_weight_pos
+) : kernel_rows(kernel_rows), kernel_cols(kernel_cols), weight_element_size(weight_element_size),
+    include_bias(include_bias), bias_element_size(bias_element_size), premultiply(premultiply),
+    vl_type(vl_type), accumulator_element_size(accumulator_element_size), accumulator_depth_vl(accumulator_depth_vl),
+    get_weight_pos(std::move(get_weight_pos))  // move rather than copy the std::function's state
+{
+}
+
+size_t get_storage_size_generic(const PackingArguments &packing_args, const DepthwiseArgs &args)
+{
+ // With a channel multiplier (and no premultiply), the packing is simply
+ // repeated once per input channel on a multiplier-sized sub-problem.
+ const bool repeat_per_input_channel = args.channel_multiplier > 1 && !packing_args.premultiply;
+ if (repeat_per_input_channel)
+ {
+ DepthwiseArgs sub_args(args);
+ sub_args.input_channels = args.channel_multiplier;
+ sub_args.channel_multiplier = 1;
+ return args.input_channels * get_storage_size_generic(packing_args, sub_args);
+ }
+
+ // Channels are packed in groups of `vl`; each group holds an optional
+ // bias element plus one weight element per kernel point, per channel.
+ const unsigned int vl =
+ packing_args.accumulator_depth_vl *
+ arm_gemm::utils::get_vector_length<uint8_t>(packing_args.vl_type) / packing_args.accumulator_element_size;
+ const unsigned int n_groups = arm_gemm::iceildiv(args.input_channels * args.channel_multiplier, vl);
+ const size_t bias_bytes = packing_args.include_bias ? packing_args.bias_element_size : 0;
+ const size_t group_stride = bias_bytes + packing_args.kernel_points() * packing_args.weight_element_size;
+ return n_groups * group_stride * vl;
+}
+
+// Pack biases and weights into `buffer_raw` in the layout consumed by the
+// generic depthwise kernels: channels are processed in groups of `vl`, and
+// each group stores an (optional) bias vector followed by one weight vector
+// per kernel point, matching get_storage_size_generic().
+//
+// @param buffer_raw   Destination; must be at least get_storage_size_generic() bytes.
+// @param biases_raw   May be null, in which case zero biases are written.
+// @param weights_raw  Source weights indexed as row*ld_weight_row + col*ld_weight_col + channel
+//                     (in elements of weight_element_size).
+// @param ld_weight_col, ld_weight_row
+//                     Element strides between kernel columns/rows; zero selects the
+//                     dense defaults (total channels, and kernel_cols * ld_weight_col).
+void pack_parameters_generic(
+ const PackingArguments &packing_args,
+ const DepthwiseArgs &args,
+ void *buffer_raw,
+ const void *biases_raw,
+ const void *weights_raw,
+ size_t ld_weight_col,
+ size_t ld_weight_row
+)
+{
+ // Cast the pointers to byte sizes
+ auto *buffer = static_cast<uint8_t *>(buffer_raw);
+ auto *biases = static_cast<const uint8_t *>(biases_raw);
+ auto *weights = static_cast<const uint8_t *>(weights_raw);
+
+ // If the channel multiplier is greater than one, then we treat this as a
+ // repeated packing of `channel_multiplier`-sized problems.
+ if (args.channel_multiplier > 1 && !packing_args.premultiply)
+ {
+ // Get a modified copy of the depthwise arguments
+ DepthwiseArgs args_per_input_channel(args);
+ args_per_input_channel.input_channels = args.channel_multiplier;
+ args_per_input_channel.channel_multiplier = 1;
+
+ // Resolve the strides here
+ ld_weight_col = ld_weight_col ? ld_weight_col : args.input_channels * args.channel_multiplier;
+ ld_weight_row = ld_weight_row ? ld_weight_row : ld_weight_col * packing_args.kernel_cols;
+
+ auto per_input_channel_size = get_storage_size_generic(packing_args, args_per_input_channel);
+
+ for (unsigned int c = 0; c < args.input_channels; c++)
+ {
+ pack_parameters_generic(
+ packing_args, args_per_input_channel, buffer, biases, weights, ld_weight_col, ld_weight_row);
+
+ // Update the pointers
+ // (biases may be null; the ternary keeps the pointer untouched then)
+ buffer += per_input_channel_size;
+ biases += (biases == nullptr) ? 0 : packing_args.bias_element_size * args.channel_multiplier;
+ weights += packing_args.weight_element_size * args.channel_multiplier;
+ }
+ return;
+ }
+
+ auto input_channels = args.input_channels * args.channel_multiplier;
+
+ // Finalise the weight strides
+ ld_weight_col = (ld_weight_col == 0) ? input_channels : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? packing_args.kernel_cols * ld_weight_col : ld_weight_row;
+
+ // Number of channels handled per group: accumulator vectors scaled to bytes
+ // of uint8_t vector length, divided by the accumulator element size.
+ const unsigned int vl =
+ packing_args.accumulator_depth_vl *
+ arm_gemm::utils::get_vector_length<uint8_t>(packing_args.vl_type) / packing_args.accumulator_element_size;
+
+ for (unsigned int n = 0; n < input_channels; n += vl)
+ {
+ // `todo` < vl only for the final, partial channel group.
+ const unsigned int todo = std::min(vl, input_channels - n);
+
+ if (packing_args.include_bias)
+ {
+ if (biases != nullptr)
+ {
+ // NOTE(review): only `todo` bias bytes are copied for a partial group;
+ // the remaining (vl - todo) bytes of the bias vector are left
+ // unwritten -- presumably the kernels never read them; confirm.
+ memcpy(buffer, biases, todo * packing_args.bias_element_size);
+ biases += todo * packing_args.bias_element_size;
+ }
+ else
+ {
+ memset(buffer, 0, vl * packing_args.bias_element_size);
+ }
+
+ buffer += vl * packing_args.bias_element_size;
+ }
+
+ // Copy each of the weights in turn
+ // (iteration order over kernel points is dictated by get_weight_pos)
+ unsigned int kx, ky;
+ for (int kindex = 0; packing_args.get_weight_pos(kindex, kx, ky); kindex++)
+ {
+ const auto src_ptr = weights + (kx*ld_weight_row + ky*ld_weight_col + n) * packing_args.weight_element_size;
+ memcpy(buffer, src_ptr, todo * packing_args.weight_element_size);
+ buffer += vl * packing_args.weight_element_size;
+ }
+ }
+}
+
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
new file mode 100644
index 0000000000..1842f10150
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+#include "depthwise.hpp"
+
+#include <functional>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+
+// Value bag describing how depthwise parameters (bias + weights) are packed;
+// consumed by get_storage_size_generic() and pack_parameters_generic().
+struct PackingArguments
+{
+ const unsigned int kernel_rows;  // kernel height
+ const unsigned int kernel_cols;  // kernel width
+ const size_t weight_element_size;  // bytes per packed weight element
+ const bool include_bias;  // pack a bias vector per channel group?
+ const size_t bias_element_size;  // bytes per bias element
+ const bool premultiply;  // disables per-input-channel repeat for channel multipliers
+ arm_gemm::VLType vl_type;  // vector-length type used to size channel groups
+ const size_t accumulator_element_size;  // bytes per accumulator element
+ const unsigned int accumulator_depth_vl;  // accumulator vectors per channel group
+ // Maps a kernel-point index to (row, col) out-params; returns false once
+ // the index is exhausted.
+ std::function<bool(unsigned int, unsigned int &, unsigned int &)> get_weight_pos;
+
+ // Total number of kernel points (rows x cols).
+ unsigned int kernel_points(void) const { return kernel_cols * kernel_rows; }
+
+ PackingArguments(
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ size_t weight_element_size,
+ bool include_bias,
+ size_t bias_element_size,
+ bool premultiply,
+ arm_gemm::VLType vl_type,
+ size_t accumulator_element_size,
+ unsigned int accumulator_depth_vl,
+ std::function<bool(unsigned int, unsigned int &, unsigned int &)> get_weight_pos
+ );
+};
+
+size_t get_storage_size_generic(
+ const PackingArguments &packing_args,
+ const DepthwiseArgs &args
+);
+
+void pack_parameters_generic(
+ const PackingArguments &packing_args,
+ const DepthwiseArgs &args,
+ void *buffer_raw,
+ const void *biases_raw,
+ const void *weights_raw,
+ size_t ld_weight_col,
+ size_t ld_weight_row
+);
+
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp
new file mode 100644
index 0000000000..a6389054d1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "generic_quantized_dot_product.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+namespace quantized {
+
+// Returns the buffer size in bytes required by quantized::pack_parameters()
+// for a depthwise convolution of shape `args` on a target with the given
+// vector-length type and accumulator depth.
+size_t get_storage_size(
+ const DepthwiseArgs &args,
+ const arm_gemm::VLType vl_type,
+ const unsigned int accumulator_depth_vl
+)
+{
+ // We produce VL<int32_t> channels at a time, for each of these blocks of
+ // channels we store a vector of biases, weights (complicated) and
+ // requantize parameters.
+ const unsigned int iter_length = accumulator_depth_vl * arm_gemm::utils::get_vector_length<int32_t>(vl_type);
+ const unsigned int n_iters = args.input_channels * arm_gemm::iceildiv(args.channel_multiplier, iter_length);
+
+ // Compute the cost of storing the weights
+ // Each dot-product instruction consumes four weights, so kernel columns
+ // are padded up to a multiple of four.
+ const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(args.kernel_cols, 4u);
+
+ return n_iters * iter_length * (
+ sizeof(int32_t) + // Bias
+ 4 * n_dots_per_kernel_row * args.kernel_rows * sizeof(int8_t) + // Weights
+ 2 * sizeof(int32_t) // Requantisation parameters
+ );
+}
+
+// Interleave biases, weights and requantisation parameters for a quantized
+// dot-product depthwise kernel.
+//
+// For each input channel, the `args.channel_multiplier` output channels are
+// processed in blocks of `iter_length`. Each block is a fixed-size "frame"
+// of `iter_stride` bytes holding: a vector of zero-point-corrected biases,
+// the weights zero-padded and arranged for 4-way dot-product instructions,
+// then per-channel requantisation multiplier and shift vectors.
+//
+// `_buffer` must be at least get_storage_size(args, vl_type,
+// accumulator_depth_vl) bytes. `biases` may be nullptr (zero bias is used).
+// Zero values of `ld_weight_col` / `ld_weight_row` select the default dense
+// strides computed below.
+template <typename T>
+void pack_parameters(
+ void *_buffer, const int32_t *biases,
+ const T *weights, size_t ld_weight_col, size_t ld_weight_row,
+ const DepthwiseArgs &args,
+ const arm_gemm::Requantize32 &qp,
+ const arm_gemm::VLType vl_type,
+ const unsigned int accumulator_depth_vl
+)
+{
+ auto buffer = static_cast<uint8_t *>(_buffer);
+ auto requant_muls = qp.per_channel_muls;
+ auto requant_shifts = qp.per_channel_right_shifts;
+
+ // Number of channels handled per iteration: a whole number of
+ // int32 accumulator vectors.
+ const unsigned int iter_length = accumulator_depth_vl * arm_gemm::utils::get_vector_length<int32_t>(vl_type);
+ const unsigned int n_iters_per_input_channel = arm_gemm::iceildiv(args.channel_multiplier, iter_length);
+ // Each dot product consumes four weights; pad columns up to a multiple of four.
+ const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(args.kernel_cols, 4u);
+
+ // Bytes occupied by one frame (must agree with get_storage_size above).
+ const size_t iter_stride = iter_length * (
+ sizeof(int32_t) + // Bias
+ 4 * n_dots_per_kernel_row * args.kernel_rows * sizeof(T) + // Weights
+ 2 * sizeof(int32_t) // Requantisation parameters
+ );
+
+ // Zero strides select the dense default layout.
+ ld_weight_col = (ld_weight_col == 0) ? args.input_channels * args.channel_multiplier : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? args.kernel_cols * ld_weight_col : ld_weight_row;
+
+ for (unsigned int input_channel = 0; input_channel < args.input_channels; input_channel++)
+ {
+ auto buffer_input_channel = buffer + input_channel * n_iters_per_input_channel * iter_stride;
+ auto weights_input_channel = weights + input_channel * args.channel_multiplier;
+
+ for (unsigned int iter = 0; iter < n_iters_per_input_channel; iter++)
+ {
+ // Get a pointer to the start of this portion of the buffer; consequently
+ // derive pointers to the bias, weight and requantisation portions of
+ // this frame.
+ auto buffer_base = buffer_input_channel + iter_stride * iter;
+ auto buffer_biases = reinterpret_cast<int32_t *>(buffer_base);
+ auto buffer_weights = buffer_base + sizeof(int32_t) * iter_length;
+ auto buffer_requant_mul = reinterpret_cast<int32_t *>(
+ buffer_weights + args.kernel_rows * n_dots_per_kernel_row * 4 * iter_length);
+ auto buffer_requant_shift = buffer_requant_mul + iter_length;
+ auto weights_base = weights_input_channel + iter * iter_length;
+
+ // Hence work through the data for this iteration, on a
+ // channel-by-channel basis.
+ const auto this_iter_length = std::min<unsigned int>(
+ iter_length, args.channel_multiplier - iter * iter_length
+ );
+ for (unsigned int i = 0; i < this_iter_length; i++)
+ {
+ auto weights_channel = weights_base + i;
+
+ // Read the bias value, we modify this as we read the weights.
+ auto bias_value = biases == nullptr ? 0 : *(biases++);
+ int32_t elements_sum = 0;
+
+ // Read through the kernel; for each row, marshal together as many dot
+ // product terms as are required.
+ for (unsigned int ki = 0; ki < args.kernel_rows; ki++)
+ {
+ auto buffer_row = buffer_weights + i*4 + ki * 4 * n_dots_per_kernel_row * iter_length;
+ auto weights_row = weights_channel + ki * ld_weight_row;
+
+ unsigned int kj = 0;
+ for (; kj < args.kernel_cols; kj++)
+ {
+ // Determine which element to which we're writing
+ const auto dot = kj / 4;
+ const auto elem = kj % 4;
+
+ // Copy the value; include in the sum
+ const auto val = weights_row[kj * ld_weight_col];
+ buffer_row[dot * 4 * iter_length + elem] = val;
+ elements_sum += val;
+ }
+ // Zero-pad any remaining elements of the final dot-product block.
+ for (; kj < 4 * n_dots_per_kernel_row; kj++)
+ {
+ const auto dot = kj / 4;
+ const auto elem = kj % 4;
+ buffer_row[dot * 4 * iter_length + elem] = 0;
+ }
+
+ // NOTE(review): this increment is redundant - buffer_row is
+ // recomputed from ki at the top of each row iteration.
+ buffer_row += 4 * n_dots_per_kernel_row * iter_length;
+ }
+
+ // Write back the bias and offset values
+ // Fold the zero-point correction into the bias:
+ //   bias - a_offset * sum(weights) + kernel_points * a_offset * b_offset.
+ *(buffer_biases++) =
+ bias_value - qp.a_offset * elements_sum +
+ args.kernel_rows * args.kernel_cols * qp.a_offset * qp.b_offset;
+
+ // Write out the requantisation parameters
+ *(buffer_requant_mul++) = qp.per_channel_requant ? *(requant_muls++) : qp.per_layer_mul;
+ *(buffer_requant_shift++) = qp.per_channel_requant ? *(requant_shifts++) : qp.per_layer_right_shift;
+ }
+ }
+ }
+}
+
+// Explicit instantiations for the two supported quantized weight types.
+template void pack_parameters(void *, const int32_t *, const int8_t *, size_t, size_t, const DepthwiseArgs &, const arm_gemm::Requantize32 &, arm_gemm::VLType, unsigned int);
+template void pack_parameters(void *, const int32_t *, const uint8_t *, size_t, size_t, const DepthwiseArgs &, const arm_gemm::Requantize32 &, arm_gemm::VLType, unsigned int);
+
+} // namespace quantized
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp
new file mode 100644
index 0000000000..779d67d3f4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "generic.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+namespace quantized {
+
+// Returns the buffer size in bytes required by pack_parameters() below.
+size_t get_storage_size(
+ const DepthwiseArgs &args,
+ arm_gemm::VLType vl_type,
+ unsigned int accumulator_depth_vl=1
+);
+
+// Interleaves biases (`biases` may be nullptr), weights and requantisation
+// parameters from `qp` into `buffer` for a quantized dot-product depthwise
+// kernel. Zero weight strides select the default dense layout. Instantiated
+// for T = int8_t and T = uint8_t in the corresponding .cpp file.
+template <typename T>
+void pack_parameters(
+ void *buffer, const int32_t *biases,
+ const T *weights, size_t ld_weight_col, size_t ld_weight_row,
+ const DepthwiseArgs &args,
+ const arm_gemm::Requantize32 &qp,
+ arm_gemm::VLType vl_type,
+ unsigned int accumulator_depth_vl
+);
+
+} // namespace quantized
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
new file mode 100644
index 0000000000..76f38eb335
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+// Interleave strategies for the 3x3 dot-product depthwise kernels. Each
+// struct pairs a parameter packer with its packed-size query; definitions
+// live in the per-architecture .cpp files in this directory.
+// NOTE(review): this header has no includes of its own - it relies on the
+// including translation unit to provide int32_t, DepthwiseArgs and
+// arm_gemm::Requantize32.
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+// SVE, unsigned 8-bit quantized.
+struct interleave_sve_u8q_3x3_dot
+{
+ static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+// SVE, signed 8-bit quantized.
+struct interleave_sve_s8q_3x3_dot
+{
+ static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+
+// A64 (fixed-width NEON), unsigned 8-bit quantized.
+struct interleave_a64_u8q_3x3_dot
+{
+ static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+// A64 (fixed-width NEON), signed 8-bit quantized.
+struct interleave_a64_s8q_3x3_dot
+{
+ static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
new file mode 100644
index 0000000000..5d7b54f235
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+// Local declaration of the SVE s8 3x3 dot-product interleave strategy; the
+// member signatures must match the declarations in interleaves/list.hpp.
+struct interleave_sve_s8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+// Returns the buffer size in bytes needed by pack_parameters() below. The
+// channel count is split into SVE int32-vector-sized blocks and the block
+// count is rounded up to a multiple of four.
+size_t interleave_sve_s8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+ // We store 7 vectors for every <vector_of_ints> of channels.
+ // NOTE(review): the packing loop in pack_parameters() stores 6 vectors per
+ // block (bias + 3 weight + 2 requantisation); the 7 here presumably leaves
+ // deliberate headroom - confirm before tightening this bound.
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels * args.channel_multiplier,
+ get_vector_length<int32_t>(arm_gemm::VLType::SVE)), 4lu
+ );
+ return n * 7 * get_vector_length<int8_t>(arm_gemm::VLType::SVE);
+}
+
+// Interleave parameters for the SVE s8 3x3 dot-product depthwise kernel.
+// Per block of VL<int32> channels the loop stores: one vector of biases with
+// the zero-point correction folded in (bias - a_offset*sum(weights) +
+// 9*a_offset*b_offset), three vectors of zipped weights arranged for SDOT,
+// and two vectors of requantisation parameters (multipliers then shifts).
+// `bias` may be nullptr (p8 stays false so zero biases are loaded). A zero
+// `ld_weight_col` defaults to `n_channels`; a zero `ld_weight_row` defaults
+// to 3 * ld_weight_col.
+void interleave_sve_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ // Apply default weight strides; set up constants and the per-layer
+ // requantisation parameters (z24 = mul, z23 = shift).
+ "cmp %x[ld_weight_col], XZR\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "mov z16.s, #0x9\n"
+ "mov z28.b, #0x0\n"
+ "mov x20, #0x3\n"
+ "ptrue p2.b\n"
+ "mul x20, %x[ld_weight_col], x20\n"
+ "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
+ "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x20, NE\n"
+ "mov z25.b, #0x1\n"
+ // z26 = 9 * a_offset * b_offset, added to every corrected bias below.
+ "mul z26.s, p2/M, z26.s, z27.s\n"
+ "add x24, %x[weights], %x[ld_weight_row]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+ "add x23, x24, %x[ld_weight_row]\n"
+ "add x22, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "mov x21, #0x0\n"
+ "mul z26.s, p2/M, z26.s, z16.s\n"
+ // p8 gates the bias load: all-false when bias == nullptr.
+ "pfalse p8.b\n"
+ "cbz %x[bias], 1f\n"
+ "ptrue p8.s\n"
+ "1:" // No bias
+ "2:" // Loop
+ // Load the three kernel rows, zip the nine weights into three vectors
+ // and accumulate their per-channel sum via SDOT with a vector of ones.
+ "cntp x20, p2, p1.s\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z18.b }, p0/Z, [%x[weights]]\n"
+ "ld1b { z17.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], x22]\n"
+ "zip1 z20.b, z18.b, z16.b\n"
+ "zip1 z19.b, z17.b, z28.b\n"
+ "ld1b { z18.b }, p0/Z, [x24]\n"
+ "ld1b { z17.b }, p0/Z, [x24, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x24, x22]\n"
+ "zip1 z22.b, z20.b, z19.b\n"
+ "zip1 z21.b, z18.b, z16.b\n"
+ "zip1 z19.b, z17.b, z28.b\n"
+ "mov z20.s, #0x0\n"
+ "ld1b { z18.b }, p0/Z, [x23]\n"
+ "ld1b { z17.b }, p0/Z, [x23, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x23, x22]\n"
+ "sdot z20.s, z25.b, z22.b\n"
+ "zip1 z19.b, z21.b, z19.b\n"
+ "sdot z20.s, z25.b, z19.b\n"
+ "zip1 z18.b, z18.b, z16.b\n"
+ "zip1 z16.b, z17.b, z28.b\n"
+ "and p0.b, p2/Z, p8.b, p1.b\n"
+ "ld1w { z17.s }, p0/Z, [%x[bias], x21, LSL #2]\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "sdot z20.s, z25.b, z16.b\n"
+ // Correct the bias: z17 -= sum(weights) * a_offset, then += z26.
+ "mls z17.s, p2/M, z20.s, z27.s\n"
+ "add %x[weights], %x[weights], x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add z17.s, z17.s, z26.s\n"
+ // Store one bias vector and three zipped weight vectors.
+ "st1w { z17.s }, p2, [%x[outptr]]\n"
+ "st1b { z22.b }, p2, [%x[outptr], #1, MUL VL]\n"
+ "st1b { z19.b }, p2, [%x[outptr], #2, MUL VL]\n"
+ "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #4\n"
+ // If per-channel requantisation parameters exist, overwrite the
+ // per-layer defaults in z24/z23 before storing them.
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ld1w { z24.s }, p1/Z, [%x[rq_mul_perchannel], x21, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [%x[rq_shift_perchannel], x21, LSL #2]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "incw x21\n"
+ "whilelt p1.s, x21, %x[n_channels]\n"
+ "st1w { z24.s }, p2, [%x[outptr]]\n"
+ "st1w { z23.s }, p2, [%x[outptr], #1, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #2\n"
+ "b.any 2b\n"
+ : [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "p0", "p1", "p2", "p8", "x20", "x21", "x22", "x23", "x24", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
new file mode 100644
index 0000000000..c3da81448b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+#include "depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+// Local declaration of the SVE u8 3x3 dot-product interleave strategy; the
+// member signatures must match the declarations in interleaves/list.hpp.
+struct interleave_sve_u8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+// Returns the buffer size in bytes needed by pack_parameters() below. The
+// channel count is split into SVE int32-vector-sized blocks and the block
+// count is rounded up to a multiple of four.
+size_t interleave_sve_u8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+ // We store 7 vectors for every <vector_of_ints> of channels.
+ // NOTE(review): the packing loop in pack_parameters() stores 6 vectors per
+ // block (bias + 3 weight + 2 requantisation); the 7 here presumably leaves
+ // deliberate headroom - confirm before tightening this bound.
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels * args.channel_multiplier,
+ get_vector_length<int32_t>(arm_gemm::VLType::SVE)), 4lu
+ );
+ return n * 7 * get_vector_length<uint8_t>(arm_gemm::VLType::SVE);
+}
+
+// Interleave parameters for the SVE u8 3x3 dot-product depthwise kernel.
+// Identical in structure to the s8 variant, but sums weights with UDOT. Per
+// block of VL<int32> channels the loop stores: one vector of biases with the
+// zero-point correction folded in (bias - a_offset*sum(weights) +
+// 9*a_offset*b_offset), three vectors of zipped weights arranged for UDOT,
+// and two vectors of requantisation parameters (multipliers then shifts).
+// `bias` may be nullptr (p8 stays false so zero biases are loaded). A zero
+// `ld_weight_col` defaults to `n_channels`; a zero `ld_weight_row` defaults
+// to 3 * ld_weight_col.
+void interleave_sve_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ // Apply default weight strides; set up constants and the per-layer
+ // requantisation parameters (z24 = mul, z23 = shift).
+ "cmp %x[ld_weight_col], XZR\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "mov z16.s, #0x9\n"
+ "mov z28.b, #0x0\n"
+ "mov x20, #0x3\n"
+ "ptrue p2.b\n"
+ "mul x20, %x[ld_weight_col], x20\n"
+ "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
+ "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x20, NE\n"
+ "mov z25.b, #0x1\n"
+ // z26 = 9 * a_offset * b_offset, added to every corrected bias below.
+ "mul z26.s, p2/M, z26.s, z27.s\n"
+ "add x24, %x[weights], %x[ld_weight_row]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+ "add x23, x24, %x[ld_weight_row]\n"
+ "add x22, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "mov x21, #0x0\n"
+ "mul z26.s, p2/M, z26.s, z16.s\n"
+ // p8 gates the bias load: all-false when bias == nullptr.
+ "pfalse p8.b\n"
+ "cbz %x[bias], 1f\n"
+ "ptrue p8.s\n"
+ "1:" // No bias
+ "2:" // Loop
+ // Load the three kernel rows, zip the nine weights into three vectors
+ // and accumulate their per-channel sum via UDOT with a vector of ones.
+ "cntp x20, p2, p1.s\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ld1b { z18.b }, p0/Z, [%x[weights]]\n"
+ "ld1b { z17.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], x22]\n"
+ "zip1 z20.b, z18.b, z16.b\n"
+ "zip1 z19.b, z17.b, z28.b\n"
+ "ld1b { z18.b }, p0/Z, [x24]\n"
+ "ld1b { z17.b }, p0/Z, [x24, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x24, x22]\n"
+ "zip1 z22.b, z20.b, z19.b\n"
+ "zip1 z21.b, z18.b, z16.b\n"
+ "zip1 z19.b, z17.b, z28.b\n"
+ "mov z20.s, #0x0\n"
+ "ld1b { z18.b }, p0/Z, [x23]\n"
+ "ld1b { z17.b }, p0/Z, [x23, %x[ld_weight_col]]\n"
+ "ld1b { z16.b }, p0/Z, [x23, x22]\n"
+ "udot z20.s, z25.b, z22.b\n"
+ "zip1 z19.b, z21.b, z19.b\n"
+ "udot z20.s, z25.b, z19.b\n"
+ "zip1 z18.b, z18.b, z16.b\n"
+ "zip1 z16.b, z17.b, z28.b\n"
+ "and p0.b, p2/Z, p8.b, p1.b\n"
+ "ld1w { z17.s }, p0/Z, [%x[bias], x21, LSL #2]\n"
+ "zip1 z16.b, z18.b, z16.b\n"
+ "udot z20.s, z25.b, z16.b\n"
+ // Correct the bias: z17 -= sum(weights) * a_offset, then += z26.
+ "mls z17.s, p2/M, z20.s, z27.s\n"
+ "add %x[weights], %x[weights], x20\n"
+ "add x24, x24, x20\n"
+ "add x23, x23, x20\n"
+ "add z17.s, z17.s, z26.s\n"
+ // Store one bias vector and three zipped weight vectors.
+ "st1w { z17.s }, p2, [%x[outptr]]\n"
+ "st1b { z22.b }, p2, [%x[outptr], #1, MUL VL]\n"
+ "st1b { z19.b }, p2, [%x[outptr], #2, MUL VL]\n"
+ "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #4\n"
+ // If per-channel requantisation parameters exist, overwrite the
+ // per-layer defaults in z24/z23 before storing them.
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ld1w { z24.s }, p1/Z, [%x[rq_mul_perchannel], x21, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [%x[rq_shift_perchannel], x21, LSL #2]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "incw x21\n"
+ "whilelt p1.s, x21, %x[n_channels]\n"
+ "st1w { z24.s }, p2, [%x[outptr]]\n"
+ "st1w { z23.s }, p2, [%x[outptr], #1, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #2\n"
+ "b.any 2b\n"
+ : [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "p0", "p1", "p2", "p8", "x20", "x21", "x22", "x23", "x24", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6beaba841f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Kernel entry points; implementations are provided in separate translation
+// units (see the directory of the same name).
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy object describing a 3x3, stride-1, fp16 NHWC depthwise
+// multiply-accumulate kernel which computes a 2x2 output tile per call.
+class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ // Function pointers to the two kernel variants declared above.
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ // VLType::None: not a scalable-vector (SVE) kernel.
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..d8ca3d7437
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (pointer-arithmetic) entry point for the AArch64 FP16 NHWC
+// 3x3 stride-1 depthwise kernel producing a 2x2 output tile per pass.
+// Walks an n_tile_rows x n_tile_cols grid of tiles (tile_i/tile_j are
+// kept in the Args block and updated by the assembly between tiles),
+// deriving each tile's input/output addresses from the row/column
+// strides, and clamps every result to [activation_min, activation_max]
+// (v27 = min, v26 = max).  Per tile the assembly runs a vectorised
+// channel loop over n_channels / 8 groups of eight fp16 lanes, a
+// channel tail, and an "oddments" section that services the remaining
+// n_channels % 8 lanes with element-wise loads and stores.
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ // Argument block read by the assembly through offsetof(); field order
+ // and types must stay in sync with the asm operand list at the bottom.
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ // Current tile coordinates; stored/reloaded by the assembly's tile loop.
+ uint64_t tile_i = 0, tile_j = 0;
+
+ // NOTE(review): activation_min/max are taken here as float although the
+ // members and the outer function's parameters are __fp16 — the implicit
+ // narrowing conversion appears to be an artefact of the kernel
+ // generator; confirm it is intentional upstream.
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ // Label map: 1 = per-tile loop head (address computation), 2 = channel
+ // loop (8 lanes/iteration), 3 = channel tail, 4..56 = oddment
+ // (n_channels % 8) load/compute/store paths selected by tbz on the low
+ // bits of n_channels, 57 = advance tile_j/tile_i and loop or fall out.
+ __asm__ __volatile__(
+ "mov x23, #0x0\n"
+ "mov x22, #0x0\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x2\n"
+ "mov x26, #0x2\n"
+ "str x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x23, x25\n" // offset = tile_i * ld_input_row
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x23, x24\n" // offset = tile_i * ld_output_row
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x22, x15, x21\n" // offset += tile_j * ld_input_col
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x15, x15, #0x1\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x22, x14, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x3\n"
+ "add x11, x15, x15\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x13, x13, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x9, x13, x25, LSL #1\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x28, x9, x25, LSL #1\n"
+ "add x12, x12, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x27, x28, x25, LSL #1\n"
+ "add x26, x11, x15\n"
+ "add x25, x12, x24, LSL #1\n"
+ "lsl x14, x14, #0x1\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "add x10, x10, #0xa0\n"
+ "ldr q9, [x9, x15]\n"
+ "ld1 { v10.8h }, [x13]\n"
+ "ldr q11, [x13, x26]\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q13, [x28, x15]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "add x23, x23, #0x10\n"
+ "cmp x23, x22, LSL #4\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ld1 { v18.8h }, [x27]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ld1 { v17.8h }, [x9]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "ldr q4, [x10, #0x50]\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x28]\n"
+ "ldr q1, [x10, #0x20]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "ldr q0, [x10, #0x10]\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "ldr q2, [x10, #0x30]\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "ldr q13, [x28, x15]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x27, x15]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "ldr q11, [x13, x26]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "ldr q9, [x9, x15]\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "ld1 { v10.8h }, [x13]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "ldr q8, [x10, #0x90]\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "st1 { v24.8h }, [x12]\n"
+ "add x10, x10, #0xa0\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q21, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ld1 { v18.8h }, [x27]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ld1 { v17.8h }, [x9]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x28]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x27, x15]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "st1 { v24.8h }, [x12]\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q21, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 57f\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "add x24, x9, x15\n"
+ "add x23, x13, XZR\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "add x22, x13, x26\n"
+ "add x21, x9, x11\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "add x20, x28, x15\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[6], [x24]\n"
+ "ld1 { v10.h }[6], [x23]\n"
+ "ld1 { v11.h }[6], [x22]\n"
+ "ld1 { v12.h }[6], [x21]\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[4], [x24]\n"
+ "ld1 { v10.h }[4], [x23]\n"
+ "ld1 { v11.h }[4], [x22]\n"
+ "ld1 { v12.h }[4], [x21]\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x24], #0x4\n"
+ "ldr s10, [x23], #0x4\n"
+ "ldr s11, [x22], #0x4\n"
+ "ldr s12, [x21], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x24]\n"
+ "ld1 { v10.h }[2], [x23]\n"
+ "ld1 { v11.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x21]\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x24, #0x0]\n"
+ "ldr h10, [x23, #0x0]\n"
+ "ldr h11, [x22, #0x0]\n"
+ "ldr h12, [x21, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
+ "mov v28.16b, v25.16b\n fmla v28.8h, v4.8h, v9.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "add x20, x27, XZR\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v30.8h, v6.8h, v9.8h\n"
+ "fmla v28.8h, v7.8h, v13.8h\n"
+ "add x20, x27, x26\n"
+ "fmla v29.8h, v6.8h, v13.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x13, x15\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "add x20, x13, x11\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v9.8h\n"
+ "add x20, x28, x11\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: End
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x20, x9, XZR\n"
+ "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x9, x26\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "add x20, x28, XZR\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
+ "fmla v28.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v3.8h, v9.8h\n"
+ "add x20, x28, x26\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
+ "fmla v29.8h, v8.8h, v10.8h\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "add x20, x27, x15\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "fmla v30.8h, v7.8h, v11.8h\n"
+ "fmla v31.8h, v6.8h, v11.8h\n"
+ "add x20, x27, x11\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v31.8h, v7.8h, v12.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v26.8h\n"
+ "fmin v29.8h, v29.8h, v26.8h\n"
+ "fmin v30.8h, v30.8h, v26.8h\n"
+ "fmin v31.8h, v31.8h, v26.8h\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.d }[0], [x21], x14\n"
+ "st1 { v30.d }[0], [x20], x14\n"
+ "add x12, x12, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[2], [x21], x14\n"
+ "st1 { v30.s }[2], [x20], x14\n"
+ "add x12, x12, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[6], [x21], x14\n"
+ "st1 { v30.h }[6], [x20], x14\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[4], [x21], x14\n"
+ "st1 { v30.h }[4], [x20], x14\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[0], [x21], x14\n"
+ "st1 { v30.s }[0], [x20], x14\n"
+ "add x12, x12, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[2], [x21], x14\n"
+ "st1 { v30.h }[2], [x20], x14\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.h }[0], [x21], x14\n"
+ "st1 { v30.h }[0], [x20], x14\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "56:" // Tile loop: Oddments: Store: Bit 2: End
+ "57:" // Tile loop: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x22, x22, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x22, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x22, x22, XZR, LT\n"
+ "cmp x23, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..c9a554e9ad
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,697 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x16, #0x10\n" // cntb _, ALL, #1
+ "lsr x15, %x[n_channels], #0x3\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "mov x28, #0x0\n"
+ "sub x27, XZR, x16\n"
+ "cbz x15, 3f\n"
+ "ldr q25, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x16, x15, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr q25, [x14, #0x0]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "ldr x22, [x13, #0x58]\n"
+ "ldr x21, [x13, #0x60]\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x22, x28]\n"
+ "ldr x26, [x13, #0x70]\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr x25, [x13, #0x78]\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q19, [x21, x28]\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x24, x23, [x13, #0x0]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "ldp x22, x21, [x13, #0x10]\n"
+ "ldr x20, [x13, #0x20]\n"
+ "ldr q13, [x20, x16]\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x26, x28]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x25, x28]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "ldr q11, [x22, x16]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "ldr q9, [x24, x16]\n"
+ "ldr q10, [x23, x16]\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q12, [x21, x16]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "add x16, x16, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "cmp x16, x15, LSL #4\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "add x28, x28, #0x10\n"
+ "str q24, [x12, x27]\n"
+ "add x14, x14, #0xa0\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v24.16b, v25.16b\n fmla v24.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v25.16b\n fmla v23.8h, v3.8h, v9.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x20, [x13, #0x30]\n"
+ "mov v22.16b, v25.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v25.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "ldr q20, [x20, x28]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x21, [x13, #0x50]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v21.8h, v3.8h, v13.8h\n"
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "ldr x23, [x13, #0x60]\n"
+ "ldr x22, [x13, #0x68]\n"
+ "fmla v22.8h, v4.8h, v13.8h\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "ldr q17, [x21, x28]\n"
+ "ldr x21, [x13, #0x70]\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v22.8h, v5.8h, v20.8h\n"
+ "fmla v21.8h, v4.8h, v20.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.8h, v2.8h, v18.8h\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q19, [x23, x28]\n"
+ "fmla v22.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v8.8h, v20.8h\n"
+ "fmla v23.8h, v7.8h, v20.8h\n"
+ "ldr q18, [x22, x28]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "fmla v24.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x21, x28]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v22.8h, v7.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "add x28, x28, #0x10\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v27.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v23.8h, v23.8h, v27.8h\n"
+ "fmax v22.8h, v22.8h, v27.8h\n"
+ "fmax v21.8h, v21.8h, v27.8h\n"
+ "fmin v24.8h, v24.8h, v26.8h\n"
+ "fmin v23.8h, v23.8h, v26.8h\n"
+ "str q24, [x12, x27]\n"
+ "fmin v22.8h, v22.8h, v26.8h\n"
+ "fmin v21.8h, v21.8h, v26.8h\n"
+ "str q23, [x11, x27]\n"
+ "str q22, [x10, x27]\n"
+ "str q21, [x9, x27]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 56f\n"
+ "ldr q25, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "ldr x24, [x13, #0x0]\n"
+ "ldr x23, [x13, #0x8]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ "ldr x22, [x13, #0x10]\n"
+ "ldr x21, [x13, #0x18]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ "ldr x20, [x13, #0x20]\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[6], [x24], #0x2\n"
+ "ld1 { v10.h }[6], [x23], #0x2\n"
+ "ld1 { v11.h }[6], [x22], #0x2\n"
+ "ld1 { v12.h }[6], [x21], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[4], [x24], #0x2\n"
+ "ld1 { v10.h }[4], [x23], #0x2\n"
+ "ld1 { v11.h }[4], [x22], #0x2\n"
+ "ld1 { v12.h }[4], [x21], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v12.h }[2], [x21], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v12.h }[0], [x21], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 2: End
+ "mov v28.16b, v25.16b\n fmla v28.8h, v4.8h, v9.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v3.8h, v9.8h\n"
+ "ldr x20, [x13, #0x28]\n"
+ "add x20, x20, x28\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (3, 0): Bit 2: End
+ "fmla v30.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x13, #0x30]\n"
+ "fmla v28.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x28\n"
+ "fmla v29.8h, v6.8h, v13.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr x20, [x13, #0x38]\n"
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (0, 1): Bit 2: End
+ "ldr x20, [x13, #0x40]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (0, 2): Bit 2: End
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v9.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (2, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (2, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (2, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (2, 2): Bit 2: End
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x28\n"
+ "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (1, 0): Bit 2: End
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x13, #0x60]\n"
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (2, 0): Bit 2: End
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v28.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v3.8h, v9.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (2, 3): Bit 2: End
+ "ldr x20, [x13, #0x70]\n"
+ "fmla v29.8h, v8.8h, v10.8h\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v30.8h, v7.8h, v11.8h\n"
+ "fmla v31.8h, v6.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (3, 2): Bit 2: End
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v31.8h, v7.8h, v12.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v26.8h\n"
+ "fmin v29.8h, v29.8h, v26.8h\n"
+ "fmin v30.8h, v30.8h, v26.8h\n"
+ "fmin v31.8h, v31.8h, v26.8h\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "st1 { v28.h }[6], [x12], #0x2\n"
+ "st1 { v29.h }[6], [x11], #0x2\n"
+ "st1 { v30.h }[6], [x10], #0x2\n"
+ "st1 { v31.h }[6], [x9], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "st1 { v28.h }[4], [x12], #0x2\n"
+ "st1 { v29.h }[4], [x11], #0x2\n"
+ "st1 { v30.h }[4], [x10], #0x2\n"
+ "st1 { v31.h }[4], [x9], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "st1 { v28.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x11], #0x2\n"
+ "st1 { v30.h }[2], [x10], #0x2\n"
+ "st1 { v31.h }[2], [x9], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "st1 { v28.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x11], #0x2\n"
+ "st1 { v30.h }[0], [x10], #0x2\n"
+ "st1 { v31.h }[0], [x9], #0x2\n"
+ "55:" // Oddments: Store: Bit 2: End
+ "56:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6bbd3508cb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>  // strategy descriptor: all four strategy types (per Parent template) are __fp16
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;  // variant driven by an array of per-point input pointers (see declaration above)
+  Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;  // variant addressing the input via ld_input_row/ld_input_col strides over a tile grid
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::None;  // fixed-width NEON code path; no scalable-vector (SVE/SME) requirement
+
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both spatial dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 3;  // each kernel invocation produces a 3x3 output tile
+  constexpr static unsigned int output_cols = 3;
+
+  a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)  // CPUInfo intentionally unused: kernel selection is fixed for this strategy
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..4e64a2bf2b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1158 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x24, #0x0\n"
+ "mov x23, #0x0\n"
+ "1:" // Tile loop
+ "str x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x3\n"
+ "mov x26, #0x3\n"
+ "str x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x24, x25\n" // offset = tile_i * ld_input_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x24, x22\n" // offset = tile_i * ld_output_row
+ "mov x24, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x23, x8, x21\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x8, x8, #0x1\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x23, x17, x20\n" // offset += tile_j * ld_output_col
+ "lsl x17, x17, #0x1\n"
+ "lsr x23, %x[n_channels], #0x3\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x16, x16, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x13, x16, x25, LSL #1\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x12, x13, x25, LSL #1\n"
+ "add x11, x8, x8\n"
+ "add x15, x15, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x10, x12, x25, LSL #1\n"
+ "add x9, x11, x8\n"
+ "add x28, x15, x22, LSL #1\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x27, x10, x25, LSL #1\n"
+ "add x26, x9, x8\n"
+ "add x25, x28, x22, LSL #1\n"
+ "add x22, x17, x17\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x24\n"
+ "cbz x23, 4f\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x24, x23, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldr q9, [x12, x11]\n"
+ "ld1 { v10.8h }, [x16]\n"
+ "ldr q11, [x16, x26]\n"
+ "ld1 { v12.8h }, [x27]\n"
+ "ldr q13, [x13, x11]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "add x24, x24, #0x10\n"
+ "cmp x24, x23, LSL #4\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v1.8h, v9.8h\n"
+ "ldr q31, [x14, #0x0]\n"
+ "fmla v29.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "fmla v20.8h, v0.8h, v18.8h\n"
+ "fmla v26.8h, v4.8h, v18.8h\n"
+ "fmla v25.8h, v3.8h, v18.8h\n"
+ "fmla v22.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x13]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ld1 { v18.8h }, [x10]\n"
+ "fmla v24.8h, v4.8h, v23.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v29.8h, v8.8h, v23.8h\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v25.8h, v5.8h, v23.8h\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.8h, v0.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v18.8h\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v4.8h, v17.8h\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v19.8h\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "fmla v24.8h, v6.8h, v17.8h\n"
+ "fmla v21.8h, v5.8h, v19.8h\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v8.8h, v17.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.8h, v8.8h, v17.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
+ "add x16, x16, #0x10\n"
+ "ld1 { v10.8h }, [x16]\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "ldr q4, [x14, #0x50]\n"
+ "fmla v26.8h, v7.8h, v17.8h\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.8h, v7.8h, v19.8h\n"
+ "add x12, x12, #0x10\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v20.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v0.8h, v18.8h\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v21.8h, v2.8h, v17.8h\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v25.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v28.8h, v6.8h, v18.8h\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "ldr q3, [x14, #0x40]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "ldr q11, [x16, x26]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "ldr q7, [x14, #0x80]\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "ldr q13, [x13, x11]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "add x27, x27, #0x10\n"
+ "ld1 { v12.8h }, [x27]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "add x14, x14, #0xa0\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "st1 { v28.8h }, [x15]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q27, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "st1 { v26.8h }, [x28]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.8h, v7.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.8h, v7.8h, v18.8h\n"
+ "fmla v20.8h, v0.8h, v18.8h\n"
+ "fmla v26.8h, v4.8h, v18.8h\n"
+ "fmla v25.8h, v3.8h, v18.8h\n"
+ "fmla v22.8h, v1.8h, v18.8h\n"
+ "ld1 { v19.8h }, [x13]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ld1 { v18.8h }, [x10]\n"
+ "fmla v24.8h, v4.8h, v23.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v29.8h, v8.8h, v23.8h\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v25.8h, v5.8h, v23.8h\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.8h, v0.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v18.8h\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v4.8h, v17.8h\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v19.8h\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.8h, v6.8h, v18.8h\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "fmla v24.8h, v6.8h, v17.8h\n"
+ "fmla v21.8h, v5.8h, v19.8h\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v8.8h, v17.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.8h, v8.8h, v17.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v26.8h, v7.8h, v17.8h\n"
+ "fmla v25.8h, v6.8h, v17.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v16.8h\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.8h, v7.8h, v19.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmla v20.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v0.8h, v18.8h\n"
+ "add x12, x12, #0x10\n"
+ "fmla v21.8h, v2.8h, v17.8h\n"
+ "fmla v25.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x27, x11]\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmla v28.8h, v6.8h, v18.8h\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "add x27, x27, #0x10\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "st1 { v28.8h }, [x15]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q27, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "st1 { v26.8h }, [x28]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v22.8h }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 93f\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "add x24, x12, x11\n"
+ "add x23, x16, XZR\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x22, x16, x26\n"
+ "add x21, x27, XZR\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x20, x13, x11\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[6], [x24]\n"
+ "ld1 { v10.h }[6], [x23]\n"
+ "ld1 { v11.h }[6], [x22]\n"
+ "ld1 { v12.h }[6], [x21]\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[4], [x24]\n"
+ "ld1 { v10.h }[4], [x23]\n"
+ "ld1 { v11.h }[4], [x22]\n"
+ "ld1 { v12.h }[4], [x21]\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x24], #0x4\n"
+ "ldr s10, [x23], #0x4\n"
+ "ldr s11, [x22], #0x4\n"
+ "ldr s12, [x21], #0x4\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x24]\n"
+ "ld1 { v10.h }[2], [x23]\n"
+ "ld1 { v11.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x21]\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x24, #0x0]\n"
+ "ldr h10, [x23, #0x0]\n"
+ "ldr h11, [x22, #0x0]\n"
+ "ldr h12, [x21, #0x0]\n"
+ "ldr h13, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
+ "mov v23.16b, v31.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v6.8h, v9.8h\n"
+ "add x20, x27, x26\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v7.8h, v9.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "fmla v23.8h, v5.8h, v13.8h\n"
+ "fmla v24.8h, v4.8h, v13.8h\n"
+ "fmla v25.8h, v3.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
+ "fmla v31.8h, v8.8h, v12.8h\n"
+ "add x20, x12, x8\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "add x20, x16, x8\n"
+ "fmla v26.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
+ "fmla v23.8h, v1.8h, v13.8h\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "add x20, x16, x9\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: End
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "add x20, x12, x9\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
+ "fmla v24.8h, v8.8h, v10.8h\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "add x20, x13, XZR\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v10.8h\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
+ "fmla v23.8h, v3.8h, v11.8h\n"
+ "fmla v26.8h, v0.8h, v11.8h\n"
+ "add x20, x13, x26\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
+ "fmla v25.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "add x20, x10, XZR\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v26.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "add x20, x10, x11\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "add x20, x10, x26\n"
+ "fmla v28.8h, v6.8h, v10.8h\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v3.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "add x20, x27, x8\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "add x20, x13, x8\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: End
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "add x20, x13, x9\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 58f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 60f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 60f\n"
+ "58:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v24.8h, v5.8h, v11.8h\n"
+ "fmla v25.8h, v4.8h, v11.8h\n"
+ "add x20, x27, x9\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 62f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 64f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 64f\n"
+ "62:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
+ "fmla v30.8h, v8.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "add x20, x10, x8\n"
+ "tbz %x[n_channels], #2, 66f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 68f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 68f\n"
+ "66:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "fmla v26.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "add x20, x16, x11\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 70f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 72f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 72f\n"
+ "70:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 71f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "72:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "add x20, x10, x9\n"
+ "fmla v25.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 74f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 73f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 76f\n"
+ "73:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 76f\n"
+ "74:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 75f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 76f\n"
+ "75:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "76:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "fmla v28.8h, v7.8h, v13.8h\n"
+ "add x20, x12, XZR\n"
+ "fmla v30.8h, v5.8h, v13.8h\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 78f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 77f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 80f\n"
+ "77:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 80f\n"
+ "78:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 79f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 80f\n"
+ "79:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "80:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "fmla v26.8h, v3.8h, v12.8h\n"
+ "add x20, x12, x26\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 82f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 81f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 84f\n"
+ "81:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 84f\n"
+ "82:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 83f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 84f\n"
+ "83:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "84:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "add x20, x27, x11\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 86f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 85f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 87f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "88:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
+ "fmla v29.8h, v8.8h, v13.8h\n"
+ "fmla v30.8h, v7.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmla v31.8h, v6.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "tbz %x[n_channels], #2, 90f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.d }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.d }[0], [x21], x17\n"
+ "add x15, x15, #0x8\n"
+ "st1 { v29.d }[0], [x20], x17\n"
+ "add x28, x28, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v24.d }[0], [x22], x17\n"
+ "st1 { v27.d }[0], [x21], x17\n"
+ "st1 { v30.d }[0], [x20], x17\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "st1 { v28.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 89f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[2], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[2], [x21], x17\n"
+ "add x15, x15, #0x4\n"
+ "st1 { v29.s }[2], [x20], x17\n"
+ "add x28, x28, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v24.s }[2], [x22], x17\n"
+ "st1 { v27.s }[2], [x21], x17\n"
+ "st1 { v30.s }[2], [x20], x17\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[6], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[6], [x21], x17\n"
+ "st1 { v29.h }[6], [x20], x17\n"
+ "st1 { v24.h }[6], [x22], x17\n"
+ "st1 { v27.h }[6], [x21], x17\n"
+ "st1 { v30.h }[6], [x20], x17\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "st1 { v28.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 92f\n"
+ "89:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 92f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[4], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[4], [x21], x17\n"
+ "st1 { v29.h }[4], [x20], x17\n"
+ "st1 { v24.h }[4], [x22], x17\n"
+ "st1 { v27.h }[4], [x21], x17\n"
+ "st1 { v30.h }[4], [x20], x17\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "st1 { v28.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 92f\n"
+ "90:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 91f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[0], [x21], x17\n"
+ "add x15, x15, #0x4\n"
+ "st1 { v29.s }[0], [x20], x17\n"
+ "add x28, x28, #0x4\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v24.s }[0], [x22], x17\n"
+ "st1 { v27.s }[0], [x21], x17\n"
+ "st1 { v30.s }[0], [x20], x17\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "st1 { v28.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[2], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[2], [x21], x17\n"
+ "st1 { v29.h }[2], [x20], x17\n"
+ "st1 { v24.h }[2], [x22], x17\n"
+ "st1 { v27.h }[2], [x21], x17\n"
+ "st1 { v30.h }[2], [x20], x17\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "st1 { v28.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 92f\n"
+ "91:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.h }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.h }[0], [x21], x17\n"
+ "st1 { v29.h }[0], [x20], x17\n"
+ "st1 { v24.h }[0], [x22], x17\n"
+ "st1 { v27.h }[0], [x21], x17\n"
+ "st1 { v30.h }[0], [x20], x17\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "st1 { v28.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "92:" // Tile loop: Oddments: Store: Bit 2: End
+ "93:" // Tile loop: End
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x23, x23, #0x1\n"
+ "add x21, x24, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x23, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x24, x24, x21, LT\n"
+ "csel x23, x23, XZR, LT\n"
+ "cmp x24, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..72e68482c6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1291 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "mov x7, #0x10\n" // cntb _, ALL, #1
+ "lsr x8, %x[n_channels], #0x3\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x14, #0x0\n"
+ "sub x13, XZR, x7\n"
+ "cbz x8, 3f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x7, x8, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr q10, [x20, x14]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x15, #0x30]\n"
+ "ldr x23, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x26, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v6.8h, v17.8h\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "ldr q18, [x23, x14]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
+ "ldr q31, [x16, #0x0]\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "fmla v28.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla v26.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr x21, [x15, #0x80]\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.8h, v1.8h, v18.8h\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v26.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "fmla v27.8h, v7.8h, v19.8h\n"
+ "fmla v22.8h, v1.8h, v19.8h\n"
+ "ldr q19, [x23, x14]\n"
+ "fmla v23.8h, v3.8h, v16.8h\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v25.8h, v7.8h, v19.8h\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v29.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.8h, v5.8h, v20.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v23.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla v26.8h, v8.8h, v19.8h\n"
+ "fmla v24.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v28.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v28.8h, v5.8h, v17.8h\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v2.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "ldr x20, [x15, #0x20]\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x24, x14]\n"
+ "fmla v29.8h, v2.8h, v17.8h\n"
+ "fmla v26.8h, v7.8h, v18.8h\n"
+ "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v23.8h, v4.8h, v18.8h\n"
+ "fmla v21.8h, v3.8h, v18.8h\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v22.8h, v4.8h, v16.8h\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v27.8h, v0.8h, v17.8h\n"
+ "ldr q17, [x21, x14]\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.8h, v0.8h, v18.8h\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v22.8h, v2.8h, v17.8h\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v22.8h, v6.8h, v16.8h\n"
+ "ldr q13, [x20, x7]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "ldr x24, [x17, #0x20]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x7]\n"
+ "ldr q10, [x20, x7]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x7]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "ldr q12, [x20, x7]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q28, [x9, x13]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "str q27, [x28, x13]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "str q26, [x27, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x7, x7, #0x10\n"
+ "str q25, [x24, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "cmp x7, x8, LSL #4\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "add x14, x14, #0x10\n"
+ "str q24, [x23, x13]\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "str q23, [x22, x13]\n"
+ "add x16, x16, #0xa0\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v7.8h, v9.8h\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v4.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q17, [x23, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v6.8h, v17.8h\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "fmla v27.8h, v3.8h, v13.8h\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v25.8h, v1.8h, v13.8h\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v7.8h, v17.8h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v28.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v26.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v3.8h, v17.8h\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v4.8h, v19.8h\n"
+ "fmla v23.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.8h, v1.8h, v18.8h\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v27.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla v26.8h, v0.8h, v17.8h\n"
+ "fmla v24.8h, v2.8h, v20.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "fmla v27.8h, v7.8h, v19.8h\n"
+ "fmla v22.8h, v1.8h, v19.8h\n"
+ "ldr q19, [x22, x14]\n"
+ "fmla v23.8h, v3.8h, v16.8h\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v25.8h, v7.8h, v19.8h\n"
+ "ldr x22, [x15, #0xc0]\n"
+ "fmla v24.8h, v6.8h, v19.8h\n"
+ "fmla v21.8h, v4.8h, v19.8h\n"
+ "fmla v29.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.8h, v5.8h, v20.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v23.8h, v5.8h, v19.8h\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v26.8h, v8.8h, v19.8h\n"
+ "fmla v24.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v28.8h, v3.8h, v18.8h\n"
+ "fmla v25.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v5.8h, v17.8h\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.8h, v4.8h, v18.8h\n"
+ "fmla v26.8h, v1.8h, v18.8h\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v28.8h, v5.8h, v17.8h\n"
+ "fmla v27.8h, v4.8h, v17.8h\n"
+ "fmla v25.8h, v2.8h, v17.8h\n"
+ "fmla v24.8h, v1.8h, v17.8h\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "fmla v22.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v29.8h, v2.8h, v17.8h\n"
+ "fmla v26.8h, v7.8h, v18.8h\n"
+ "fmla v25.8h, v6.8h, v18.8h\n"
+ "fmla v23.8h, v4.8h, v18.8h\n"
+ "fmla v21.8h, v3.8h, v18.8h\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v22.8h, v4.8h, v16.8h\n"
+ "fmla v28.8h, v1.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmla v27.8h, v0.8h, v17.8h\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v29.8h, v6.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmla v24.8h, v7.8h, v16.8h\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.8h, v0.8h, v18.8h\n"
+ "fmla v22.8h, v2.8h, v17.8h\n"
+ "ldr x20, [x17, #0x20]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v26.8h, v3.8h, v18.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmla v27.8h, v8.8h, v17.8h\n"
+ "fmla v24.8h, v5.8h, v17.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "str q28, [x9, x13]\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "fmla v21.8h, v7.8h, v16.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "ldr x23, [x17, #0x28]\n"
+ "fmla v22.8h, v6.8h, v16.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "str q27, [x28, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q26, [x27, x13]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q25, [x20, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "str q24, [x23, x13]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "str q23, [x22, x13]\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 92f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x13, x14\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "ldr x24, [x15, #0x0]\n"
+ "ldr x23, [x15, #0x8]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "ldr x22, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x18]\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "ldr x20, [x15, #0x20]\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[6], [x24], #0x2\n"
+ "ld1 { v10.h }[6], [x23], #0x2\n"
+ "ld1 { v11.h }[6], [x22], #0x2\n"
+ "ld1 { v12.h }[6], [x21], #0x2\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[4], [x24], #0x2\n"
+ "ld1 { v10.h }[4], [x23], #0x2\n"
+ "ld1 { v11.h }[4], [x22], #0x2\n"
+ "ld1 { v12.h }[4], [x21], #0x2\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v12.h }[2], [x21], #0x2\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v12.h }[0], [x21], #0x2\n"
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 2: End
+ "mov v23.16b, v31.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v25.16b, v31.16b\n fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "add x20, x20, x14\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v7.8h, v9.8h\n"
+ "mov v26.16b, v31.16b\n fmla v26.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v31.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "fmla v23.8h, v5.8h, v13.8h\n"
+ "fmla v24.8h, v4.8h, v13.8h\n"
+ "fmla v25.8h, v3.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (4, 4): Bit 2: End
+ "ldr x20, [x15, #0x30]\n"
+ "fmla v31.8h, v8.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (2, 1): Bit 2: End
+ "ldr x20, [x15, #0x38]\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v26.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (0, 1): Bit 2: End
+ "ldr x20, [x15, #0x40]\n"
+ "fmla v23.8h, v1.8h, v13.8h\n"
+ "fmla v24.8h, v0.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (0, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (0, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (0, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (0, 3): Bit 2: End
+ "ldr x20, [x15, #0x48]\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (2, 3): Bit 2: End
+ "ldr x20, [x15, #0x50]\n"
+ "fmla v24.8h, v8.8h, v10.8h\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v10.8h\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (1, 0): Bit 2: End
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v23.8h, v3.8h, v11.8h\n"
+ "fmla v26.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (1, 4): Bit 2: End
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v25.8h, v5.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (3, 0): Bit 2: End
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v26.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (3, 2): Bit 2: End
+ "ldr x20, [x15, #0x70]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v28.8h, v6.8h, v10.8h\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v3.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (3, 4): Bit 2: End
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (4, 1): Bit 2: End
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Load input (1, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Load input (1, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (1, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "55:" // Oddments: Load input (1, 1): Bit 2: End
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 57f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 59f\n"
+ "56:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 59f\n"
+ "57:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "59:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x15, #0x90]\n"
+ "fmla v24.8h, v5.8h, v11.8h\n"
+ "fmla v25.8h, v4.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 61f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 63f\n"
+ "60:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 63f\n"
+ "61:" // Oddments: Load input (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "63:" // Oddments: Load input (4, 3): Bit 2: End
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v30.8h, v8.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #2, 65f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 67f\n"
+ "64:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 67f\n"
+ "65:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "67:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla v26.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 69f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 71f\n"
+ "68:" // Oddments: Load input (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 71f\n"
+ "69:" // Oddments: Load input (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 70f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 71f\n"
+ "70:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "71:" // Oddments: Load input (0, 2): Bit 2: End
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v25.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 73f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 72f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 75f\n"
+ "72:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 75f\n"
+ "73:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 74f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 75f\n"
+ "74:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "75:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "fmla v28.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v30.8h, v5.8h, v13.8h\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 77f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 76f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 79f\n"
+ "76:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 79f\n"
+ "77:" // Oddments: Load input (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 78f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 79f\n"
+ "78:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "79:" // Oddments: Load input (2, 0): Bit 2: End
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "fmla v26.8h, v3.8h, v12.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 81f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 80f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 83f\n"
+ "80:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 83f\n"
+ "81:" // Oddments: Load input (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 82f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 83f\n"
+ "82:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "83:" // Oddments: Load input (2, 4): Bit 2: End
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 85f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 84f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 87f\n"
+ "84:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 87f\n"
+ "85:" // Oddments: Load input (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 86f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 87f\n"
+ "86:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "87:" // Oddments: Load input (4, 2): Bit 2: End
+ "fmla v29.8h, v8.8h, v13.8h\n"
+ "fmla v30.8h, v7.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmla v31.8h, v6.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "tbz %x[n_channels], #2, 89f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 88f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x4\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[6], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[6], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 91f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[4], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[4], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 90f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x4\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.h }[0], [x23]\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.h }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "91:" // Oddments: Store: Bit 2: End
+ "92:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..04fb532937
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor for the AArch64 FP16 NHWC depthwise kernel:
+// 3x3 filter, stride 1, 4x4 output tile, computed with vector FMLA
+// ("mla") instructions. The class carries only static geometry and the
+// two hand-written assembly entry points; all scheduling/tiling logic
+// lives in the generic DepthwiseDepthfirstStrategy parent.
+class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Kernel taking an array of per-point input pointers (padding handled
+  // via pointer substitution by the caller).
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  // Kernel walking a dense tile grid directly from base pointer + strides.
+  Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  // Plain NEON (fixed 128-bit vectors); not an SVE/variable-length kernel.
+  constexpr static auto vl_type = arm_gemm::VLType::None;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
+  // CPUInfo is unused: this strategy has fixed geometry and the kernel
+  // choice is made by the surrounding implementation list.
+  a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..a1e1dd0e99
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1736 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x23, #0x4\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x27, x22\n" // offset = tile_i * ld_output_row
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x26, x4, x21\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x4, x4, #0x1\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x26, x5, x20\n" // offset += tile_j * ld_output_col
+ "lsl x5, x5, #0x1\n"
+ "add x17, x4, x4\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x7, x7, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x15, x7, x24, LSL #1\n"
+ "mul x20, x20, x23\n" // offset *= output_tile_size
+ "add x14, x15, x24, LSL #1\n"
+ "add x8, x8, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "lsr x13, %x[n_channels], #0x3\n"
+ "add x12, x14, x24, LSL #1\n"
+ "add x11, x17, x4\n"
+ "add x10, x8, x22, LSL #1\n"
+ "add x9, x12, x24, LSL #1\n"
+ "add x28, x11, x4\n"
+ "add x27, x10, x22, LSL #1\n"
+ "add x23, x5, x5\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x26, x9, x24, LSL #1\n"
+ "add x25, x28, x4\n"
+ "add x24, x27, x22, LSL #1\n"
+ "add x22, x23, x5\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x6\n"
+ "cbz x13, 4f\n"
+ "ldr q14, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x6, x13, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldr q9, [x14, x17]\n"
+ "ld1 { v10.8h }, [x7]\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q12, [x14, x11]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v26.16b, v14.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v28.16b, v14.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x13, LSL #4\n"
+ "mov v16.16b, v14.16b\n fmla v16.8h, v3.8h, v9.8h\n"
+ "mov v22.16b, v14.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v7.8h, v9.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v6.8h, v9.8h\n"
+ "mov v31.16b, v14.16b\n fmla v31.8h, v5.8h, v9.8h\n"
+ "mov v20.16b, v14.16b\n fmla v20.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x12, x17]\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ld1 { v30.8h }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q27, [x26, x25]\n"
+ "fmla v16.8h, v4.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v23.8h, v1.8h, v12.8h\n"
+ "mov v21.16b, v14.16b\n fmla v21.8h, v6.8h, v30.8h\n"
+ "ldr q10, [x12, x11]\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v8.8h, v12.8h\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "mov v24.16b, v14.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x7, x4]\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v8.8h, v27.8h\n"
+ "ldr q12, [x7, x28]\n"
+ "fmla v16.8h, v6.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v23.8h, v3.8h, v9.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "ldr q14, [x16, #0x0]\n"
+ "fmla v31.8h, v8.8h, v9.8h\n"
+ "fmla v20.8h, v5.8h, v9.8h\n"
+ "fmla v21.8h, v2.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v25.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x15, x25]\n"
+ "fmla v17.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x9]\n"
+ "fmla v16.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v23.8h, v4.8h, v10.8h\n"
+ "fmla v19.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v18.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x15, x17]\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v20.8h, v6.8h, v12.8h\n"
+ "fmla v21.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x9, x25]\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x15, x11]\n"
+ "fmla v25.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v3.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "fmla v19.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q9, [x26, x4]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "ldr q10, [x14, x4]\n"
+ "fmla v25.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x14, x28]\n"
+ "fmla v21.8h, v7.8h, v9.8h\n"
+ "fmla v27.8h, v6.8h, v9.8h\n"
+ "ldr q12, [x26, x28]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v7.8h, v10.8h\n"
+ "fmla v25.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x7, x17]\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr q9, [x12, x4]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v4.8h, v11.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "ldr q12, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v31.8h, v7.8h, v9.8h\n"
+ "fmla v26.8h, v6.8h, v9.8h\n"
+ "fmla v20.8h, v4.8h, v9.8h\n"
+ "fmla v22.8h, v3.8h, v9.8h\n"
+ "fmla v21.8h, v1.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x12, x28]\n"
+ "fmla v28.8h, v2.8h, v10.8h\n"
+ "fmla v25.8h, v1.8h, v10.8h\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x14]\n"
+ "fmla v18.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v12.8h\n"
+ "fmla v31.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v0.8h, v10.8h\n"
+ "fmla v16.8h, v8.8h, v9.8h\n"
+ "fmla v24.8h, v7.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v19.8h, v4.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "ldr q11, [x9, x17]\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "fmla v17.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x17]\n"
+ "fmla v28.8h, v6.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x12]\n"
+ "fmla v27.8h, v4.8h, v11.8h\n"
+ "fmla v18.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v21.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x26, x17]\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmla v18.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x9, x11]\n"
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x26, x11]\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x15, x4]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "add x26, x26, #0x10\n"
+ "fmla v19.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
+ "fmla v27.8h, v8.8h, v12.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmla v18.8h, v7.8h, v12.8h\n"
+ "fmla v30.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x9, x4]\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v25.8h, v3.8h, v10.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x28]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v17.8h, v5.8h, v11.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "add x9, x9, #0x10\n"
+ "fmla v16.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v20.8h, v7.8h, v12.8h\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmla v21.8h, v4.8h, v12.8h\n"
+ "fmla v27.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x14, x11]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v23.8h, v8.8h, v10.8h\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v19.8h, v7.8h, v10.8h\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v18.8h, v5.8h, v10.8h\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x7]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "add x16, x16, #0xa0\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "st1 { v28.8h }, [x8]\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q25, [x8, x5]\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "str q17, [x8, x23]\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "fmin v24.8h, v24.8h, v15.8h\n"
+ "str q29, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v20.8h, v20.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v15.8h\n"
+ "st1 { v31.8h }, [x10]\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "str q26, [x10, x5]\n"
+ "fmin v21.8h, v21.8h, v15.8h\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "str q16, [x10, x23]\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "str q24, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v20.8h }, [x27]\n"
+ "str q22, [x27, x5]\n"
+ "str q23, [x27, x23]\n"
+ "str q19, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v21.8h }, [x24]\n"
+ "str q27, [x24, x5]\n"
+ "str q18, [x24, x23]\n"
+ "str q30, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v16.16b, v14.16b\n fmla v16.8h, v4.8h, v9.8h\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v8.8h, v9.8h\n"
+ "mov v31.16b, v14.16b\n fmla v31.8h, v3.8h, v9.8h\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "fmla v16.8h, v5.8h, v12.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v14.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v2.8h, v9.8h\n"
+ "ldr q24, [x12, x17]\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ld1 { v21.8h }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q20, [x26, x25]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "mov v26.16b, v14.16b\n fmla v26.8h, v6.8h, v21.8h\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v16.8h, v7.8h, v24.8h\n"
+ "fmla v17.8h, v8.8h, v12.8h\n"
+ "fmla v19.8h, v7.8h, v12.8h\n"
+ "fmla v29.8h, v6.8h, v12.8h\n"
+ "mov v11.16b, v14.16b\n fmla v11.8h, v3.8h, v12.8h\n"
+ "mov v10.16b, v14.16b\n fmla v10.8h, v0.8h, v12.8h\n"
+ "ldr q22, [x7, x4]\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v8.8h, v20.8h\n"
+ "ldr q21, [x7, x28]\n"
+ "fmla v31.8h, v6.8h, v24.8h\n"
+ "fmla v30.8h, v4.8h, v24.8h\n"
+ "fmla v18.8h, v3.8h, v24.8h\n"
+ "mov v12.16b, v14.16b\n fmla v12.8h, v1.8h, v24.8h\n"
+ "fmla v14.8h, v0.8h, v24.8h\n"
+ "fmla v28.8h, v8.8h, v24.8h\n"
+ "fmla v27.8h, v5.8h, v24.8h\n"
+ "fmla v26.8h, v2.8h, v24.8h\n"
+ "ld1 { v24.8h }, [x15]\n"
+ "fmla v16.8h, v8.8h, v9.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v17.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x15, x25]\n"
+ "fmla v19.8h, v2.8h, v21.8h\n"
+ "fmla v29.8h, v1.8h, v21.8h\n"
+ "ld1 { v20.8h }, [x9]\n"
+ "fmla v31.8h, v7.8h, v9.8h\n"
+ "fmla v11.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v5.8h, v9.8h\n"
+ "fmla v18.8h, v4.8h, v9.8h\n"
+ "fmla v10.8h, v3.8h, v9.8h\n"
+ "fmla v12.8h, v2.8h, v9.8h\n"
+ "fmla v14.8h, v1.8h, v9.8h\n"
+ "fmla v25.8h, v0.8h, v9.8h\n"
+ "ldr q21, [x15, x17]\n"
+ "fmla v28.8h, v0.8h, v24.8h\n"
+ "fmla v27.8h, v6.8h, v20.8h\n"
+ "fmla v26.8h, v3.8h, v20.8h\n"
+ "ldr q20, [x9, x25]\n"
+ "fmla v16.8h, v1.8h, v21.8h\n"
+ "fmla v23.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v5.8h, v22.8h\n"
+ "fmla v11.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x15, x11]\n"
+ "fmla v17.8h, v4.8h, v21.8h\n"
+ "fmla v19.8h, v3.8h, v21.8h\n"
+ "fmla v31.8h, v0.8h, v21.8h\n"
+ "fmla v10.8h, v8.8h, v20.8h\n"
+ "fmla v25.8h, v5.8h, v20.8h\n"
+ "ldr q20, [x26, x4]\n"
+ "fmla v28.8h, v2.8h, v21.8h\n"
+ "fmla v16.8h, v2.8h, v22.8h\n"
+ "fmla v23.8h, v5.8h, v21.8h\n"
+ "ldr q21, [x14, x4]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v19.8h, v4.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v11.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x14, x28]\n"
+ "fmla v26.8h, v7.8h, v20.8h\n"
+ "fmla v12.8h, v6.8h, v20.8h\n"
+ "ldr q20, [x26, x28]\n"
+ "fmla v28.8h, v4.8h, v21.8h\n"
+ "fmla v16.8h, v3.8h, v21.8h\n"
+ "fmla v27.8h, v1.8h, v21.8h\n"
+ "fmla v30.8h, v0.8h, v21.8h\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "fmla v17.8h, v6.8h, v21.8h\n"
+ "ldr q21, [x7, x17]\n"
+ "fmla v14.8h, v8.8h, v20.8h\n"
+ "fmla v25.8h, v7.8h, v20.8h\n"
+ "ldr q20, [x12, x4]\n"
+ "fmla v19.8h, v8.8h, v22.8h\n"
+ "fmla v29.8h, v7.8h, v22.8h\n"
+ "fmla v31.8h, v5.8h, v22.8h\n"
+ "fmla v11.8h, v4.8h, v22.8h\n"
+ "fmla v18.8h, v2.8h, v22.8h\n"
+ "fmla v10.8h, v1.8h, v22.8h\n"
+ "ldr q22, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v28.8h, v7.8h, v20.8h\n"
+ "fmla v16.8h, v6.8h, v20.8h\n"
+ "fmla v27.8h, v4.8h, v20.8h\n"
+ "fmla v30.8h, v3.8h, v20.8h\n"
+ "fmla v26.8h, v1.8h, v20.8h\n"
+ "fmla v12.8h, v0.8h, v20.8h\n"
+ "ldr q20, [x12, x28]\n"
+ "fmla v23.8h, v2.8h, v21.8h\n"
+ "fmla v17.8h, v1.8h, v21.8h\n"
+ "fmla v19.8h, v0.8h, v21.8h\n"
+ "ld1 { v21.8h }, [x14]\n"
+ "fmla v14.8h, v2.8h, v20.8h\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "fmla v28.8h, v3.8h, v21.8h\n"
+ "fmla v27.8h, v0.8h, v21.8h\n"
+ "fmla v31.8h, v8.8h, v20.8h\n"
+ "fmla v11.8h, v7.8h, v20.8h\n"
+ "fmla v18.8h, v5.8h, v20.8h\n"
+ "fmla v10.8h, v4.8h, v20.8h\n"
+ "fmla v25.8h, v1.8h, v20.8h\n"
+ "ldr q24, [x9, x17]\n"
+ "fmla v17.8h, v2.8h, v22.8h\n"
+ "fmla v19.8h, v1.8h, v22.8h\n"
+ "ldr q20, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v23.8h, v6.8h, v21.8h\n"
+ "ld1 { v21.8h }, [x12]\n"
+ "fmla v12.8h, v4.8h, v24.8h\n"
+ "fmla v14.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v8.8h, v20.8h\n"
+ "fmla v11.8h, v5.8h, v20.8h\n"
+ "fmla v10.8h, v2.8h, v20.8h\n"
+ "ldr q20, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.8h, v6.8h, v21.8h\n"
+ "fmla v27.8h, v3.8h, v21.8h\n"
+ "fmla v26.8h, v0.8h, v21.8h\n"
+ "ldr q22, [x26, x17]\n"
+ "fmla v25.8h, v2.8h, v20.8h\n"
+ "fmla v12.8h, v7.8h, v22.8h\n"
+ "fmla v14.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v8.8h, v24.8h\n"
+ "fmla v30.8h, v7.8h, v24.8h\n"
+ "fmla v18.8h, v6.8h, v24.8h\n"
+ "fmla v26.8h, v5.8h, v24.8h\n"
+ "ldr q21, [x9, x11]\n"
+ "fmla v10.8h, v5.8h, v20.8h\n"
+ "fmla v12.8h, v5.8h, v21.8h\n"
+ "fmla v14.8h, v4.8h, v21.8h\n"
+ "fmla v25.8h, v3.8h, v21.8h\n"
+ "fmla v11.8h, v8.8h, v20.8h\n"
+ "ldr q20, [x26, x11]\n"
+ "fmla v26.8h, v8.8h, v22.8h\n"
+ "ldr q9, [x15, x4]\n"
+ "fmla v30.8h, v8.8h, v21.8h\n"
+ "fmla v18.8h, v7.8h, v21.8h\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.8h, v6.8h, v21.8h\n"
+ "ldr q21, [x15, x28]\n"
+ "fmla v12.8h, v8.8h, v20.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmla v14.8h, v7.8h, v20.8h\n"
+ "fmla v25.8h, v6.8h, v20.8h\n"
+ "ldr q24, [x9, x4]\n"
+ "fmla v23.8h, v4.8h, v9.8h\n"
+ "fmla v17.8h, v3.8h, v9.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "fmla v16.8h, v0.8h, v9.8h\n"
+ "ldr q0, [x9, x28]\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v19.8h, v5.8h, v21.8h\n"
+ "fmla v29.8h, v4.8h, v21.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.8h, v2.8h, v21.8h\n"
+ "fmla v11.8h, v1.8h, v21.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmla v27.8h, v7.8h, v24.8h\n"
+ "fmla v30.8h, v6.8h, v24.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v26.8h, v4.8h, v24.8h\n"
+ "fmla v12.8h, v3.8h, v24.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v18.8h, v8.8h, v0.8h\n"
+ "fmla v10.8h, v7.8h, v0.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmla v14.8h, v5.8h, v0.8h\n"
+ "fmla v25.8h, v4.8h, v0.8h\n"
+ "fmax v11.8h, v11.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v12.8h, v12.8h, v13.8h\n"
+ "fmax v14.8h, v14.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "st1 { v23.8h }, [x8]\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q17, [x8, x5]\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "str q19, [x8, x23]\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v11.8h, v11.8h, v15.8h\n"
+ "str q29, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "st1 { v28.8h }, [x10]\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v10.8h, v10.8h, v15.8h\n"
+ "str q16, [x10, x5]\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "fmin v12.8h, v12.8h, v15.8h\n"
+ "str q31, [x10, x23]\n"
+ "fmin v14.8h, v14.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "str q11, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v27.8h }, [x27]\n"
+ "str q30, [x27, x5]\n"
+ "str q18, [x27, x23]\n"
+ "str q10, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v26.8h }, [x24]\n"
+ "str q12, [x24, x5]\n"
+ "str q14, [x24, x23]\n"
+ "str q25, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 141f\n"
+ "ldr q14, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "add x23, x14, x17\n"
+ "add x22, x7, XZR\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x21, x7, x25\n"
+ "add x20, x14, x11\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d9, [x23], #0x8\n"
+ "ldr d10, [x22], #0x8\n"
+ "ldr d11, [x21], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[6], [x23]\n"
+ "ld1 { v10.h }[6], [x22]\n"
+ "ld1 { v11.h }[6], [x21]\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[4], [x23]\n"
+ "ld1 { v10.h }[4], [x22]\n"
+ "ld1 { v11.h }[4], [x21]\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x23], #0x4\n"
+ "ldr s10, [x22], #0x4\n"
+ "ldr s11, [x21], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x23]\n"
+ "ld1 { v10.h }[2], [x22]\n"
+ "ld1 { v11.h }[2], [x21]\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x23, #0x0]\n"
+ "ldr h10, [x22, #0x0]\n"
+ "ldr h11, [x21, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
+ "mov v16.16b, v14.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v17.16b, v14.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "add x20, x26, XZR\n"
+ "mov v18.16b, v14.16b\n fmla v18.8h, v6.8h, v9.8h\n"
+ "mov v21.16b, v14.16b\n fmla v21.8h, v4.8h, v9.8h\n"
+ "mov v22.16b, v14.16b\n fmla v22.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v14.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v26.16b, v14.16b\n fmla v26.8h, v0.8h, v9.8h\n"
+ "mov v19.16b, v14.16b\n fmla v19.8h, v2.8h, v11.8h\n"
+ "mov v20.16b, v14.16b\n fmla v20.8h, v5.8h, v9.8h\n"
+ "mov v24.16b, v14.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "fmla v17.8h, v8.8h, v12.8h\n"
+ "fmla v18.8h, v7.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
+ "fmla v21.8h, v5.8h, v12.8h\n"
+ "fmla v22.8h, v4.8h, v12.8h\n"
+ "mov v23.16b, v14.16b\n fmla v23.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
+ "mov v27.16b, v14.16b\n fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: End
+ "mov v28.16b, v14.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "add x20, x26, x25\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
+ "mov v31.16b, v14.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x12, x17\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "fmla v20.8h, v8.8h, v9.8h\n"
+ "fmla v21.8h, v7.8h, v9.8h\n"
+ "add x20, x7, x4\n"
+ "fmla v22.8h, v6.8h, v9.8h\n"
+ "fmla v24.8h, v5.8h, v9.8h\n"
+ "fmla v25.8h, v4.8h, v9.8h\n"
+ "fmla v26.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "mov v29.16b, v14.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v30.16b, v14.16b\n fmla v30.8h, v0.8h, v9.8h\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 2: End
+ "fmla v16.8h, v1.8h, v12.8h\n"
+ "fmla v17.8h, v0.8h, v12.8h\n"
+ "add x20, x7, x28\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 2: End
+ "fmla v18.8h, v2.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "add x20, x12, x11\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v22.8h, v7.8h, v10.8h\n"
+ "add x20, x15, XZR\n"
+ "fmla v23.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 2: End
+ "fmla v16.8h, v3.8h, v9.8h\n"
+ "fmla v20.8h, v0.8h, v9.8h\n"
+ "add x20, x15, x25\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 2: End
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v2.8h, v12.8h\n"
+ "add x20, x9, XZR\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x15, x17\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: End
+ "fmla v16.8h, v5.8h, v10.8h\n"
+ "fmla v17.8h, v4.8h, v10.8h\n"
+ "add x20, x9, x25\n"
+ "fmla v18.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v2.8h, v10.8h\n"
+ "fmla v21.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: End
+ "fmla v27.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "add x20, x15, x11\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v17.8h, v5.8h, v12.8h\n"
+ "fmla v18.8h, v4.8h, v12.8h\n"
+ "add x20, x26, x4\n"
+ "fmla v19.8h, v3.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v22.8h, v1.8h, v12.8h\n"
+ "fmla v23.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 58f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 60f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 60f\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: End
+ "fmla v28.8h, v7.8h, v11.8h\n"
+ "fmla v29.8h, v6.8h, v11.8h\n"
+ "add x20, x14, x4\n"
+ "tbz %x[n_channels], #2, 62f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 64f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 64f\n"
+ "62:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
+ "fmla v16.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "add x20, x26, x28\n"
+ "fmla v20.8h, v4.8h, v10.8h\n"
+ "fmla v21.8h, v3.8h, v10.8h\n"
+ "fmla v24.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 66f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 68f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 68f\n"
+ "66:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: End
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "add x20, x14, x28\n"
+ "tbz %x[n_channels], #2, 70f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 72f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 72f\n"
+ "70:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 71f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "72:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v19.8h, v7.8h, v12.8h\n"
+ "add x20, x7, x17\n"
+ "fmla v22.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 74f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 73f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 76f\n"
+ "73:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 76f\n"
+ "74:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 75f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 76f\n"
+ "75:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "76:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 2: End
+ "fmla v16.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "add x20, x12, x4\n"
+ "fmla v18.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 78f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 77f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 80f\n"
+ "77:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 80f\n"
+ "78:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 79f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 80f\n"
+ "79:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "80:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "fmla v20.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "add x20, x7, x11\n"
+ "fmla v24.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 82f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 81f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 84f\n"
+ "81:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 84f\n"
+ "82:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 83f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 84f\n"
+ "83:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "84:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 2: End
+ "fmla v17.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "add x20, x14, XZR\n"
+ "fmla v19.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 86f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 85f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 87f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "88:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
+ "fmla v16.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "add x20, x12, x28\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 90f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 89f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 92f\n"
+ "89:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 92f\n"
+ "90:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 91f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 92f\n"
+ "91:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "92:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "add x20, x14, x25\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "fmla v27.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 94f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 93f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 96f\n"
+ "93:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 96f\n"
+ "94:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 95f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 96f\n"
+ "95:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "96:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: End
+ "fmla v19.8h, v8.8h, v12.8h\n"
+ "fmla v23.8h, v5.8h, v12.8h\n"
+ "add x20, x12, XZR\n"
+ "fmla v27.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 98f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 97f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 100f\n"
+ "97:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 100f\n"
+ "98:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 99f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 100f\n"
+ "99:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "100:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v20.8h, v6.8h, v10.8h\n"
+ "fmla v24.8h, v3.8h, v10.8h\n"
+ "add x20, x9, x17\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 102f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 101f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 104f\n"
+ "101:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 104f\n"
+ "102:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 103f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 104f\n"
+ "103:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "104:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
+ "fmla v24.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v11.8h\n"
+ "add x20, x12, x25\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 106f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 105f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 108f\n"
+ "105:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 108f\n"
+ "106:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 107f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 108f\n"
+ "107:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "108:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: End
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v12.8h\n"
+ "add x20, x26, x17\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 110f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 109f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 112f\n"
+ "109:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 112f\n"
+ "110:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 111f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 112f\n"
+ "111:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "112:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: End
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x20, x9, x11\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 114f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 113f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 116f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 116f\n"
+ "113:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 116f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 116f\n"
+ "114:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 115f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 116f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 116f\n"
+ "115:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "116:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v11.8h\n"
+ "add x20, x26, x11\n"
+ "fmla v27.8h, v6.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 118f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 117f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 120f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 120f\n"
+ "117:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 120f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 120f\n"
+ "118:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 119f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 120f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 120f\n"
+ "119:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "120:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "add x20, x15, x4\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 122f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 121f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 124f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 124f\n"
+ "121:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 124f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 124f\n"
+ "122:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 123f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 124f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 124f\n"
+ "123:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "124:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 2: End
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v3.8h, v10.8h\n"
+ "add x20, x15, x28\n"
+ "fmla v20.8h, v1.8h, v10.8h\n"
+ "fmla v21.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 126f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 125f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 128f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 128f\n"
+ "125:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 128f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 128f\n"
+ "126:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 127f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 128f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 128f\n"
+ "127:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "128:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v19.8h, v4.8h, v11.8h\n"
+ "add x20, x9, x4\n"
+ "fmla v22.8h, v2.8h, v11.8h\n"
+ "fmla v23.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 130f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 129f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 132f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 132f\n"
+ "129:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 132f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 132f\n"
+ "130:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 131f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 132f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 132f\n"
+ "131:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "132:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
+ "fmla v24.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "add x20, x9, x28\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 134f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 133f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 136f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 136f\n"
+ "133:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 136f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 136f\n"
+ "134:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 135f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 136f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 136f\n"
+ "135:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "136:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmin v16.8h, v16.8h, v15.8h\n"
+ "fmin v17.8h, v17.8h, v15.8h\n"
+ "fmin v18.8h, v18.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v15.8h\n"
+ "fmin v20.8h, v20.8h, v15.8h\n"
+ "fmin v21.8h, v21.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v15.8h\n"
+ "fmin v24.8h, v24.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v15.8h\n"
+ "fmin v27.8h, v27.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "tbz %x[n_channels], #2, 138f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.d }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.d }[0], [x22], x5\n"
+ "st1 { v24.d }[0], [x21], x5\n"
+ "add x8, x8, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v28.d }[0], [x20], x5\n"
+ "add x27, x27, #0x8\n"
+ "add x24, x24, #0x8\n"
+ "st1 { v17.d }[0], [x23], x5\n"
+ "st1 { v21.d }[0], [x22], x5\n"
+ "st1 { v25.d }[0], [x21], x5\n"
+ "st1 { v29.d }[0], [x20], x5\n"
+ "st1 { v18.d }[0], [x23], x5\n"
+ "st1 { v22.d }[0], [x22], x5\n"
+ "st1 { v26.d }[0], [x21], x5\n"
+ "st1 { v30.d }[0], [x20], x5\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "st1 { v23.d }[0], [x22]\n"
+ "st1 { v27.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 137f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[2], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[2], [x22], x5\n"
+ "st1 { v24.s }[2], [x21], x5\n"
+ "add x8, x8, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v28.s }[2], [x20], x5\n"
+ "add x27, x27, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v17.s }[2], [x23], x5\n"
+ "st1 { v21.s }[2], [x22], x5\n"
+ "st1 { v25.s }[2], [x21], x5\n"
+ "st1 { v29.s }[2], [x20], x5\n"
+ "st1 { v18.s }[2], [x23], x5\n"
+ "st1 { v22.s }[2], [x22], x5\n"
+ "st1 { v26.s }[2], [x21], x5\n"
+ "st1 { v30.s }[2], [x20], x5\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 140f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[6], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[6], [x22], x5\n"
+ "st1 { v24.h }[6], [x21], x5\n"
+ "st1 { v28.h }[6], [x20], x5\n"
+ "st1 { v17.h }[6], [x23], x5\n"
+ "st1 { v21.h }[6], [x22], x5\n"
+ "st1 { v25.h }[6], [x21], x5\n"
+ "st1 { v29.h }[6], [x20], x5\n"
+ "st1 { v18.h }[6], [x23], x5\n"
+ "st1 { v22.h }[6], [x22], x5\n"
+ "st1 { v26.h }[6], [x21], x5\n"
+ "st1 { v30.h }[6], [x20], x5\n"
+ "st1 { v19.h }[6], [x23]\n"
+ "st1 { v23.h }[6], [x22]\n"
+ "st1 { v27.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 140f\n"
+ "137:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 140f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[4], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[4], [x22], x5\n"
+ "st1 { v24.h }[4], [x21], x5\n"
+ "st1 { v28.h }[4], [x20], x5\n"
+ "st1 { v17.h }[4], [x23], x5\n"
+ "st1 { v21.h }[4], [x22], x5\n"
+ "st1 { v25.h }[4], [x21], x5\n"
+ "st1 { v29.h }[4], [x20], x5\n"
+ "st1 { v18.h }[4], [x23], x5\n"
+ "st1 { v22.h }[4], [x22], x5\n"
+ "st1 { v26.h }[4], [x21], x5\n"
+ "st1 { v30.h }[4], [x20], x5\n"
+ "st1 { v19.h }[4], [x23]\n"
+ "st1 { v23.h }[4], [x22]\n"
+ "st1 { v27.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 140f\n"
+ "138:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 139f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[0], [x22], x5\n"
+ "st1 { v24.s }[0], [x21], x5\n"
+ "add x8, x8, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v28.s }[0], [x20], x5\n"
+ "add x27, x27, #0x4\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v17.s }[0], [x23], x5\n"
+ "st1 { v21.s }[0], [x22], x5\n"
+ "st1 { v25.s }[0], [x21], x5\n"
+ "st1 { v29.s }[0], [x20], x5\n"
+ "st1 { v18.s }[0], [x23], x5\n"
+ "st1 { v22.s }[0], [x22], x5\n"
+ "st1 { v26.s }[0], [x21], x5\n"
+ "st1 { v30.s }[0], [x20], x5\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "st1 { v27.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 140f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[2], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[2], [x22], x5\n"
+ "st1 { v24.h }[2], [x21], x5\n"
+ "st1 { v28.h }[2], [x20], x5\n"
+ "st1 { v17.h }[2], [x23], x5\n"
+ "st1 { v21.h }[2], [x22], x5\n"
+ "st1 { v25.h }[2], [x21], x5\n"
+ "st1 { v29.h }[2], [x20], x5\n"
+ "st1 { v18.h }[2], [x23], x5\n"
+ "st1 { v22.h }[2], [x22], x5\n"
+ "st1 { v26.h }[2], [x21], x5\n"
+ "st1 { v30.h }[2], [x20], x5\n"
+ "st1 { v19.h }[2], [x23]\n"
+ "st1 { v23.h }[2], [x22]\n"
+ "st1 { v27.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 140f\n"
+ "139:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.h }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.h }[0], [x22], x5\n"
+ "st1 { v24.h }[0], [x21], x5\n"
+ "st1 { v28.h }[0], [x20], x5\n"
+ "st1 { v17.h }[0], [x23], x5\n"
+ "st1 { v21.h }[0], [x22], x5\n"
+ "st1 { v25.h }[0], [x21], x5\n"
+ "st1 { v29.h }[0], [x20], x5\n"
+ "st1 { v18.h }[0], [x23], x5\n"
+ "st1 { v22.h }[0], [x22], x5\n"
+ "st1 { v26.h }[0], [x21], x5\n"
+ "st1 { v30.h }[0], [x20], x5\n"
+ "st1 { v19.h }[0], [x23]\n"
+ "st1 { v23.h }[0], [x22]\n"
+ "st1 { v27.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "140:" // Tile loop: Oddments: Store: Bit 2: End
+ "141:" // Tile loop: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..96feeeeece
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,2007 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "lsr x7, %x[n_channels], #0x3\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "sub x14, XZR, x6\n"
+ "cbz x7, 3f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x6, x7, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr q10, [x20, x15]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x15]\n"
+ "ldr q12, [x20, x15]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v23.16b, v30.16b\n fmla v23.8h, v4.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v0.8h, v9.8h\n"
+ "mov v16.16b, v30.16b\n fmla v16.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "mov v15.16b, v30.16b\n fmla v15.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v12.8h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x21, [x16, #0x58]\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v10.16b, v30.16b\n fmla v10.8h, v2.8h, v11.8h\n"
+ "ldr q18, [x23, x15]\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "fmla v16.8h, v8.8h, v12.8h\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v15.8h, v7.8h, v12.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v6.8h, v22.8h\n"
+ "ldr q22, [x20, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v23.8h, v7.8h, v9.8h\n"
+ "fmla v10.8h, v6.8h, v12.8h\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v3.8h, v12.8h\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v24.16b, v30.16b\n fmla v24.8h, v8.8h, v18.8h\n"
+ "ldr q12, [x26, x15]\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmla v20.8h, v3.8h, v9.8h\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v1.8h, v9.8h\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v0.8h, v9.8h\n"
+ "ldr q30, [x17, #0x0]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x25, x15]\n"
+ "fmla v17.8h, v1.8h, v11.8h\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v16.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x21, x15]\n"
+ "fmla v15.8h, v2.8h, v12.8h\n"
+ "ldr x21, [x16, #0x98]\n"
+ "fmla v23.8h, v8.8h, v22.8h\n"
+ "fmla v10.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.8h, v7.8h, v22.8h\n"
+ "fmla v21.8h, v6.8h, v22.8h\n"
+ "fmla v28.8h, v5.8h, v22.8h\n"
+ "fmla v20.8h, v4.8h, v22.8h\n"
+ "fmla v19.8h, v3.8h, v22.8h\n"
+ "fmla v26.8h, v2.8h, v22.8h\n"
+ "fmla v18.8h, v1.8h, v22.8h\n"
+ "fmla v24.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.8h, v3.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "ldr q9, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v16.8h, v4.8h, v22.8h\n"
+ "fmla v15.8h, v3.8h, v22.8h\n"
+ "fmla v23.8h, v1.8h, v22.8h\n"
+ "fmla v10.8h, v5.8h, v11.8h\n"
+ "fmla v21.8h, v2.8h, v11.8h\n"
+ "ldr q12, [x22, x15]\n"
+ "fmla v25.8h, v0.8h, v22.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.8h, v8.8h, v9.8h\n"
+ "fmla v24.8h, v5.8h, v9.8h\n"
+ "ldr q11, [x20, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v27.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v16.8h, v5.8h, v12.8h\n"
+ "fmla v15.8h, v4.8h, v12.8h\n"
+ "fmla v23.8h, v2.8h, v12.8h\n"
+ "fmla v10.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v21.8h, v0.8h, v12.8h\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "ldr q12, [x27, x15]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla v17.8h, v7.8h, v22.8h\n"
+ "fmla v16.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v4.8h, v22.8h\n"
+ "fmla v23.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v28.8h, v0.8h, v22.8h\n"
+ "ldr q11, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v15.8h, v8.8h, v9.8h\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v24.8h, v7.8h, v12.8h\n"
+ "ldr q12, [x25, x15]\n"
+ "fmla v19.8h, v1.8h, v9.8h\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v10.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v5.8h, v9.8h\n"
+ "fmla v21.8h, v4.8h, v9.8h\n"
+ "fmla v20.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "fmla v15.8h, v0.8h, v11.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v27.8h, v7.8h, v12.8h\n"
+ "ldr x25, [x16, #0xf8]\n"
+ "fmla v23.8h, v6.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v26.8h, v0.8h, v12.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "fmla v19.8h, v4.8h, v11.8h\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v18.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v2.8h, v9.8h\n"
+ "fmla v15.8h, v1.8h, v9.8h\n"
+ "fmla v10.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x20, x15]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v17.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "ldr x22, [x16, #0x110]\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q12, [x28, x15]\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "ldr x21, [x16, #0x118]\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "fmla v26.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "fmla v10.8h, v8.8h, v9.8h\n"
+ "fmla v21.8h, v5.8h, v9.8h\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v27.8h, v6.8h, v22.8h\n"
+ "fmla v31.8h, v3.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v20.8h, v6.8h, v12.8h\n"
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v24.8h, v2.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v22.8h\n"
+ "fmla v18.8h, v6.8h, v22.8h\n"
+ "fmla v31.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x24, x15]\n"
+ "fmla v29.8h, v8.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v7.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v18.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x20, x15]\n"
+ "ldp x20, x24, [x16, #0x0]\n"
+ "ldr q9, [x20, x6]\n"
+ "fmla v21.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x25, x15]\n"
+ "fmla v17.8h, v4.8h, v22.8h\n"
+ "fmla v16.8h, v3.8h, v22.8h\n"
+ "fmla v15.8h, v5.8h, v12.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v10.8h, v4.8h, v12.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v18.8h, v7.8h, v11.8h\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x22, x15]\n"
+ "fmax v15.8h, v15.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v22.8h\n"
+ "fmla v23.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "ldr q6, [x17, #0x70]\n"
+ "fmla v20.8h, v8.8h, v22.8h\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v19.8h, v7.8h, v22.8h\n"
+ "ldr q7, [x17, #0x80]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q17, [x12, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmin v15.8h, v15.8h, v14.8h\n"
+ "fmin v10.8h, v10.8h, v14.8h\n"
+ "str q16, [x11, x14]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "str q15, [x10, x14]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "str q10, [x9, x14]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v26.8h, v3.8h, v11.8h\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmla v18.8h, v5.8h, v22.8h\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v24.8h, v4.8h, v22.8h\n"
+ "ldr q10, [x24, x6]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q27, [x23, x14]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "str q23, [x22, x14]\n"
+ "ldr x25, [x8, #0x40]\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "str q25, [x21, x14]\n"
+ "ldr x23, [x8, #0x48]\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "str q21, [x20, x14]\n"
+ "ldr x22, [x8, #0x50]\n"
+ "ldr x24, [x8, #0x58]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x6]\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "ldr q12, [x20, x6]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "str q31, [x25, x14]\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "str q28, [x23, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "str q20, [x22, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "str q19, [x24, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x7, LSL #4\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q29, [x23, x14]\n"
+ "add x17, x17, #0xa0\n"
+ "str q26, [x22, x14]\n"
+ "str q18, [x21, x14]\n"
+ "str q24, [x20, x14]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v30.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v8.8h, v9.8h\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v15.16b, v30.16b\n fmla v15.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v0.8h, v9.8h\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v7.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x21, [x16, #0x48]\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v6.8h, v9.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v2.8h, v9.8h\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.8h, v0.8h, v10.8h\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q16, [x23, x15]\n"
+ "fmla v15.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v19.8h, v1.8h, v12.8h\n"
+ "fmla v20.8h, v8.8h, v12.8h\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v21.8h, v7.8h, v12.8h\n"
+ "mov v10.16b, v30.16b\n fmla v10.8h, v6.8h, v22.8h\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v31.8h, v7.8h, v24.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v9.16b, v30.16b\n fmla v9.8h, v3.8h, v12.8h\n"
+ "mov v11.16b, v30.16b\n fmla v11.8h, v0.8h, v12.8h\n"
+ "ldr q23, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v12.16b, v30.16b\n fmla v12.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x26, x15]\n"
+ "fmla v15.8h, v6.8h, v24.8h\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla v29.8h, v4.8h, v24.8h\n"
+ "fmla v19.8h, v3.8h, v24.8h\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v1.8h, v24.8h\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v0.8h, v24.8h\n"
+ "fmla v18.8h, v8.8h, v24.8h\n"
+ "fmla v27.8h, v5.8h, v24.8h\n"
+ "fmla v10.8h, v2.8h, v24.8h\n"
+ "ldr q24, [x25, x15]\n"
+ "fmla v17.8h, v1.8h, v23.8h\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v20.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x20, x15]\n"
+ "fmla v21.8h, v2.8h, v16.8h\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v31.8h, v8.8h, v22.8h\n"
+ "fmla v28.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v15.8h, v7.8h, v22.8h\n"
+ "fmla v9.8h, v6.8h, v22.8h\n"
+ "fmla v29.8h, v5.8h, v22.8h\n"
+ "fmla v19.8h, v4.8h, v22.8h\n"
+ "fmla v11.8h, v3.8h, v22.8h\n"
+ "fmla v26.8h, v2.8h, v22.8h\n"
+ "fmla v25.8h, v1.8h, v22.8h\n"
+ "fmla v12.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.8h, v3.8h, v24.8h\n"
+ "fmla v18.8h, v0.8h, v24.8h\n"
+ "fmla v27.8h, v6.8h, v16.8h\n"
+ "fmla v10.8h, v3.8h, v16.8h\n"
+ "ldr q16, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v20.8h, v4.8h, v22.8h\n"
+ "fmla v21.8h, v3.8h, v22.8h\n"
+ "fmla v31.8h, v1.8h, v22.8h\n"
+ "fmla v28.8h, v5.8h, v23.8h\n"
+ "fmla v9.8h, v2.8h, v23.8h\n"
+ "ldr q23, [x22, x15]\n"
+ "fmla v15.8h, v0.8h, v22.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v11.8h, v8.8h, v16.8h\n"
+ "fmla v12.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x21, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.8h, v5.8h, v22.8h\n"
+ "fmla v18.8h, v2.8h, v22.8h\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ "fmla v20.8h, v5.8h, v23.8h\n"
+ "fmla v21.8h, v4.8h, v23.8h\n"
+ "fmla v31.8h, v2.8h, v23.8h\n"
+ "fmla v28.8h, v3.8h, v23.8h\n"
+ "fmla v15.8h, v1.8h, v23.8h\n"
+ "fmla v9.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x20, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v10.8h, v7.8h, v16.8h\n"
+ "fmla v26.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x27, x15]\n"
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v17.8h, v7.8h, v22.8h\n"
+ "fmla v20.8h, v6.8h, v22.8h\n"
+ "fmla v18.8h, v4.8h, v22.8h\n"
+ "fmla v31.8h, v3.8h, v22.8h\n"
+ "fmla v27.8h, v1.8h, v22.8h\n"
+ "fmla v29.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v21.8h, v8.8h, v23.8h\n"
+ "fmla v25.8h, v8.8h, v16.8h\n"
+ "fmla v12.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v11.8h, v1.8h, v23.8h\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v28.8h, v7.8h, v23.8h\n"
+ "fmla v15.8h, v5.8h, v23.8h\n"
+ "fmla v9.8h, v4.8h, v23.8h\n"
+ "fmla v19.8h, v2.8h, v23.8h\n"
+ "ldr q23, [x24, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v17.8h, v2.8h, v22.8h\n"
+ "fmla v20.8h, v1.8h, v22.8h\n"
+ "fmla v21.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v18.8h, v7.8h, v16.8h\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla v31.8h, v6.8h, v16.8h\n"
+ "fmla v27.8h, v4.8h, v16.8h\n"
+ "fmla v29.8h, v3.8h, v16.8h\n"
+ "fmla v10.8h, v1.8h, v16.8h\n"
+ "fmla v26.8h, v0.8h, v16.8h\n"
+ "ldr q16, [x22, x15]\n"
+ "fmla v11.8h, v4.8h, v16.8h\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v20.8h, v2.8h, v23.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v28.8h, v0.8h, v23.8h\n"
+ "ldr q23, [x21, x15]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla v17.8h, v6.8h, v22.8h\n"
+ "fmla v18.8h, v3.8h, v22.8h\n"
+ "fmla v27.8h, v0.8h, v22.8h\n"
+ "ldr q22, [x20, x15]\n"
+ "fmla v15.8h, v8.8h, v16.8h\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla v9.8h, v7.8h, v16.8h\n"
+ "fmla v19.8h, v5.8h, v16.8h\n"
+ "fmla v12.8h, v1.8h, v16.8h\n"
+ "ldr q16, [x28, x15]\n"
+ "fmla v11.8h, v2.8h, v23.8h\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v10.8h, v0.8h, v22.8h\n"
+ "fmla v26.8h, v4.8h, v16.8h\n"
+ "fmla v25.8h, v3.8h, v16.8h\n"
+ "fmla v28.8h, v8.8h, v23.8h\n"
+ "fmla v9.8h, v5.8h, v23.8h\n"
+ "ldr q23, [x27, x15]\n"
+ "fmla v18.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v3.8h, v22.8h\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v29.8h, v7.8h, v16.8h\n"
+ "fmla v19.8h, v6.8h, v16.8h\n"
+ "fmla v10.8h, v5.8h, v16.8h\n"
+ "fmla v11.8h, v5.8h, v23.8h\n"
+ "fmla v12.8h, v2.8h, v23.8h\n"
+ "fmla v26.8h, v7.8h, v22.8h\n"
+ "fmla v25.8h, v6.8h, v22.8h\n"
+ "fmla v27.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v10.8h, v8.8h, v22.8h\n"
+ "ldr q30, [x23, x15]\n"
+ "fmla v29.8h, v8.8h, v16.8h\n"
+ "fmla v19.8h, v7.8h, v16.8h\n"
+ "fmla v11.8h, v6.8h, v16.8h\n"
+ "fmla v26.8h, v5.8h, v16.8h\n"
+ "fmla v25.8h, v4.8h, v16.8h\n"
+ "fmla v12.8h, v3.8h, v16.8h\n"
+ "ldr q24, [x22, x15]\n"
+ "fmla v9.8h, v8.8h, v23.8h\n"
+ "ldr q16, [x24, x15]\n"
+ "fmla v17.8h, v4.8h, v30.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmla v20.8h, v3.8h, v30.8h\n"
+ "fmla v21.8h, v5.8h, v24.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v24.8h\n"
+ "fmla v26.8h, v8.8h, v16.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmla v25.8h, v7.8h, v16.8h\n"
+ "fmla v12.8h, v6.8h, v16.8h\n"
+ "ldr q23, [x21, x15]\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmla v18.8h, v1.8h, v30.8h\n"
+ "fmla v31.8h, v0.8h, v30.8h\n"
+ "ldr q16, [x20, x15]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmla v15.8h, v2.8h, v24.8h\n"
+ "fmla v9.8h, v1.8h, v24.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "str q17, [x12, x14]\n"
+ "fmla v27.8h, v7.8h, v23.8h\n"
+ "fmla v29.8h, v6.8h, v23.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q20, [x11, x14]\n"
+ "fmla v19.8h, v8.8h, v16.8h\n"
+ "fmla v11.8h, v7.8h, v16.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "str q21, [x10, x14]\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "str q28, [x9, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmax v15.8h, v15.8h, v13.8h\n"
+ "fmax v9.8h, v9.8h, v13.8h\n"
+ "ldr x22, [x8, #0x28]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v10.8h, v4.8h, v23.8h\n"
+ "fmla v26.8h, v3.8h, v23.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmla v25.8h, v5.8h, v16.8h\n"
+ "fmla v12.8h, v4.8h, v16.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "str q18, [x23, x14]\n"
+ "fmin v15.8h, v15.8h, v14.8h\n"
+ "fmin v9.8h, v9.8h, v14.8h\n"
+ "str q31, [x22, x14]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "str q15, [x21, x14]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v11.8h, v11.8h, v13.8h\n"
+ "str q9, [x20, x14]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q27, [x23, x14]\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v11.8h, v11.8h, v14.8h\n"
+ "str q29, [x22, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v10.8h, v10.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "str q19, [x21, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v12.8h, v12.8h, v13.8h\n"
+ "str q11, [x20, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v10.8h, v10.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q10, [x23, x14]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v12.8h, v12.8h, v14.8h\n"
+ "str q26, [x22, x14]\n"
+ "add x15, x15, #0x10\n"
+ "str q25, [x21, x14]\n"
+ "str q12, [x20, x14]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 140f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "mov x14, x15\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.d }[0], [x23], #0x8\n"
+ "ld1 { v10.d }[0], [x22], #0x8\n"
+ "ld1 { v11.d }[0], [x21], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[6], [x23], #0x2\n"
+ "ld1 { v10.h }[6], [x22], #0x2\n"
+ "ld1 { v11.h }[6], [x21], #0x2\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[4], [x23], #0x2\n"
+ "ld1 { v10.h }[4], [x22], #0x2\n"
+ "ld1 { v11.h }[4], [x21], #0x2\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x23], #0x4\n"
+ "ld1 { v10.s }[0], [x22], #0x4\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x23], #0x2\n"
+ "ld1 { v10.h }[2], [x22], #0x2\n"
+ "ld1 { v11.h }[2], [x21], #0x2\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x23], #0x2\n"
+ "ld1 { v10.h }[0], [x22], #0x2\n"
+ "ld1 { v11.h }[0], [x21], #0x2\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 2: End
+ "mov v16.16b, v30.16b\n fmla v16.8h, v8.8h, v9.8h\n"
+ "mov v17.16b, v30.16b\n fmla v17.8h, v7.8h, v9.8h\n"
+ "ldr x20, [x16, #0x20]\n"
+ "add x20, x20, x15\n"
+ "mov v18.16b, v30.16b\n fmla v18.8h, v6.8h, v9.8h\n"
+ "mov v21.16b, v30.16b\n fmla v21.8h, v4.8h, v9.8h\n"
+ "mov v22.16b, v30.16b\n fmla v22.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v30.16b\n fmla v25.8h, v1.8h, v9.8h\n"
+ "mov v26.16b, v30.16b\n fmla v26.8h, v0.8h, v9.8h\n"
+ "mov v19.16b, v30.16b\n fmla v19.8h, v2.8h, v11.8h\n"
+ "mov v20.16b, v30.16b\n fmla v20.8h, v5.8h, v9.8h\n"
+ "mov v24.16b, v30.16b\n fmla v24.8h, v2.8h, v9.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "fmla v17.8h, v8.8h, v12.8h\n"
+ "fmla v18.8h, v7.8h, v12.8h\n"
+ "fmla v19.8h, v6.8h, v12.8h\n"
+ "fmla v21.8h, v5.8h, v12.8h\n"
+ "fmla v22.8h, v4.8h, v12.8h\n"
+ "mov v23.16b, v30.16b\n fmla v23.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v2.8h, v12.8h\n"
+ "fmla v26.8h, v1.8h, v12.8h\n"
+ "mov v27.16b, v30.16b\n fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (5, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (5, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (5, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (5, 0): Bit 2: End
+ "ldr x20, [x16, #0x28]\n"
+ "mov v28.16b, v30.16b\n fmla v28.8h, v6.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (5, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (5, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (5, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (5, 5): Bit 2: End
+ "ldr x20, [x16, #0x30]\n"
+ "mov v31.16b, v30.16b\n fmla v31.8h, v8.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (3, 2): Bit 2: End
+ "ldr x20, [x16, #0x38]\n"
+ "fmla v20.8h, v8.8h, v9.8h\n"
+ "fmla v21.8h, v7.8h, v9.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v22.8h, v6.8h, v9.8h\n"
+ "fmla v24.8h, v5.8h, v9.8h\n"
+ "fmla v25.8h, v4.8h, v9.8h\n"
+ "fmla v26.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "mov v29.16b, v30.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v0.8h, v9.8h\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (0, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (0, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (0, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (0, 1): Bit 2: End
+ "ldr x20, [x16, #0x40]\n"
+ "fmla v16.8h, v1.8h, v12.8h\n"
+ "fmla v17.8h, v0.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (0, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (0, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (0, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (0, 4): Bit 2: End
+ "ldr x20, [x16, #0x48]\n"
+ "fmla v18.8h, v2.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr x20, [x16, #0x50]\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v22.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v23.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (1, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (1, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (1, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (1, 0): Bit 2: End
+ "ldr x20, [x16, #0x58]\n"
+ "fmla v16.8h, v3.8h, v9.8h\n"
+ "fmla v20.8h, v0.8h, v9.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (1, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (1, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (1, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (1, 5): Bit 2: End
+ "ldr x20, [x16, #0x60]\n"
+ "fmla v19.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v2.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (4, 0): Bit 2: End
+ "ldr x20, [x16, #0x68]\n"
+ "fmla v24.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (1, 2): Bit 2: End
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v16.8h, v5.8h, v10.8h\n"
+ "fmla v17.8h, v4.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v18.8h, v3.8h, v10.8h\n"
+ "fmla v20.8h, v2.8h, v10.8h\n"
+ "fmla v21.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (4, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (4, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (4, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (4, 5): Bit 2: End
+ "ldr x20, [x16, #0x78]\n"
+ "fmla v27.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v5.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "55:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v17.8h, v5.8h, v12.8h\n"
+ "fmla v18.8h, v4.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v19.8h, v3.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v22.8h, v1.8h, v12.8h\n"
+ "fmla v23.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 57f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 59f\n"
+ "56:" // Oddments: Load input (5, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 59f\n"
+ "57:" // Oddments: Load input (5, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (5, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "59:" // Oddments: Load input (5, 1): Bit 2: End
+ "ldr x20, [x16, #0x88]\n"
+ "fmla v28.8h, v7.8h, v11.8h\n"
+ "fmla v29.8h, v6.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 61f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 63f\n"
+ "60:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 63f\n"
+ "61:" // Oddments: Load input (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "63:" // Oddments: Load input (2, 1): Bit 2: End
+ "ldr x20, [x16, #0x90]\n"
+ "fmla v16.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v20.8h, v4.8h, v10.8h\n"
+ "fmla v21.8h, v3.8h, v10.8h\n"
+ "fmla v24.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 65f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 67f\n"
+ "64:" // Oddments: Load input (5, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 67f\n"
+ "65:" // Oddments: Load input (5, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (5, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "67:" // Oddments: Load input (5, 4): Bit 2: End
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #2, 69f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 71f\n"
+ "68:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 71f\n"
+ "69:" // Oddments: Load input (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 70f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 71f\n"
+ "70:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "71:" // Oddments: Load input (2, 4): Bit 2: End
+ "ldr x20, [x16, #0xa0]\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v19.8h, v7.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v22.8h, v5.8h, v12.8h\n"
+ "fmla v23.8h, v4.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v27.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 73f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 72f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 75f\n"
+ "72:" // Oddments: Load input (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 75f\n"
+ "73:" // Oddments: Load input (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 74f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 75f\n"
+ "74:" // Oddments: Load input (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "75:" // Oddments: Load input (0, 2): Bit 2: End
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla v16.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v18.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 77f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 76f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 79f\n"
+ "76:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 79f\n"
+ "77:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 78f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 79f\n"
+ "78:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "79:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v20.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v24.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 81f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 80f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 83f\n"
+ "80:" // Oddments: Load input (0, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 83f\n"
+ "81:" // Oddments: Load input (0, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 82f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 83f\n"
+ "82:" // Oddments: Load input (0, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "83:" // Oddments: Load input (0, 3): Bit 2: End
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla v17.8h, v2.8h, v12.8h\n"
+ "fmla v18.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v19.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 85f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 84f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 87f\n"
+ "84:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 87f\n"
+ "85:" // Oddments: Load input (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 86f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 87f\n"
+ "86:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "87:" // Oddments: Load input (2, 0): Bit 2: End
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla v16.8h, v6.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 89f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 88f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 91f\n"
+ "88:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 91f\n"
+ "89:" // Oddments: Load input (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 90f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 91f\n"
+ "90:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "91:" // Oddments: Load input (3, 4): Bit 2: End
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "fmla v27.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 93f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 92f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 95f\n"
+ "92:" // Oddments: Load input (2, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 95f\n"
+ "93:" // Oddments: Load input (2, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 94f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 95f\n"
+ "94:" // Oddments: Load input (2, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "95:" // Oddments: Load input (2, 5): Bit 2: End
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v19.8h, v8.8h, v12.8h\n"
+ "fmla v23.8h, v5.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v27.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 97f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 96f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 99f\n"
+ "96:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 99f\n"
+ "97:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 98f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 99f\n"
+ "98:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "99:" // Oddments: Load input (3, 0): Bit 2: End
+ "ldr x20, [x16, #0xd8]\n"
+ "fmla v20.8h, v6.8h, v10.8h\n"
+ "fmla v24.8h, v3.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 101f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 100f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 103f\n"
+ "100:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 103f\n"
+ "101:" // Oddments: Load input (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 102f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 103f\n"
+ "102:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "103:" // Oddments: Load input (4, 2): Bit 2: End
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla v24.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v5.8h, v11.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 105f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 104f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 107f\n"
+ "104:" // Oddments: Load input (3, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 107f\n"
+ "105:" // Oddments: Load input (3, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 106f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 107f\n"
+ "106:" // Oddments: Load input (3, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "107:" // Oddments: Load input (3, 5): Bit 2: End
+ "ldr x20, [x16, #0xe8]\n"
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 109f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 108f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 111f\n"
+ "108:" // Oddments: Load input (5, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 111f\n"
+ "109:" // Oddments: Load input (5, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 110f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 111f\n"
+ "110:" // Oddments: Load input (5, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "111:" // Oddments: Load input (5, 2): Bit 2: End
+ "ldr x20, [x16, #0xf0]\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 113f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 112f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 115f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 115f\n"
+ "112:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 115f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 115f\n"
+ "113:" // Oddments: Load input (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 114f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 115f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 115f\n"
+ "114:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "115:" // Oddments: Load input (4, 3): Bit 2: End
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v26.8h, v7.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v27.8h, v6.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 117f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 116f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 119f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 119f\n"
+ "116:" // Oddments: Load input (5, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 119f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 119f\n"
+ "117:" // Oddments: Load input (5, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 118f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 119f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 119f\n"
+ "118:" // Oddments: Load input (5, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "119:" // Oddments: Load input (5, 3): Bit 2: End
+ "ldr x20, [x16, #0x100]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 121f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 120f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 123f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 123f\n"
+ "120:" // Oddments: Load input (1, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 123f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 123f\n"
+ "121:" // Oddments: Load input (1, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 122f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 123f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 123f\n"
+ "122:" // Oddments: Load input (1, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "123:" // Oddments: Load input (1, 1): Bit 2: End
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v3.8h, v10.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v20.8h, v1.8h, v10.8h\n"
+ "fmla v21.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #2, 125f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 124f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 127f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 127f\n"
+ "124:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 127f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 127f\n"
+ "125:" // Oddments: Load input (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 126f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 127f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 127f\n"
+ "126:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "127:" // Oddments: Load input (1, 4): Bit 2: End
+ "ldr x20, [x16, #0x110]\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v19.8h, v4.8h, v11.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v22.8h, v2.8h, v11.8h\n"
+ "fmla v23.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #2, 129f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 128f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 131f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 131f\n"
+ "128:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 131f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 131f\n"
+ "129:" // Oddments: Load input (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 130f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 131f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 131f\n"
+ "130:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "131:" // Oddments: Load input (4, 1): Bit 2: End
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v24.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "add x20, x20, x15\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 133f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 132f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 135f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 135f\n"
+ "132:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 135f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 135f\n"
+ "133:" // Oddments: Load input (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 134f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 135f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 135f\n"
+ "134:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "135:" // Oddments: Load input (4, 4): Bit 2: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmax v16.8h, v16.8h, v13.8h\n"
+ "fmla v30.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmax v17.8h, v17.8h, v13.8h\n"
+ "fmax v18.8h, v18.8h, v13.8h\n"
+ "fmax v19.8h, v19.8h, v13.8h\n"
+ "fmax v20.8h, v20.8h, v13.8h\n"
+ "fmax v21.8h, v21.8h, v13.8h\n"
+ "fmax v22.8h, v22.8h, v13.8h\n"
+ "fmax v23.8h, v23.8h, v13.8h\n"
+ "fmax v24.8h, v24.8h, v13.8h\n"
+ "fmax v25.8h, v25.8h, v13.8h\n"
+ "fmax v26.8h, v26.8h, v13.8h\n"
+ "fmax v27.8h, v27.8h, v13.8h\n"
+ "fmax v28.8h, v28.8h, v13.8h\n"
+ "fmax v29.8h, v29.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v13.8h\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "tbz %x[n_channels], #2, 137f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 136f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x4\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 139f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.h }[6], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.h }[6], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.h }[6], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.h }[6], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.h }[6], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.h }[6], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.h }[6], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.h }[6], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 139f\n"
+ "136:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 139f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.h }[4], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.h }[4], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.h }[4], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.h }[4], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.h }[4], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.h }[4], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.h }[4], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.h }[4], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 139f\n"
+ "137:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 138f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x4\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 139f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.h }[2], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.h }[2], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.h }[2], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.h }[2], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.h }[2], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.h }[2], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.h }[2], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 139f\n"
+ "138:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.h }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.h }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.h }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.h }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.h }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.h }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.h }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "139:" // Oddments: Store: Bit 2: End
+ "140:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..8ad6a37fea
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor for the AArch64 FP16 NHWC 3x3 stride-2 depthwise
+// kernel producing a 2x2 output tile per invocation ("mla depthfirst").
+// It only wires the two generated kernel entry points (indirect/direct)
+// and the tile geometry into the generic DepthwiseDepthfirstStrategy
+// machinery; all computation lives in the *_impl functions.
+class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Kernel taking an array of input-row pointers (used when padding /
+  // non-contiguous rows require indirection).
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+  // Kernel taking a base pointer plus row/column strides (dense interior tiles).
+  Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  // VLType::None: fixed-width (non-scalable) vector path — presumably the
+  // plain NEON variant as opposed to SVE; confirm against arm_gemm::VLType.
+  constexpr static auto vl_type = arm_gemm::VLType::None;
+
+  // 3x3 depthwise filter ...
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  // ... applied with stride 2 in both spatial dimensions ...
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  // ... yielding a 2x2 block of output points per kernel call.
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  // CPUInfo is unused here; the parameter exists to match the common
+  // strategy-constructor signature used by the implementation tables.
+  a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  // Accessors consumed by the depthfirst driver to dispatch the right kernel.
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..8954999990
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,895 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x23, #0x0\n"
+ "mov x27, #0x0\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "mov x25, #0x2\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x23, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x27, x6, x22\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x6, x6, #0x1\n"
+ "mul x20, x23, x21\n" // offset = tile_i * ld_output_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x16, x8, x24, LSL #1\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x27, x7, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x3\n"
+ "add x14, x16, x24, LSL #1\n"
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "add x13, x6, x6\n"
+ "add x12, x14, x24, LSL #1\n"
+ "add x11, x13, x6\n"
+ "add x17, x17, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x10, x12, x24, LSL #1\n"
+ "add x9, x11, x6\n"
+ "add x28, x17, x21, LSL #1\n"
+ "lsl x7, x7, #0x1\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q31, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldr q9, [x14, x13]\n"
+ "ld1 { v10.8h }, [x8]\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q12, [x8, x11]\n"
+ "ldr q13, [x8, x9]\n"
+ "ld1 { v14.8h }, [x16]\n"
+ "ldr q15, [x16, x6]\n"
+ "ldr q16, [x8, x13]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
+ "add x23, x23, #0x10\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x8]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q21, [x16, x9]\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ld1 { v20.8h }, [x12]\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.8h, v4.8h, v15.8h\n"
+ "ld1 { v25.8h }, [x14]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x12, x6]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "ldr q18, [x14, x6]\n"
+ "fmla v28.8h, v5.8h, v21.8h\n"
+ "ldr q24, [x14, x11]\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v0.8h, v9.8h\n"
+ "ldr q31, [x15, #0x0]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v29.8h, v5.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x11]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v23.8h, v3.8h, v20.8h\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v22.8h, v4.8h, v17.8h\n"
+ "ldr q21, [x10, x6]\n"
+ "fmla v23.8h, v0.8h, v25.8h\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v22.8h, v1.8h, v24.8h\n"
+ "add x21, x21, #0x10\n"
+ "fmla v23.8h, v4.8h, v19.8h\n"
+ "ldr q20, [x14, x9]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v22.8h, v5.8h, v16.8h\n"
+ "ldr q19, [x10, x11]\n"
+ "fmla v29.8h, v6.8h, v25.8h\n"
+ "ld1 { v17.8h }, [x10]\n"
+ "fmla v23.8h, v1.8h, v18.8h\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v22.8h, v2.8h, v20.8h\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v29.8h, v7.8h, v18.8h\n"
+ "ldr q16, [x12, x13]\n"
+ "fmla v23.8h, v6.8h, v17.8h\n"
+ "ldr q18, [x10, x13]\n"
+ "fmla v22.8h, v3.8h, v16.8h\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "ldr q13, [x8, x9]\n"
+ "fmla v22.8h, v7.8h, v19.8h\n"
+ "ld1 { v14.8h }, [x16]\n"
+ "fmla v28.8h, v7.8h, v24.8h\n"
+ "ldr q12, [x8, x11]\n"
+ "fmla v23.8h, v5.8h, v16.8h\n"
+ "ldr q16, [x8, x13]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v22.8h, v6.8h, v18.8h\n"
+ "fmla v28.8h, v8.8h, v20.8h\n"
+ "ldr q17, [x10, x9]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmla v23.8h, v8.8h, v18.8h\n"
+ "fmla v22.8h, v8.8h, v17.8h\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q15, [x16, x6]\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmax v23.8h, v23.8h, v26.8h\n"
+ "fmax v22.8h, v22.8h, v26.8h\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x13]\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v23.8h, v23.8h, v27.8h\n"
+ "fmin v22.8h, v22.8h, v27.8h\n"
+ "add x12, x12, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v29.8h }, [x17]\n"
+ "add x15, x15, #0xa0\n"
+ "str q28, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "st1 { v23.8h }, [x28]\n"
+ "str q22, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.8h, v8.8h, v9.8h\n"
+ "mov v28.16b, v31.16b\n fmla v28.8h, v6.8h, v9.8h\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x16, x9]\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ld1 { v19.8h }, [x12]\n"
+ "fmla v28.8h, v0.8h, v16.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.8h, v4.8h, v15.8h\n"
+ "ld1 { v25.8h }, [x14]\n"
+ "fmla v28.8h, v4.8h, v18.8h\n"
+ "ldr q18, [x12, x6]\n"
+ "fmla v29.8h, v2.8h, v16.8h\n"
+ "ldr q24, [x14, x6]\n"
+ "fmla v28.8h, v5.8h, v20.8h\n"
+ "ldr q23, [x14, x11]\n"
+ "mov v22.16b, v31.16b\n fmla v22.8h, v2.8h, v9.8h\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v5.8h, v17.8h\n"
+ "fmla v28.8h, v3.8h, v17.8h\n"
+ "ldr q17, [x12, x11]\n"
+ "fmla v22.8h, v3.8h, v19.8h\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v21.8h, v4.8h, v17.8h\n"
+ "ldr q20, [x10, x6]\n"
+ "fmla v22.8h, v0.8h, v25.8h\n"
+ "fmla v21.8h, v1.8h, v23.8h\n"
+ "fmla v22.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x14, x9]\n"
+ "fmla v21.8h, v5.8h, v16.8h\n"
+ "ldr q18, [x10, x11]\n"
+ "fmla v29.8h, v6.8h, v25.8h\n"
+ "ld1 { v17.8h }, [x10]\n"
+ "fmla v22.8h, v1.8h, v24.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmla v21.8h, v2.8h, v19.8h\n"
+ "fmla v29.8h, v7.8h, v24.8h\n"
+ "ldr q16, [x12, x13]\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmla v22.8h, v6.8h, v17.8h\n"
+ "ldr q17, [x10, x13]\n"
+ "fmla v21.8h, v3.8h, v16.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmla v22.8h, v7.8h, v20.8h\n"
+ "fmla v21.8h, v7.8h, v18.8h\n"
+ "st1 { v29.8h }, [x17]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.8h, v7.8h, v23.8h\n"
+ "fmla v22.8h, v5.8h, v16.8h\n"
+ "fmla v21.8h, v6.8h, v17.8h\n"
+ "fmla v28.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x10, x9]\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmla v22.8h, v8.8h, v17.8h\n"
+ "fmla v21.8h, v8.8h, v16.8h\n"
+ "fmax v22.8h, v22.8h, v26.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmax v21.8h, v21.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "str q28, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "fmin v22.8h, v22.8h, v27.8h\n"
+ "fmin v21.8h, v21.8h, v27.8h\n"
+ "st1 { v22.8h }, [x28]\n"
+ "str q21, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 81f\n"
+ "ldr q31, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "add x27, x14, x13\n"
+ "add x26, x8, XZR\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x25, x8, x6\n"
+ "add x24, x8, x11\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x23, x8, x9\n"
+ "add x22, x16, XZR\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "add x21, x16, x6\n"
+ "add x20, x8, x13\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d11, [x25], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "ldr d15, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v9.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[6], [x27]\n"
+ "ld1 { v10.h }[6], [x26]\n"
+ "ld1 { v11.h }[6], [x25]\n"
+ "ld1 { v12.h }[6], [x24]\n"
+ "ld1 { v13.h }[6], [x23]\n"
+ "ld1 { v14.h }[6], [x22]\n"
+ "ld1 { v15.h }[6], [x21]\n"
+ "ld1 { v16.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[4], [x27]\n"
+ "ld1 { v10.h }[4], [x26]\n"
+ "ld1 { v11.h }[4], [x25]\n"
+ "ld1 { v12.h }[4], [x24]\n"
+ "ld1 { v13.h }[4], [x23]\n"
+ "ld1 { v14.h }[4], [x22]\n"
+ "ld1 { v15.h }[4], [x21]\n"
+ "ld1 { v16.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x27], #0x4\n"
+ "ldr s10, [x26], #0x4\n"
+ "ldr s11, [x25], #0x4\n"
+ "ldr s12, [x24], #0x4\n"
+ "ldr s13, [x23], #0x4\n"
+ "ldr s14, [x22], #0x4\n"
+ "ldr s15, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x27]\n"
+ "ld1 { v10.h }[2], [x26]\n"
+ "ld1 { v11.h }[2], [x25]\n"
+ "ld1 { v12.h }[2], [x24]\n"
+ "ld1 { v13.h }[2], [x23]\n"
+ "ld1 { v14.h }[2], [x22]\n"
+ "ld1 { v15.h }[2], [x21]\n"
+ "ld1 { v16.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x27, #0x0]\n"
+ "ldr h10, [x26, #0x0]\n"
+ "ldr h11, [x25, #0x0]\n"
+ "ldr h12, [x24, #0x0]\n"
+ "ldr h13, [x23, #0x0]\n"
+ "ldr h14, [x22, #0x0]\n"
+ "ldr h15, [x21, #0x0]\n"
+ "ldr h16, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "add x20, x16, x11\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v14.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v15.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v0.8h, v16.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "add x20, x16, x9\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "add x20, x16, x13\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 2: End
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "add x20, x12, XZR\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v30.8h, v3.8h, v14.8h\n"
+ "add x20, x14, XZR\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s15, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h15, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 2: End
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "fmla v30.8h, v0.8h, v15.8h\n"
+ "add x20, x12, x6\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "add x20, x14, x6\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v16.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v16.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v16.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h16, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
+ "fmla v28.8h, v7.8h, v16.8h\n"
+ "fmla v30.8h, v1.8h, v16.8h\n"
+ "add x20, x12, x11\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "add x20, x14, x11\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "add x20, x12, x9\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
+ "fmla v31.8h, v5.8h, v14.8h\n"
+ "add x20, x10, XZR\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v15.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v15.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s15, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v15.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h15, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
+ "fmla v30.8h, v6.8h, v15.8h\n"
+ "add x20, x14, x9\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "add x20, x10, x6\n"
+ "tbz %x[n_channels], #2, 58f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 60f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 60f\n"
+ "58:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
+ "fmla v30.8h, v7.8h, v13.8h\n"
+ "add x20, x12, x13\n"
+ "tbz %x[n_channels], #2, 62f\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v16.h }[6], [x20]\n"
+ "b 64f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v16.h }[4], [x20]\n"
+ "b 64f\n"
+ "62:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v16.h }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h16, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "fmla v30.8h, v5.8h, v16.8h\n"
+ "fmla v31.8h, v3.8h, v16.8h\n"
+ "add x20, x10, x11\n"
+ "tbz %x[n_channels], #2, 66f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 68f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 68f\n"
+ "66:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
+ "fmla v31.8h, v7.8h, v14.8h\n"
+ "add x20, x10, x13\n"
+ "tbz %x[n_channels], #2, 70f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v15.h }[6], [x20]\n"
+ "b 72f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v15.h }[4], [x20]\n"
+ "b 72f\n"
+ "70:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 71f\n"
+ "ldr s15, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v15.h }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h15, [x20, #0x0]\n"
+ "72:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
+ "fmla v30.8h, v8.8h, v15.8h\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "add x20, x10, x9\n"
+ "tbz %x[n_channels], #2, 74f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 73f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 76f\n"
+ "73:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 76f\n"
+ "74:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 75f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 76f\n"
+ "75:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "76:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v30.8h, v30.8h, v26.8h\n"
+ "fmax v31.8h, v31.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v27.8h\n"
+ "tbz %x[n_channels], #2, 78f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.d }[0], [x21], x7\n"
+ "st1 { v30.d }[0], [x20], x7\n"
+ "add x17, x17, #0x8\n"
+ "add x28, x28, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 77f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[2], [x21], x7\n"
+ "st1 { v30.s }[2], [x20], x7\n"
+ "add x17, x17, #0x4\n"
+ "add x28, x28, #0x4\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[6], [x21], x7\n"
+ "st1 { v30.h }[6], [x20], x7\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 80f\n"
+ "77:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 80f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[4], [x21], x7\n"
+ "st1 { v30.h }[4], [x20], x7\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 80f\n"
+ "78:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 79f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[0], [x21], x7\n"
+ "st1 { v30.s }[0], [x20], x7\n"
+ "add x17, x17, #0x4\n"
+ "add x28, x28, #0x4\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[2], [x21], x7\n"
+ "st1 { v30.h }[2], [x20], x7\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 80f\n"
+ "79:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.h }[0], [x21], x7\n"
+ "st1 { v30.h }[0], [x20], x7\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "80:" // Tile loop: Oddments: Store: Bit 2: End
+ "81:" // Tile loop: End
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x27, x27, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x27, x27, XZR, LT\n"
+ "cmp x23, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..6ae0b30afd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,897 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args  // Marshals kernel arguments into one struct so the asm body reads everything through a single base pointer (params_struct).
+ {
+ __fp16 *const *outptrs;  // Array of output pointers; asm loads four of them (x12, x11, x10, x9).
+ const void *params;  // Parameter blob; asm loads 10 vectors from it (q31 then q0-q8), i.e. bias followed by the nine 3x3 weights.
+ const __fp16 min, max;  // Activation clamp bounds, broadcast into v26/v27 and applied via fmax/fmin.
+ const __fp16 *inptrs[25];  // Input pointers, permuted from the caller's row-major (row, col) order into the order the asm consumes them.
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];  // Centre input (2, 2) first -- it feeds all four accumulators, so the asm loads it up front.
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];  // Remaining entries reorder the 5x5 patch (index = row*5 + col, per the asm's "(r, c)" labels) to match the kernel's load schedule.
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x25, #0x10\n" // cntb _, ALL, #1
+ "lsr x24, %x[n_channels], #0x3\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "mov x28, #0x0\n"
+ "sub x22, XZR, x25\n"
+ "cbz x24, 3f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "cmp x25, x24, LSL #4\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "add x23, x23, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x28]\n"
+ "ldr q14, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "ldr q15, [x21, x28]\n"
+ "ldr q16, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v31.16b\n fmla v24.8h, v8.8h, v9.8h\n"
+ "mov v23.16b, v31.16b\n fmla v23.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v24.8h, v0.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q19, [x21, x28]\n"
+ "fmla v23.8h, v2.8h, v13.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v24.8h, v3.8h, v14.8h\n"
+ "fmla v23.8h, v0.8h, v16.8h\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.8h, v4.8h, v15.8h\n"
+ "fmla v23.8h, v4.8h, v19.8h\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q22, [x20, x28]\n"
+ "fmla v24.8h, v2.8h, v16.8h\n"
+ "fmla v23.8h, v5.8h, v20.8h\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q21, [x20, x28]\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v2.8h, v9.8h\n"
+ "mov v19.16b, v31.16b\n fmla v19.8h, v0.8h, v9.8h\n"
+ "ldr q31, [x23, #0x0]\n"
+ "fmla v24.8h, v5.8h, v18.8h\n"
+ "fmla v23.8h, v3.8h, v18.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v20.8h, v3.8h, v17.8h\n"
+ "fmla v19.8h, v4.8h, v16.8h\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v20.8h, v0.8h, v22.8h\n"
+ "ldr q0, [x23, #0x10]\n"
+ "fmla v19.8h, v1.8h, v21.8h\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v20.8h, v4.8h, v18.8h\n"
+ "fmla v19.8h, v5.8h, v16.8h\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v24.8h, v6.8h, v22.8h\n"
+ "fmla v20.8h, v1.8h, v17.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q1, [x23, #0x20]\n"
+ "fmla v19.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v7.8h, v17.8h\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v23.8h, v7.8h, v21.8h\n"
+ "fmla v23.8h, v8.8h, v16.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.8h, v6.8h, v16.8h\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v19.8h, v3.8h, v17.8h\n"
+ "fmax v23.8h, v23.8h, v26.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "fmla v20.8h, v5.8h, v17.8h\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ "fmin v24.8h, v24.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v19.8h, v7.8h, v16.8h\n"
+ "fmin v23.8h, v23.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "fmla v19.8h, v6.8h, v16.8h\n"
+ "fmla v20.8h, v8.8h, v16.8h\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmax v20.8h, v20.8h, v26.8h\n"
+ "fmin v20.8h, v20.8h, v27.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v19.8h, v8.8h, v16.8h\n"
+ "ldr q8, [x23, #0x90]\n"
+ "fmax v19.8h, v19.8h, v26.8h\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x25]\n"
+ "fmin v19.8h, v19.8h, v27.8h\n"
+ "add x28, x28, #0x10\n"
+ "ldr q10, [x20, x25]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "str q24, [x12, x22]\n"
+ "add x23, x23, #0xa0\n"
+ "ldr q11, [x21, x25]\n"
+ "ldr q12, [x20, x25]\n"
+ "str q23, [x11, x22]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x25]\n"
+ "str q20, [x10, x22]\n"
+ "ldr q14, [x20, x25]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "str q19, [x9, x22]\n"
+ "ldr q15, [x21, x25]\n"
+ "ldr q16, [x20, x25]\n"
+ "add x25, x25, #0x10\n"
+ "cmp x25, x24, LSL #4\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v25.16b, v31.16b\n fmla v25.8h, v8.8h, v9.8h\n"
+ "mov v24.16b, v31.16b\n fmla v24.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "fmla v24.8h, v1.8h, v12.8h\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "ldr q18, [x21, x28]\n"
+ "fmla v24.8h, v2.8h, v13.8h\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v25.8h, v3.8h, v14.8h\n"
+ "fmla v24.8h, v0.8h, v16.8h\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v25.8h, v4.8h, v15.8h\n"
+ "fmla v24.8h, v4.8h, v18.8h\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q23, [x20, x28]\n"
+ "fmla v25.8h, v2.8h, v16.8h\n"
+ "fmla v24.8h, v5.8h, v20.8h\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q22, [x20, x28]\n"
+ "mov v21.16b, v31.16b\n fmla v21.8h, v2.8h, v9.8h\n"
+ "mov v20.16b, v31.16b\n fmla v20.8h, v0.8h, v9.8h\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v25.8h, v5.8h, v19.8h\n"
+ "fmla v24.8h, v3.8h, v19.8h\n"
+ "ldr q16, [x21, x28]\n"
+ "fmla v21.8h, v3.8h, v17.8h\n"
+ "fmla v20.8h, v4.8h, v16.8h\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v0.8h, v23.8h\n"
+ "fmla v20.8h, v1.8h, v22.8h\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v21.8h, v4.8h, v18.8h\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v20.8h, v5.8h, v16.8h\n"
+ "fmla v25.8h, v6.8h, v23.8h\n"
+ "ldr x20, [x13, #0x90]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v1.8h, v17.8h\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.8h, v2.8h, v19.8h\n"
+ "fmla v25.8h, v7.8h, v17.8h\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v21.8h, v6.8h, v16.8h\n"
+ "fmla v20.8h, v3.8h, v18.8h\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v7.8h, v17.8h\n"
+ "fmla v20.8h, v7.8h, v16.8h\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.8h, v7.8h, v22.8h\n"
+ "fmla v21.8h, v5.8h, v18.8h\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v20.8h, v6.8h, v17.8h\n"
+ "fmla v24.8h, v8.8h, v19.8h\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.8h, v8.8h, v17.8h\n"
+ "fmla v20.8h, v8.8h, v16.8h\n"
+ "fmax v25.8h, v25.8h, v26.8h\n"
+ "add x22, x22, #0x10\n"
+ "fmax v24.8h, v24.8h, v26.8h\n"
+ "fmax v21.8h, v21.8h, v26.8h\n"
+ "add x28, x28, #0x10\n"
+ "fmax v20.8h, v20.8h, v26.8h\n"
+ "fmin v25.8h, v25.8h, v27.8h\n"
+ "str q25, [x12, x22]\n"
+ "fmin v24.8h, v24.8h, v27.8h\n"
+ "fmin v21.8h, v21.8h, v27.8h\n"
+ "str q24, [x11, x22]\n"
+ "fmin v20.8h, v20.8h, v27.8h\n"
+ "str q21, [x10, x22]\n"
+ "str q20, [x9, x22]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 80f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "ldr x27, [x13, #0x0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ "add x27, x27, x28\n"
+ "add x26, x26, x28\n"
+ "ldr x25, [x13, #0x10]\n"
+ "ldr x24, [x13, #0x18]\n"
+ "add x25, x25, x28\n"
+ "add x24, x24, x28\n"
+ "ldr x23, [x13, #0x20]\n"
+ "ldr x22, [x13, #0x28]\n"
+ "add x23, x23, x28\n"
+ "add x22, x22, x28\n"
+ "ldr x21, [x13, #0x30]\n"
+ "ldr x20, [x13, #0x38]\n"
+ "add x21, x21, x28\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v9.d }[0], [x27], #0x8\n"
+ "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x24], #0x8\n"
+ "ld1 { v13.d }[0], [x23], #0x8\n"
+ "ld1 { v14.d }[0], [x22], #0x8\n"
+ "ld1 { v15.d }[0], [x21], #0x8\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[6], [x27], #0x2\n"
+ "ld1 { v10.h }[6], [x26], #0x2\n"
+ "ld1 { v11.h }[6], [x25], #0x2\n"
+ "ld1 { v12.h }[6], [x24], #0x2\n"
+ "ld1 { v13.h }[6], [x23], #0x2\n"
+ "ld1 { v14.h }[6], [x22], #0x2\n"
+ "ld1 { v15.h }[6], [x21], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[4], [x27], #0x2\n"
+ "ld1 { v10.h }[4], [x26], #0x2\n"
+ "ld1 { v11.h }[4], [x25], #0x2\n"
+ "ld1 { v12.h }[4], [x24], #0x2\n"
+ "ld1 { v13.h }[4], [x23], #0x2\n"
+ "ld1 { v14.h }[4], [x22], #0x2\n"
+ "ld1 { v15.h }[4], [x21], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x27], #0x4\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "ld1 { v14.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x27], #0x2\n"
+ "ld1 { v10.h }[2], [x26], #0x2\n"
+ "ld1 { v11.h }[2], [x25], #0x2\n"
+ "ld1 { v12.h }[2], [x24], #0x2\n"
+ "ld1 { v13.h }[2], [x23], #0x2\n"
+ "ld1 { v14.h }[2], [x22], #0x2\n"
+ "ld1 { v15.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x27], #0x2\n"
+ "ld1 { v10.h }[0], [x26], #0x2\n"
+ "ld1 { v11.h }[0], [x25], #0x2\n"
+ "ld1 { v12.h }[0], [x24], #0x2\n"
+ "ld1 { v13.h }[0], [x23], #0x2\n"
+ "ld1 { v14.h }[0], [x22], #0x2\n"
+ "ld1 { v15.h }[0], [x21], #0x2\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 2: End
+ "mov v28.16b, v31.16b\n fmla v28.8h, v8.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v10.8h\n"
+ "ldr x20, [x13, #0x40]\n"
+ "add x20, x20, x28\n"
+ "mov v29.16b, v31.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v14.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v15.8h\n"
+ "mov v30.16b, v31.16b\n fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v16.8h\n"
+ "fmla v29.8h, v0.8h, v16.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (1, 4): Bit 2: End
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v29.8h, v5.8h, v12.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (1, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (1, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (1, 2): Bit 2: End
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v28.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (3, 0): Bit 2: End
+ "ldr x20, [x13, #0x60]\n"
+ "fmla v30.8h, v3.8h, v14.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (2, 0): Bit 2: End
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "fmla v30.8h, v0.8h, v15.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr x20, [x13, #0x70]\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (2, 1): Bit 2: End
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v28.8h, v7.8h, v16.8h\n"
+ "fmla v30.8h, v1.8h, v16.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr x20, [x13, #0x80]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (2, 3): Bit 2: End
+ "ldr x20, [x13, #0x88]\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (3, 4): Bit 2: End
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v31.8h, v5.8h, v14.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v15.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v15.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (4, 0): Bit 2: End
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v30.8h, v6.8h, v15.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Load input (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "55:" // Oddments: Load input (2, 4): Bit 2: End
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 57f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 59f\n"
+ "56:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 59f\n"
+ "57:" // Oddments: Load input (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "59:" // Oddments: Load input (4, 1): Bit 2: End
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v30.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 61f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "b 63f\n"
+ "60:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "b 63f\n"
+ "61:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "63:" // Oddments: Load input (3, 2): Bit 2: End
+ "ldr x20, [x13, #0xb0]\n"
+ "fmla v30.8h, v5.8h, v16.8h\n"
+ "fmla v31.8h, v3.8h, v16.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 65f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 67f\n"
+ "64:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 67f\n"
+ "65:" // Oddments: Load input (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "67:" // Oddments: Load input (4, 3): Bit 2: End
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v31.8h, v7.8h, v14.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 69f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v15.h }[6], [x20], #0x2\n"
+ "b 71f\n"
+ "68:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v15.h }[4], [x20], #0x2\n"
+ "b 71f\n"
+ "69:" // Oddments: Load input (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 70f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "b 71f\n"
+ "70:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "71:" // Oddments: Load input (4, 2): Bit 2: End
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v30.8h, v8.8h, v15.8h\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #2, 73f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 72f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 75f\n"
+ "72:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 75f\n"
+ "73:" // Oddments: Load input (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 74f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 75f\n"
+ "74:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "75:" // Oddments: Load input (4, 4): Bit 2: End
+ "fmla v31.8h, v8.8h, v11.8h\n"
+ "fmax v28.8h, v28.8h, v26.8h\n"
+ "fmax v29.8h, v29.8h, v26.8h\n"
+ "fmax v30.8h, v30.8h, v26.8h\n"
+ "fmax v31.8h, v31.8h, v26.8h\n"
+ "fmin v28.8h, v28.8h, v27.8h\n"
+ "fmin v29.8h, v29.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v27.8h\n"
+ "tbz %x[n_channels], #2, 77f\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #1, 76f\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "st1 { v28.h }[6], [x12], #0x2\n"
+ "st1 { v29.h }[6], [x11], #0x2\n"
+ "st1 { v30.h }[6], [x10], #0x2\n"
+ "st1 { v31.h }[6], [x9], #0x2\n"
+ "b 79f\n"
+ "76:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 79f\n"
+ "st1 { v28.h }[4], [x12], #0x2\n"
+ "st1 { v29.h }[4], [x11], #0x2\n"
+ "st1 { v30.h }[4], [x10], #0x2\n"
+ "st1 { v31.h }[4], [x9], #0x2\n"
+ "b 79f\n"
+ "77:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 78f\n"
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "st1 { v28.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x11], #0x2\n"
+ "st1 { v30.h }[2], [x10], #0x2\n"
+ "st1 { v31.h }[2], [x9], #0x2\n"
+ "b 79f\n"
+ "78:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "st1 { v28.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x11], #0x2\n"
+ "st1 { v30.h }[0], [x10], #0x2\n"
+ "st1 { v31.h }[0], [x9], #0x2\n"
+ "79:" // Oddments: Store: Bit 2: End
+ "80:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1d1d491c28
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor for the AArch64 FP16 NHWC 5x5, stride-1, 2x2-output
+// MLA depthfirst depthwise kernel. It only publishes the tile geometry and
+// the two generated kernel entry points declared above; all computation
+// lives in the *_impl functions.
+class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Generated kernel entry points (declared earlier in this header).
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  // Plain NEON kernel: no scalable-vector (SVE/SME) length dependence.
+  constexpr static auto vl_type = arm_gemm::VLType::None;
+
+  // 5x5 filter window.
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  // Unit stride in both spatial dimensions.
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Each invocation produces a 2x2 spatial output tile.
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  // CPUInfo is unused: this strategy has no CPU-specific tuning.
+  a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..cecaf79704
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1387 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x23, #0x2\n"
+ "mov x25, #0x2\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x26, x2, x22\n" // offset += tile_j * ld_input_col
+ "ldr x3, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x2, x2, #0x1\n"
+ "mul x20, x27, x21\n" // offset = tile_i * ld_output_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x6, x2, x2\n"
+ "mul x22, x22, x23\n" // offset *= kernel_stride * output_size
+ "add x4, x4, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x7, x4, x24, LSL #1\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x26, x3, x20\n" // offset += tile_j * ld_output_col
+ "add x17, x7, x24, LSL #1\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "lsr x22, %x[n_channels], #0x3\n"
+ "add x16, x17, x24, LSL #1\n"
+ "add x15, x6, x2\n"
+ "add x14, x16, x24, LSL #1\n"
+ "add x13, x15, x2\n"
+ "add x5, x5, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x12, x14, x24, LSL #1\n"
+ "add x11, x13, x2\n"
+ "add x10, x5, x21, LSL #1\n"
+ "lsl x3, x3, #0x1\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q25, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x8, x8, #0x60\n"
+ "ld1 { v5.8h }, [x4]\n"
+ "ldr q6, [x4, x2]\n"
+ "ld1 { v7.8h }, [x7]\n"
+ "ldr q8, [x7, x2]\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q13, [x7, x6]\n"
+ "ldr q11, [x4, x15]\n"
+ "ldr q12, [x4, x13]\n"
+ "ldr q10, [x7, x11]\n"
+ "ld1 { v14.8h }, [x17]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v5.8h\n"
+ "ldr q23, [x7, x15]\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v6.8h\n"
+ "add x23, x23, #0x10\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "mov v28.16b, v25.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x140]\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "add x7, x7, #0x10\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q1, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q18, [x4, x11]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "add x4, x4, #0x10\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v23.8h\n"
+ "ldr q17, [x8, #0x20]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "add x21, x21, #0x10\n"
+ "fmla v29.8h, v3.8h, v23.8h\n"
+ "fmla v28.8h, v3.8h, v21.8h\n"
+ "ldr q16, [x8, #0x30]\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v31.8h, v4.8h, v18.8h\n"
+ "ldr q0, [x17, x15]\n"
+ "fmla v29.8h, v4.8h, v21.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q20, [x8, #0x40]\n"
+ "fmla v30.8h, v19.8h, v7.8h\n"
+ "ld1 { v7.8h }, [x7]\n"
+ "fmla v31.8h, v19.8h, v8.8h\n"
+ "fmla v29.8h, v19.8h, v14.8h\n"
+ "fmla v28.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "ldr q26, [x17, x11]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v2.8h\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v30.8h, v17.8h, v13.8h\n"
+ "ldr q1, [x17, x13]\n"
+ "fmla v31.8h, v17.8h, v23.8h\n"
+ "add x17, x17, #0x10\n"
+ "fmla v29.8h, v17.8h, v2.8h\n"
+ "fmla v28.8h, v17.8h, v0.8h\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v30.8h, v16.8h, v23.8h\n"
+ "ld1 { v24.8h }, [x16]\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "fmla v29.8h, v16.8h, v0.8h\n"
+ "fmla v28.8h, v16.8h, v1.8h\n"
+ "ldr q16, [x8, #0x80]\n"
+ "fmla v30.8h, v20.8h, v21.8h\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v31.8h, v20.8h, v10.8h\n"
+ "ldr q22, [x16, x6]\n"
+ "fmla v29.8h, v20.8h, v1.8h\n"
+ "fmla v28.8h, v20.8h, v26.8h\n"
+ "ldr q21, [x8, #0x90]\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "ldr q5, [x16, x11]\n"
+ "fmla v31.8h, v19.8h, v6.8h\n"
+ "fmla v29.8h, v19.8h, v24.8h\n"
+ "fmla v28.8h, v19.8h, v23.8h\n"
+ "ldr q11, [x8, #0xa0]\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v31.8h, v18.8h, v2.8h\n"
+ "fmla v29.8h, v18.8h, v23.8h\n"
+ "fmla v28.8h, v18.8h, v22.8h\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.8h, v17.8h, v22.8h\n"
+ "fmla v28.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v30.8h, v16.8h, v0.8h\n"
+ "ld1 { v0.8h }, [x14]\n"
+ "fmla v31.8h, v16.8h, v1.8h\n"
+ "fmla v29.8h, v16.8h, v20.8h\n"
+ "fmla v28.8h, v16.8h, v19.8h\n"
+ "ldr q16, [x8, #0xd0]\n"
+ "fmla v30.8h, v21.8h, v1.8h\n"
+ "ldr q4, [x14, x2]\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "ldr q12, [x14, x13]\n"
+ "fmla v29.8h, v21.8h, v19.8h\n"
+ "fmla v28.8h, v21.8h, v5.8h\n"
+ "ldr q13, [x8, #0xe0]\n"
+ "fmla v30.8h, v11.8h, v24.8h\n"
+ "ldr q6, [x14, x6]\n"
+ "fmla v31.8h, v11.8h, v23.8h\n"
+ "fmla v29.8h, v11.8h, v0.8h\n"
+ "fmla v28.8h, v11.8h, v4.8h\n"
+ "ldr q24, [x8, #0xf0]\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "ldr q26, [x14, x15]\n"
+ "fmla v31.8h, v18.8h, v22.8h\n"
+ "fmla v29.8h, v18.8h, v4.8h\n"
+ "fmla v28.8h, v18.8h, v6.8h\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v30.8h, v17.8h, v22.8h\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v31.8h, v17.8h, v20.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmla v29.8h, v17.8h, v6.8h\n"
+ "fmla v28.8h, v17.8h, v26.8h\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v30.8h, v16.8h, v20.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v31.8h, v16.8h, v19.8h\n"
+ "fmla v29.8h, v16.8h, v26.8h\n"
+ "fmla v28.8h, v16.8h, v12.8h\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v30.8h, v13.8h, v19.8h\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v31.8h, v13.8h, v5.8h\n"
+ "ld1 { v14.8h }, [x17]\n"
+ "fmla v29.8h, v13.8h, v12.8h\n"
+ "fmla v28.8h, v13.8h, v22.8h\n"
+ "ldr q19, [x8, #0x130]\n"
+ "fmla v30.8h, v24.8h, v0.8h\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v31.8h, v24.8h, v4.8h\n"
+ "fmla v29.8h, v24.8h, v18.8h\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v28.8h, v24.8h, v17.8h\n"
+ "ldr q0, [x8, #0x150]\n"
+ "fmla v30.8h, v23.8h, v4.8h\n"
+ "ldr q13, [x7, x6]\n"
+ "fmla v31.8h, v23.8h, v6.8h\n"
+ "fmla v29.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v28.8h, v23.8h, v16.8h\n"
+ "ldr q1, [x8, #0x160]\n"
+ "fmla v30.8h, v21.8h, v6.8h\n"
+ "ld1 { v5.8h }, [x4]\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v29.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v28.8h, v21.8h, v18.8h\n"
+ "ldr q2, [x8, #0x170]\n"
+ "fmla v30.8h, v20.8h, v26.8h\n"
+ "ldr q6, [x4, x2]\n"
+ "fmla v31.8h, v20.8h, v12.8h\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.8h, v20.8h, v18.8h\n"
+ "ldr q11, [x4, x15]\n"
+ "fmla v28.8h, v20.8h, v17.8h\n"
+ "ldr q3, [x8, #0x180]\n"
+ "fmla v30.8h, v19.8h, v12.8h\n"
+ "ldr q8, [x7, x2]\n"
+ "fmla v31.8h, v19.8h, v22.8h\n"
+ "ldr q10, [x7, x11]\n"
+ "fmla v29.8h, v19.8h, v17.8h\n"
+ "ldr q12, [x4, x13]\n"
+ "fmla v28.8h, v19.8h, v16.8h\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q4, [x8, #0x190]\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "add x8, x8, #0x1a0\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "st1 { v30.8h }, [x5]\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "str q31, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v29.8h }, [x10]\n"
+ "str q28, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q22, [x7, x15]\n"
+ "mov v5.16b, v25.16b\n fmla v5.8h, v0.8h, v6.8h\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v5.8h, v1.8h, v9.8h\n"
+ "add x7, x7, #0x10\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "ldr q18, [x8, #0x10]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q16, [x4, x11]\n"
+ "fmla v5.8h, v2.8h, v11.8h\n"
+ "add x4, x4, #0x10\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v22.8h\n"
+ "ldr q17, [x8, #0x20]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v5.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v21.8h\n"
+ "ldr q20, [x8, #0x30]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v5.8h, v4.8h, v16.8h\n"
+ "ldr q28, [x17, x15]\n"
+ "fmla v30.8h, v4.8h, v21.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "ldr q16, [x8, #0x40]\n"
+ "fmla v31.8h, v19.8h, v7.8h\n"
+ "fmla v5.8h, v19.8h, v8.8h\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "fmla v29.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v31.8h, v18.8h, v8.8h\n"
+ "ldr q1, [x17, x11]\n"
+ "fmla v5.8h, v18.8h, v13.8h\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "fmla v29.8h, v18.8h, v2.8h\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v31.8h, v17.8h, v13.8h\n"
+ "ldr q26, [x17, x13]\n"
+ "fmla v5.8h, v17.8h, v22.8h\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "fmla v29.8h, v17.8h, v28.8h\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ld1 { v25.8h }, [x16]\n"
+ "fmla v5.8h, v20.8h, v21.8h\n"
+ "fmla v30.8h, v20.8h, v28.8h\n"
+ "fmla v29.8h, v20.8h, v26.8h\n"
+ "ldr q24, [x8, #0x80]\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v5.8h, v16.8h, v10.8h\n"
+ "ldr q0, [x16, x6]\n"
+ "fmla v30.8h, v16.8h, v26.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q22, [x8, #0x90]\n"
+ "fmla v31.8h, v19.8h, v14.8h\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v5.8h, v19.8h, v6.8h\n"
+ "fmla v30.8h, v19.8h, v25.8h\n"
+ "fmla v29.8h, v19.8h, v23.8h\n"
+ "ldr q21, [x8, #0xa0]\n"
+ "fmla v31.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v5.8h, v18.8h, v2.8h\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "fmla v29.8h, v18.8h, v0.8h\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v31.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v5.8h, v17.8h, v28.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.8h, v17.8h, v0.8h\n"
+ "fmla v29.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v31.8h, v24.8h, v28.8h\n"
+ "ld1 { v7.8h }, [x14]\n"
+ "fmla v5.8h, v24.8h, v26.8h\n"
+ "fmla v30.8h, v24.8h, v20.8h\n"
+ "fmla v29.8h, v24.8h, v19.8h\n"
+ "ldr q2, [x8, #0xd0]\n"
+ "fmla v31.8h, v22.8h, v26.8h\n"
+ "ldr q28, [x14, x2]\n"
+ "fmla v5.8h, v22.8h, v1.8h\n"
+ "ldr q13, [x14, x13]\n"
+ "fmla v30.8h, v22.8h, v19.8h\n"
+ "fmla v29.8h, v22.8h, v16.8h\n"
+ "ldr q14, [x8, #0xe0]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "ldr q26, [x14, x6]\n"
+ "fmla v5.8h, v21.8h, v23.8h\n"
+ "fmla v30.8h, v21.8h, v7.8h\n"
+ "fmla v29.8h, v21.8h, v28.8h\n"
+ "ldr q25, [x8, #0xf0]\n"
+ "fmla v31.8h, v18.8h, v23.8h\n"
+ "ldr q24, [x14, x15]\n"
+ "fmla v5.8h, v18.8h, v0.8h\n"
+ "fmla v30.8h, v18.8h, v28.8h\n"
+ "fmla v29.8h, v18.8h, v26.8h\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v5.8h, v17.8h, v20.8h\n"
+ "add x14, x14, #0x10\n"
+ "fmla v30.8h, v17.8h, v26.8h\n"
+ "fmla v29.8h, v17.8h, v24.8h\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v31.8h, v2.8h, v20.8h\n"
+ "ld1 { v18.8h }, [x12]\n"
+ "fmla v5.8h, v2.8h, v19.8h\n"
+ "fmla v30.8h, v2.8h, v24.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v31.8h, v14.8h, v19.8h\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v5.8h, v14.8h, v16.8h\n"
+ "fmla v30.8h, v14.8h, v13.8h\n"
+ "fmla v29.8h, v14.8h, v22.8h\n"
+ "ldr q19, [x8, #0x130]\n"
+ "add x8, x8, #0x140\n"
+ "fmla v31.8h, v25.8h, v7.8h\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v5.8h, v25.8h, v28.8h\n"
+ "fmla v30.8h, v25.8h, v18.8h\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v29.8h, v25.8h, v17.8h\n"
+ "fmla v31.8h, v23.8h, v28.8h\n"
+ "fmla v5.8h, v23.8h, v26.8h\n"
+ "fmla v30.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v5.8h, v21.8h, v24.8h\n"
+ "fmla v30.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.8h, v20.8h, v24.8h\n"
+ "fmla v5.8h, v20.8h, v13.8h\n"
+ "fmla v30.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "fmla v31.8h, v19.8h, v13.8h\n"
+ "fmla v5.8h, v19.8h, v22.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmla v30.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "fmax v5.8h, v5.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v5.8h, v5.8h, v15.8h\n"
+ "st1 { v31.8h }, [x5]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q5, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v30.8h }, [x10]\n"
+ "str q29, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 117f\n"
+ "ldr q25, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "add x9, x4, XZR\n"
+ "add x28, x4, x2\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "add x27, x7, XZR\n"
+ "add x26, x7, x2\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x25, x4, x6\n"
+ "add x24, x7, x6\n"
+ "add x23, x4, x15\n"
+ "add x22, x4, x13\n"
+ "add x21, x7, x11\n"
+ "add x20, x17, XZR\n"
+ "add x8, x8, #0x60\n"
+ "tbz %x[n_channels], #2, 6f\n"
+ "ldr d5, [x9], #0x8\n"
+ "ldr d6, [x28], #0x8\n"
+ "ldr d7, [x27], #0x8\n"
+ "ldr d8, [x26], #0x8\n"
+ "ldr d9, [x25], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d11, [x23], #0x8\n"
+ "ldr d12, [x22], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v5.s }[2], [x9], #0x4\n"
+ "ld1 { v6.s }[2], [x28], #0x4\n"
+ "ld1 { v7.s }[2], [x27], #0x4\n"
+ "ld1 { v8.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.h }[6], [x9]\n"
+ "ld1 { v6.h }[6], [x28]\n"
+ "ld1 { v7.h }[6], [x27]\n"
+ "ld1 { v8.h }[6], [x26]\n"
+ "ld1 { v9.h }[6], [x25]\n"
+ "ld1 { v13.h }[6], [x24]\n"
+ "ld1 { v11.h }[6], [x23]\n"
+ "ld1 { v12.h }[6], [x22]\n"
+ "ld1 { v10.h }[6], [x21]\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 8f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.h }[4], [x9]\n"
+ "ld1 { v6.h }[4], [x28]\n"
+ "ld1 { v7.h }[4], [x27]\n"
+ "ld1 { v8.h }[4], [x26]\n"
+ "ld1 { v9.h }[4], [x25]\n"
+ "ld1 { v13.h }[4], [x24]\n"
+ "ld1 { v11.h }[4], [x23]\n"
+ "ld1 { v12.h }[4], [x22]\n"
+ "ld1 { v10.h }[4], [x21]\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 8f\n"
+ "6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s5, [x9], #0x4\n"
+ "ldr s6, [x28], #0x4\n"
+ "ldr s7, [x27], #0x4\n"
+ "ldr s8, [x26], #0x4\n"
+ "ldr s9, [x25], #0x4\n"
+ "ldr s13, [x24], #0x4\n"
+ "ldr s11, [x23], #0x4\n"
+ "ldr s12, [x22], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.h }[2], [x9]\n"
+ "ld1 { v6.h }[2], [x28]\n"
+ "ld1 { v7.h }[2], [x27]\n"
+ "ld1 { v8.h }[2], [x26]\n"
+ "ld1 { v9.h }[2], [x25]\n"
+ "ld1 { v13.h }[2], [x24]\n"
+ "ld1 { v11.h }[2], [x23]\n"
+ "ld1 { v12.h }[2], [x22]\n"
+ "ld1 { v10.h }[2], [x21]\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h5, [x9, #0x0]\n"
+ "ldr h6, [x28, #0x0]\n"
+ "ldr h7, [x27, #0x0]\n"
+ "ldr h8, [x26, #0x0]\n"
+ "ldr h9, [x25, #0x0]\n"
+ "ldr h13, [x24, #0x0]\n"
+ "ldr h11, [x23, #0x0]\n"
+ "ldr h12, [x22, #0x0]\n"
+ "ldr h10, [x21, #0x0]\n"
+ "ldr h14, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
+ "mov v28.16b, v25.16b\n fmla v28.8h, v0.8h, v5.8h\n"
+ "mov v29.16b, v25.16b\n fmla v29.8h, v0.8h, v6.8h\n"
+ "add x20, x7, x15\n"
+ "mov v30.16b, v25.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v31.16b, v25.16b\n fmla v31.8h, v0.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v6.8h\n"
+ "fmla v29.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 10f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v5.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v5.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s5, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v5.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h5, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 2: End
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x7, x13\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v5.8h\n"
+ "tbz %x[n_channels], #2, 14f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v6.h }[6], [x20]\n"
+ "b 16f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v6.h }[4], [x20]\n"
+ "b 16f\n"
+ "14:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v6.h }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h6, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 2: End
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "add x20, x4, x11\n"
+ "tbz %x[n_channels], #2, 18f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 20f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 20f\n"
+ "18:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v29.8h, v4.8h, v9.8h\n"
+ "fmla v30.8h, v4.8h, v6.8h\n"
+ "add x20, x17, x2\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v7.8h\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.8h, v0.8h, v8.8h\n"
+ "fmla v30.8h, v0.8h, v14.8h\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 24f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 24f\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 2: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.8h, v0.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v8.8h\n"
+ "add x20, x17, x6\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 26f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 28f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 28f\n"
+ "26:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 2: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "add x20, x17, x15\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 30f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 32f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 32f\n"
+ "30:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 2: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v5.8h\n"
+ "add x20, x17, x13\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v3.8h, v9.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 34f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 36f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 36f\n"
+ "34:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 2: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v6.8h\n"
+ "add x20, x17, x11\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 38f\n"
+ "ldr d8, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v8.h }[6], [x20]\n"
+ "b 40f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v8.h }[4], [x20]\n"
+ "b 40f\n"
+ "38:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s8, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v8.h }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h8, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v0.8h, v14.8h\n"
+ "add x20, x16, XZR\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 42f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v5.h }[6], [x20]\n"
+ "b 44f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v5.h }[4], [x20]\n"
+ "b 44f\n"
+ "42:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s5, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v5.h }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h5, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 2: End
+ "fmla v30.8h, v0.8h, v5.8h\n"
+ "add x20, x16, x2\n"
+ "tbz %x[n_channels], #2, 46f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v6.h }[6], [x20]\n"
+ "b 48f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v6.h }[4], [x20]\n"
+ "b 48f\n"
+ "46:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v6.h }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h6, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 2: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.8h, v0.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "add x20, x16, x6\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 50f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 52f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 52f\n"
+ "50:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 2: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "add x20, x16, x15\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v2.8h, v10.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 54f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 56f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 56f\n"
+ "54:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 2: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
+ "add x20, x16, x13\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 58f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 60f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 60f\n"
+ "58:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 2: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "add x20, x16, x11\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 62f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v14.h }[6], [x20]\n"
+ "b 64f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v14.h }[4], [x20]\n"
+ "b 64f\n"
+ "62:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s14, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v14.h }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.8h, v4.8h, v14.8h\n"
+ "fmla v28.8h, v0.8h, v5.8h\n"
+ "add x20, x14, XZR\n"
+ "fmla v29.8h, v0.8h, v6.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 66f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 68f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 68f\n"
+ "66:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 2: End
+ "fmla v30.8h, v0.8h, v9.8h\n"
+ "add x20, x14, x2\n"
+ "tbz %x[n_channels], #2, 70f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v13.h }[6], [x20]\n"
+ "b 72f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v13.h }[4], [x20]\n"
+ "b 72f\n"
+ "70:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 71f\n"
+ "ldr s13, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "ld1 { v13.h }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h13, [x20, #0x0]\n"
+ "72:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 2: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.8h, v0.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v6.8h\n"
+ "add x20, x14, x6\n"
+ "fmla v29.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 74f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 73f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v5.h }[6], [x20]\n"
+ "b 76f\n"
+ "73:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v5.h }[4], [x20]\n"
+ "b 76f\n"
+ "74:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 75f\n"
+ "ldr s5, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 76f\n"
+ "ld1 { v5.h }[2], [x20]\n"
+ "b 76f\n"
+ "75:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h5, [x20, #0x0]\n"
+ "76:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 2: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v10.8h\n"
+ "add x20, x14, x15\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 78f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 77f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v6.h }[6], [x20]\n"
+ "b 80f\n"
+ "77:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v6.h }[4], [x20]\n"
+ "b 80f\n"
+ "78:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 79f\n"
+ "ldr s6, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 80f\n"
+ "ld1 { v6.h }[2], [x20]\n"
+ "b 80f\n"
+ "79:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h6, [x20, #0x0]\n"
+ "80:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 2: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.8h, v2.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x14, x13\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 82f\n"
+ "ldr d8, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 81f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v8.h }[6], [x20]\n"
+ "b 84f\n"
+ "81:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v8.h }[4], [x20]\n"
+ "b 84f\n"
+ "82:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 83f\n"
+ "ldr s8, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 84f\n"
+ "ld1 { v8.h }[2], [x20]\n"
+ "b 84f\n"
+ "83:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h8, [x20, #0x0]\n"
+ "84:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 2: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.8h, v3.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "add x20, x14, x11\n"
+ "fmla v29.8h, v4.8h, v14.8h\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 86f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 85f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[6], [x20]\n"
+ "b 88f\n"
+ "85:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[4], [x20]\n"
+ "b 88f\n"
+ "86:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 87f\n"
+ "ldr s10, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 88f\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "b 88f\n"
+ "87:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h10, [x20, #0x0]\n"
+ "88:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 2: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v9.8h\n"
+ "add x20, x12, XZR\n"
+ "fmla v29.8h, v0.8h, v13.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 90f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 89f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 92f\n"
+ "89:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 92f\n"
+ "90:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 91f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 92f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 92f\n"
+ "91:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "92:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 2: End
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x12, x2\n"
+ "tbz %x[n_channels], #2, 94f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 93f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 96f\n"
+ "93:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 96f\n"
+ "94:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 95f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 96f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 96f\n"
+ "95:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "96:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 2: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.8h, v0.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "add x20, x12, x6\n"
+ "fmla v29.8h, v1.8h, v5.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 98f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 97f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 100f\n"
+ "97:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 100f\n"
+ "98:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 99f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 100f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 100f\n"
+ "99:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "100:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 2: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "add x20, x12, x15\n"
+ "fmla v29.8h, v2.8h, v6.8h\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 102f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 101f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[6], [x20]\n"
+ "b 104f\n"
+ "101:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[4], [x20]\n"
+ "b 104f\n"
+ "102:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 103f\n"
+ "ldr s11, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 104f\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "b 104f\n"
+ "103:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: Unset: Bit 1: Unset
+ "ldr h11, [x20, #0x0]\n"
+ "104:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 2: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "add x20, x12, x13\n"
+ "fmla v29.8h, v3.8h, v8.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #2, 106f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 105f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[6], [x20]\n"
+ "b 108f\n"
+ "105:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[4], [x20]\n"
+ "b 108f\n"
+ "106:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 107f\n"
+ "ldr s12, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 108f\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "b 108f\n"
+ "107:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: Unset: Bit 1: Unset
+ "ldr h12, [x20, #0x0]\n"
+ "108:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 2: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "add x20, x12, x11\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #2, 110f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 109f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v9.h }[6], [x20]\n"
+ "b 112f\n"
+ "109:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v9.h }[4], [x20]\n"
+ "b 112f\n"
+ "110:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 111f\n"
+ "ldr s9, [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 112f\n"
+ "ld1 { v9.h }[2], [x20]\n"
+ "b 112f\n"
+ "111:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: Unset: Bit 1: Unset
+ "ldr h9, [x20, #0x0]\n"
+ "112:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 2: End
+ "fmla v31.8h, v4.8h, v9.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "tbz %x[n_channels], #2, 114f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.d }[0], [x21], x3\n"
+ "st1 { v30.d }[0], [x20], x3\n"
+ "add x5, x5, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #1, 113f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[2], [x21], x3\n"
+ "st1 { v30.s }[2], [x20], x3\n"
+ "add x5, x5, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_channels], #0, 116f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[6], [x21], x3\n"
+ "st1 { v30.h }[6], [x20], x3\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 116f\n"
+ "113:" // Tile loop: Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 116f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[4], [x21], x3\n"
+ "st1 { v30.h }[4], [x20], x3\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 116f\n"
+ "114:" // Tile loop: Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 115f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[0], [x21], x3\n"
+ "st1 { v30.s }[0], [x20], x3\n"
+ "add x5, x5, #0x4\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 116f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[2], [x21], x3\n"
+ "st1 { v30.h }[2], [x20], x3\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 116f\n"
+ "115:" // Tile loop: Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.h }[0], [x21], x3\n"
+ "st1 { v30.h }[0], [x20], x3\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "116:" // Tile loop: Oddments: Store: Bit 2: End
+ "117:" // Tile loop: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..4913340c4c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1427 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x17, #0x10\n" // cntb _, ALL, #1
+ "lsr x9, %x[n_channels], #0x3\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.8h }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "ldp x12, x11, [x21, #0x10]\n"
+ "mov x10, #0x0\n"
+ "sub x28, XZR, x17\n"
+ "cbz x9, 3f\n"
+ "ldr q26, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x17, x9, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x16, x16, #0x60\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q5, [x21, x10]\n"
+ "ldr q6, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x10]\n"
+ "ldr q8, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q9, [x21, x10]\n"
+ "ldr q13, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr q11, [x21, x10]\n"
+ "ldr q12, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x10]\n"
+ "ldr q14, [x20, x10]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v5.8h\n"
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q24, [x20, x10]\n"
+ "mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q23, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x140]\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr q22, [x20, x10]\n"
+ "fmla v28.8h, v1.8h, v8.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "ldr q21, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "ldr q17, [x20, x10]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v24.8h\n"
+ "ldr q16, [x16, #0x20]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "ldr q5, [x20, x10]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v3.8h, v22.8h\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x21, [x15, #0x80]\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q19, [x22, x10]\n"
+ "fmla v31.8h, v4.8h, v17.8h\n"
+ "ldr q2, [x20, x10]\n"
+ "fmla v28.8h, v4.8h, v22.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "ldr q18, [x16, #0x40]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v30.8h, v23.8h, v7.8h\n"
+ "fmla v31.8h, v23.8h, v8.8h\n"
+ "ldr x23, [x15, #0x90]\n"
+ "ldr x26, [x15, #0x98]\n"
+ "fmla v28.8h, v23.8h, v14.8h\n"
+ "fmla v29.8h, v23.8h, v5.8h\n"
+ "ldr q1, [x16, #0x50]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla v30.8h, v21.8h, v8.8h\n"
+ "ldr q25, [x20, x10]\n"
+ "fmla v31.8h, v21.8h, v13.8h\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v28.8h, v21.8h, v5.8h\n"
+ "fmla v29.8h, v21.8h, v19.8h\n"
+ "ldr q17, [x16, #0x60]\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.8h, v16.8h, v13.8h\n"
+ "ldr q8, [x21, x10]\n"
+ "fmla v31.8h, v16.8h, v24.8h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v28.8h, v16.8h, v19.8h\n"
+ "fmla v29.8h, v16.8h, v2.8h\n"
+ "ldr q16, [x16, #0x70]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v30.8h, v20.8h, v24.8h\n"
+ "ldr q24, [x23, x10]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ldr x27, [x15, #0xc8]\n"
+ "fmla v28.8h, v20.8h, v2.8h\n"
+ "fmla v29.8h, v20.8h, v8.8h\n"
+ "ldr q23, [x16, #0x80]\n"
+ "ldr x23, [x15, #0xd0]\n"
+ "fmla v30.8h, v18.8h, v22.8h\n"
+ "ldr q22, [x26, x10]\n"
+ "fmla v31.8h, v18.8h, v10.8h\n"
+ "ldr q21, [x22, x10]\n"
+ "fmla v28.8h, v18.8h, v8.8h\n"
+ "fmla v29.8h, v18.8h, v25.8h\n"
+ "ldr q20, [x16, #0x90]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v30.8h, v1.8h, v14.8h\n"
+ "ldr q0, [x20, x10]\n"
+ "fmla v31.8h, v1.8h, v5.8h\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v28.8h, v1.8h, v24.8h\n"
+ "fmla v29.8h, v1.8h, v22.8h\n"
+ "ldr q6, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v30.8h, v17.8h, v5.8h\n"
+ "ldr q1, [x25, x10]\n"
+ "fmla v31.8h, v17.8h, v19.8h\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v28.8h, v17.8h, v22.8h\n"
+ "fmla v29.8h, v17.8h, v21.8h\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.8h, v16.8h, v19.8h\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v31.8h, v16.8h, v2.8h\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v28.8h, v16.8h, v21.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v30.8h, v23.8h, v2.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.8h, v23.8h, v8.8h\n"
+ "ldr x21, [x15, #0x100]\n"
+ "fmla v28.8h, v23.8h, v1.8h\n"
+ "fmla v29.8h, v23.8h, v19.8h\n"
+ "ldr q13, [x16, #0xd0]\n"
+ "fmla v30.8h, v20.8h, v8.8h\n"
+ "ldr q2, [x27, x10]\n"
+ "fmla v31.8h, v20.8h, v25.8h\n"
+ "ldr q10, [x20, x10]\n"
+ "fmla v28.8h, v20.8h, v19.8h\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "ldr q9, [x16, #0xe0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v30.8h, v6.8h, v24.8h\n"
+ "ldr q5, [x23, x10]\n"
+ "fmla v31.8h, v6.8h, v22.8h\n"
+ "ldr x23, [x15, #0x110]\n"
+ "fmla v28.8h, v6.8h, v16.8h\n"
+ "fmla v29.8h, v6.8h, v2.8h\n"
+ "ldr q24, [x16, #0xf0]\n"
+ "fmla v30.8h, v18.8h, v22.8h\n"
+ "ldr q25, [x22, x10]\n"
+ "fmla v31.8h, v18.8h, v21.8h\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v28.8h, v18.8h, v2.8h\n"
+ "fmla v29.8h, v18.8h, v5.8h\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v30.8h, v17.8h, v21.8h\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v31.8h, v17.8h, v1.8h\n"
+ "fmla v28.8h, v17.8h, v5.8h\n"
+ "fmla v29.8h, v17.8h, v25.8h\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v30.8h, v13.8h, v1.8h\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v31.8h, v13.8h, v19.8h\n"
+ "fmla v28.8h, v13.8h, v25.8h\n"
+ "fmla v29.8h, v13.8h, v10.8h\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v30.8h, v9.8h, v19.8h\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v31.8h, v9.8h, v0.8h\n"
+ "fmla v28.8h, v9.8h, v10.8h\n"
+ "fmla v29.8h, v9.8h, v22.8h\n"
+ "ldr q19, [x16, #0x130]\n"
+ "fmla v30.8h, v24.8h, v16.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.8h, v24.8h, v2.8h\n"
+ "fmla v28.8h, v24.8h, v18.8h\n"
+ "ldr q18, [x20, x10]\n"
+ "fmla v29.8h, v24.8h, v17.8h\n"
+ "ldr q0, [x16, #0x150]\n"
+ "fmla v30.8h, v23.8h, v2.8h\n"
+ "fmla v31.8h, v23.8h, v5.8h\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "fmla v28.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x23, x10]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "ldr q1, [x16, #0x160]\n"
+ "fmla v30.8h, v21.8h, v5.8h\n"
+ "ldr q5, [x21, x17]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "fmla v28.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x22, x10]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
+ "ldr q2, [x16, #0x170]\n"
+ "fmla v30.8h, v20.8h, v25.8h\n"
+ "ldr q6, [x20, x17]\n"
+ "fmla v31.8h, v20.8h, v10.8h\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x17]\n"
+ "fmla v28.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "ldr q3, [x16, #0x180]\n"
+ "fmla v30.8h, v19.8h, v10.8h\n"
+ "ldr q8, [x20, x17]\n"
+ "fmla v31.8h, v19.8h, v22.8h\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x17]\n"
+ "fmla v28.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "ldr q9, [x21, x17]\n"
+ "ldr q4, [x16, #0x190]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "ldr q11, [x21, x17]\n"
+ "ldr q12, [x20, x17]\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x17]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "ldr q14, [x20, x17]\n"
+ "add x17, x17, #0x10\n"
+ "cmp x17, x9, LSL #4\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "add x10, x10, #0x10\n"
+ "str q30, [x14, x28]\n"
+ "add x16, x16, #0x1a0\n"
+ "str q31, [x13, x28]\n"
+ "str q28, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "mov v5.16b, v26.16b\n fmla v5.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q22, [x20, x10]\n"
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v8.8h\n"
+ "ldr q19, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q21, [x20, x10]\n"
+ "fmla v5.8h, v1.8h, v9.8h\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "ldr q18, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v5.8h, v2.8h, v11.8h\n"
+ "ldr x23, [x15, #0x70]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v22.8h\n"
+ "ldr q17, [x16, #0x20]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q6, [x20, x10]\n"
+ "fmla v5.8h, v3.8h, v12.8h\n"
+ "ldr x22, [x15, #0x80]\n"
+ "fmla v30.8h, v3.8h, v22.8h\n"
+ "fmla v29.8h, v3.8h, v21.8h\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q2, [x23, x10]\n"
+ "fmla v5.8h, v4.8h, v16.8h\n"
+ "ldr q28, [x21, x10]\n"
+ "fmla v30.8h, v4.8h, v21.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "ldr q16, [x16, #0x40]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "fmla v31.8h, v19.8h, v7.8h\n"
+ "fmla v5.8h, v19.8h, v8.8h\n"
+ "ldr x27, [x15, #0x98]\n"
+ "ldr x26, [x15, #0xa0]\n"
+ "fmla v30.8h, v19.8h, v14.8h\n"
+ "fmla v29.8h, v19.8h, v6.8h\n"
+ "ldr q19, [x16, #0x50]\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v31.8h, v18.8h, v8.8h\n"
+ "ldr q1, [x20, x10]\n"
+ "fmla v5.8h, v18.8h, v13.8h\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.8h, v18.8h, v6.8h\n"
+ "fmla v29.8h, v18.8h, v2.8h\n"
+ "ldr q18, [x16, #0x60]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.8h, v17.8h, v13.8h\n"
+ "ldr q26, [x22, x10]\n"
+ "fmla v5.8h, v17.8h, v22.8h\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v30.8h, v17.8h, v2.8h\n"
+ "fmla v29.8h, v17.8h, v28.8h\n"
+ "ldr q17, [x16, #0x70]\n"
+ "ldr x22, [x15, #0xc8]\n"
+ "fmla v31.8h, v20.8h, v22.8h\n"
+ "ldr q25, [x21, x10]\n"
+ "fmla v5.8h, v20.8h, v21.8h\n"
+ "ldr x21, [x15, #0xd0]\n"
+ "fmla v30.8h, v20.8h, v28.8h\n"
+ "fmla v29.8h, v20.8h, v26.8h\n"
+ "ldr q24, [x16, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.8h, v16.8h, v21.8h\n"
+ "ldr q23, [x27, x10]\n"
+ "fmla v5.8h, v16.8h, v10.8h\n"
+ "ldr q0, [x26, x10]\n"
+ "fmla v30.8h, v16.8h, v26.8h\n"
+ "fmla v29.8h, v16.8h, v1.8h\n"
+ "ldr q22, [x16, #0x90]\n"
+ "ldr x27, [x15, #0xd8]\n"
+ "fmla v31.8h, v19.8h, v14.8h\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v5.8h, v19.8h, v6.8h\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v30.8h, v19.8h, v25.8h\n"
+ "fmla v29.8h, v19.8h, v23.8h\n"
+ "ldr q21, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v31.8h, v18.8h, v6.8h\n"
+ "ldr q20, [x25, x10]\n"
+ "fmla v5.8h, v18.8h, v2.8h\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v30.8h, v18.8h, v23.8h\n"
+ "fmla v29.8h, v18.8h, v0.8h\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "fmla v31.8h, v17.8h, v2.8h\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v5.8h, v17.8h, v28.8h\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v30.8h, v17.8h, v0.8h\n"
+ "fmla v29.8h, v17.8h, v20.8h\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v31.8h, v24.8h, v28.8h\n"
+ "ldr q7, [x23, x10]\n"
+ "fmla v5.8h, v24.8h, v26.8h\n"
+ "ldr x23, [x15, #0x100]\n"
+ "fmla v30.8h, v24.8h, v20.8h\n"
+ "fmla v29.8h, v24.8h, v19.8h\n"
+ "ldr q3, [x16, #0xd0]\n"
+ "fmla v31.8h, v22.8h, v26.8h\n"
+ "ldr q28, [x22, x10]\n"
+ "fmla v5.8h, v22.8h, v1.8h\n"
+ "ldr q13, [x20, x10]\n"
+ "fmla v30.8h, v22.8h, v19.8h\n"
+ "fmla v29.8h, v22.8h, v16.8h\n"
+ "ldr q11, [x16, #0xe0]\n"
+ "ldr x22, [x15, #0x108]\n"
+ "fmla v31.8h, v21.8h, v25.8h\n"
+ "ldr q26, [x21, x10]\n"
+ "fmla v5.8h, v21.8h, v23.8h\n"
+ "ldr x21, [x15, #0x110]\n"
+ "fmla v30.8h, v21.8h, v7.8h\n"
+ "fmla v29.8h, v21.8h, v28.8h\n"
+ "ldr q25, [x16, #0xf0]\n"
+ "fmla v31.8h, v18.8h, v23.8h\n"
+ "ldr q24, [x27, x10]\n"
+ "fmla v5.8h, v18.8h, v0.8h\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v30.8h, v18.8h, v28.8h\n"
+ "fmla v29.8h, v18.8h, v26.8h\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v31.8h, v17.8h, v0.8h\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v5.8h, v17.8h, v20.8h\n"
+ "fmla v30.8h, v17.8h, v26.8h\n"
+ "fmla v29.8h, v17.8h, v24.8h\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v31.8h, v3.8h, v20.8h\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v5.8h, v3.8h, v19.8h\n"
+ "fmla v30.8h, v3.8h, v24.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v31.8h, v11.8h, v19.8h\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v5.8h, v11.8h, v16.8h\n"
+ "fmla v30.8h, v11.8h, v13.8h\n"
+ "fmla v29.8h, v11.8h, v22.8h\n"
+ "ldr q19, [x16, #0x130]\n"
+ "add x16, x16, #0x140\n"
+ "fmla v31.8h, v25.8h, v7.8h\n"
+ "ldr q16, [x23, x10]\n"
+ "fmla v5.8h, v25.8h, v28.8h\n"
+ "fmla v30.8h, v25.8h, v18.8h\n"
+ "ldr q18, [x22, x10]\n"
+ "fmla v29.8h, v25.8h, v17.8h\n"
+ "fmla v31.8h, v23.8h, v28.8h\n"
+ "fmla v5.8h, v23.8h, v26.8h\n"
+ "fmla v30.8h, v23.8h, v17.8h\n"
+ "ldr q17, [x21, x10]\n"
+ "fmla v29.8h, v23.8h, v16.8h\n"
+ "fmla v31.8h, v21.8h, v26.8h\n"
+ "fmla v5.8h, v21.8h, v24.8h\n"
+ "fmla v30.8h, v21.8h, v16.8h\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v29.8h, v21.8h, v18.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmla v31.8h, v20.8h, v24.8h\n"
+ "fmla v5.8h, v20.8h, v13.8h\n"
+ "fmla v30.8h, v20.8h, v18.8h\n"
+ "fmla v29.8h, v20.8h, v17.8h\n"
+ "fmla v31.8h, v19.8h, v13.8h\n"
+ "fmla v5.8h, v19.8h, v22.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmla v30.8h, v19.8h, v17.8h\n"
+ "fmla v29.8h, v19.8h, v16.8h\n"
+ "fmax v5.8h, v5.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "fmin v5.8h, v5.8h, v15.8h\n"
+ "str q31, [x14, x28]\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "str q5, [x13, x28]\n"
+ "str q30, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 116f\n"
+ "ldr q26, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x20, x10\n"
+ "add x14, x14, x20\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x11, x11, x20\n"
+ "ldr x9, [x15, #0x0]\n"
+ "ldr x28, [x15, #0x8]\n"
+ "add x9, x9, x10\n"
+ "add x28, x28, x10\n"
+ "ldr x27, [x15, #0x10]\n"
+ "ldr x26, [x15, #0x18]\n"
+ "add x27, x27, x10\n"
+ "add x26, x26, x10\n"
+ "ldr x25, [x15, #0x20]\n"
+ "ldr x24, [x15, #0x28]\n"
+ "add x25, x25, x10\n"
+ "add x24, x24, x10\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "add x23, x23, x10\n"
+ "add x22, x22, x10\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "add x21, x21, x10\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x60\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v5.d }[0], [x9], #0x8\n"
+ "ld1 { v6.d }[0], [x28], #0x8\n"
+ "ld1 { v7.d }[0], [x27], #0x8\n"
+ "ld1 { v8.d }[0], [x26], #0x8\n"
+ "ld1 { v9.d }[0], [x25], #0x8\n"
+ "ld1 { v13.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v5.s }[2], [x9], #0x4\n"
+ "ld1 { v6.s }[2], [x28], #0x4\n"
+ "ld1 { v7.s }[2], [x27], #0x4\n"
+ "ld1 { v8.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.h }[6], [x9], #0x2\n"
+ "ld1 { v6.h }[6], [x28], #0x2\n"
+ "ld1 { v7.h }[6], [x27], #0x2\n"
+ "ld1 { v8.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x25], #0x2\n"
+ "ld1 { v13.h }[6], [x24], #0x2\n"
+ "ld1 { v11.h }[6], [x23], #0x2\n"
+ "ld1 { v12.h }[6], [x22], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 7f\n"
+ "4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.h }[4], [x9], #0x2\n"
+ "ld1 { v6.h }[4], [x28], #0x2\n"
+ "ld1 { v7.h }[4], [x27], #0x2\n"
+ "ld1 { v8.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x25], #0x2\n"
+ "ld1 { v13.h }[4], [x24], #0x2\n"
+ "ld1 { v11.h }[4], [x23], #0x2\n"
+ "ld1 { v12.h }[4], [x22], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 7f\n"
+ "5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v5.s }[0], [x9], #0x4\n"
+ "ld1 { v6.s }[0], [x28], #0x4\n"
+ "ld1 { v7.s }[0], [x27], #0x4\n"
+ "ld1 { v8.s }[0], [x26], #0x4\n"
+ "ld1 { v9.s }[0], [x25], #0x4\n"
+ "ld1 { v13.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.h }[2], [x9], #0x2\n"
+ "ld1 { v6.h }[2], [x28], #0x2\n"
+ "ld1 { v7.h }[2], [x27], #0x2\n"
+ "ld1 { v8.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x25], #0x2\n"
+ "ld1 { v13.h }[2], [x24], #0x2\n"
+ "ld1 { v11.h }[2], [x23], #0x2\n"
+ "ld1 { v12.h }[2], [x22], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v5.h }[0], [x9], #0x2\n"
+ "ld1 { v6.h }[0], [x28], #0x2\n"
+ "ld1 { v7.h }[0], [x27], #0x2\n"
+ "ld1 { v8.h }[0], [x26], #0x2\n"
+ "ld1 { v9.h }[0], [x25], #0x2\n"
+ "ld1 { v13.h }[0], [x24], #0x2\n"
+ "ld1 { v11.h }[0], [x23], #0x2\n"
+ "ld1 { v12.h }[0], [x22], #0x2\n"
+ "ld1 { v10.h }[0], [x21], #0x2\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "7:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 2: End
+ "mov v28.16b, v26.16b\n fmla v28.8h, v0.8h, v5.8h\n"
+ "mov v29.16b, v26.16b\n fmla v29.8h, v0.8h, v6.8h\n"
+ "ldr x20, [x15, #0x50]\n"
+ "add x20, x20, x10\n"
+ "mov v30.16b, v26.16b\n fmla v30.8h, v0.8h, v7.8h\n"
+ "mov v31.16b, v26.16b\n fmla v31.8h, v0.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v6.8h\n"
+ "fmla v29.8h, v1.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v8.8h\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "tbz %x[n_channels], #2, 9f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v5.h }[6], [x20], #0x2\n"
+ "b 11f\n"
+ "8:" // Oddments: Load input (1, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v5.h }[4], [x20], #0x2\n"
+ "b 11f\n"
+ "9:" // Oddments: Load input (1, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (1, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "11:" // Oddments: Load input (1, 3): Bit 2: End
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v5.8h\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v6.h }[6], [x20], #0x2\n"
+ "b 15f\n"
+ "12:" // Oddments: Load input (1, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v6.h }[4], [x20], #0x2\n"
+ "b 15f\n"
+ "13:" // Oddments: Load input (1, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (1, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (1, 4): Bit 2: End
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Load input (0, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Load input (0, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (0, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (0, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
+ "fmla v29.8h, v4.8h, v9.8h\n"
+ "fmla v30.8h, v4.8h, v6.8h\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v7.8h\n"
+ "add x20, x20, x10\n"
+ "fmla v29.8h, v0.8h, v8.8h\n"
+ "fmla v30.8h, v0.8h, v14.8h\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 21f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 23f\n"
+ "20:" // Oddments: Load input (2, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 23f\n"
+ "21:" // Oddments: Load input (2, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "23:" // Oddments: Load input (2, 1): Bit 2: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "fmla v31.8h, v0.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v8.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 25f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 27f\n"
+ "24:" // Oddments: Load input (2, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 27f\n"
+ "25:" // Oddments: Load input (2, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (2, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "27:" // Oddments: Load input (2, 2): Bit 2: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 29f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 31f\n"
+ "28:" // Oddments: Load input (2, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 31f\n"
+ "29:" // Oddments: Load input (2, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (2, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (2, 3): Bit 2: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v5.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v3.8h, v9.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 33f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 35f\n"
+ "32:" // Oddments: Load input (2, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 35f\n"
+ "33:" // Oddments: Load input (2, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (2, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (2, 4): Bit 2: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v6.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 37f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v8.h }[6], [x20], #0x2\n"
+ "b 39f\n"
+ "36:" // Oddments: Load input (2, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v8.h }[4], [x20], #0x2\n"
+ "b 39f\n"
+ "37:" // Oddments: Load input (2, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (2, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "39:" // Oddments: Load input (2, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x90]\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v0.8h, v14.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 41f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v5.h }[6], [x20], #0x2\n"
+ "b 43f\n"
+ "40:" // Oddments: Load input (3, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v5.h }[4], [x20], #0x2\n"
+ "b 43f\n"
+ "41:" // Oddments: Load input (3, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (3, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "43:" // Oddments: Load input (3, 0): Bit 2: End
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v30.8h, v0.8h, v5.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 45f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v6.h }[6], [x20], #0x2\n"
+ "b 47f\n"
+ "44:" // Oddments: Load input (3, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v6.h }[4], [x20], #0x2\n"
+ "b 47f\n"
+ "45:" // Oddments: Load input (3, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (3, 1): Bit 2: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla v31.8h, v0.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v1.8h, v6.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 49f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 51f\n"
+ "48:" // Oddments: Load input (3, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 51f\n"
+ "49:" // Oddments: Load input (3, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (3, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "51:" // Oddments: Load input (3, 2): Bit 2: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla v31.8h, v1.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v2.8h, v10.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 53f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 55f\n"
+ "52:" // Oddments: Load input (3, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 55f\n"
+ "53:" // Oddments: Load input (3, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (3, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "55:" // Oddments: Load input (3, 3): Bit 2: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v9.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 57f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 59f\n"
+ "56:" // Oddments: Load input (3, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 59f\n"
+ "57:" // Oddments: Load input (3, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (3, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "59:" // Oddments: Load input (3, 4): Bit 2: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 61f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v14.h }[6], [x20], #0x2\n"
+ "b 63f\n"
+ "60:" // Oddments: Load input (3, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v14.h }[4], [x20], #0x2\n"
+ "b 63f\n"
+ "61:" // Oddments: Load input (3, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (3, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "63:" // Oddments: Load input (3, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v31.8h, v4.8h, v14.8h\n"
+ "fmla v28.8h, v0.8h, v5.8h\n"
+ "fmla v29.8h, v0.8h, v6.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 65f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 67f\n"
+ "64:" // Oddments: Load input (4, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 67f\n"
+ "65:" // Oddments: Load input (4, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (4, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "67:" // Oddments: Load input (4, 0): Bit 2: End
+ "ldr x20, [x15, #0xc8]\n"
+ "fmla v30.8h, v0.8h, v9.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 69f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v13.h }[6], [x20], #0x2\n"
+ "b 71f\n"
+ "68:" // Oddments: Load input (4, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v13.h }[4], [x20], #0x2\n"
+ "b 71f\n"
+ "69:" // Oddments: Load input (4, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 70f\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ld1 { v13.h }[2], [x20], #0x2\n"
+ "b 71f\n"
+ "70:" // Oddments: Load input (4, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v13.h }[0], [x20], #0x2\n"
+ "71:" // Oddments: Load input (4, 1): Bit 2: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xd0]\n"
+ "fmla v31.8h, v0.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v6.8h\n"
+ "fmla v29.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 73f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 72f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v5.h }[6], [x20], #0x2\n"
+ "b 75f\n"
+ "72:" // Oddments: Load input (4, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v5.h }[4], [x20], #0x2\n"
+ "b 75f\n"
+ "73:" // Oddments: Load input (4, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 74f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 75f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "b 75f\n"
+ "74:" // Oddments: Load input (4, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "75:" // Oddments: Load input (4, 2): Bit 2: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "fmla v31.8h, v1.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 77f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 76f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v6.h }[6], [x20], #0x2\n"
+ "b 79f\n"
+ "76:" // Oddments: Load input (4, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v6.h }[4], [x20], #0x2\n"
+ "b 79f\n"
+ "77:" // Oddments: Load input (4, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 78f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 79f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "b 79f\n"
+ "78:" // Oddments: Load input (4, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "79:" // Oddments: Load input (4, 3): Bit 2: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v31.8h, v2.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v11.8h\n"
+ "fmla v29.8h, v3.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 81f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 80f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v8.h }[6], [x20], #0x2\n"
+ "b 83f\n"
+ "80:" // Oddments: Load input (4, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v8.h }[4], [x20], #0x2\n"
+ "b 83f\n"
+ "81:" // Oddments: Load input (4, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 82f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 83f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "b 83f\n"
+ "82:" // Oddments: Load input (4, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "83:" // Oddments: Load input (4, 4): Bit 2: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xe8]\n"
+ "fmla v31.8h, v3.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v14.8h\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 85f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 84f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[6], [x20], #0x2\n"
+ "b 87f\n"
+ "84:" // Oddments: Load input (4, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[4], [x20], #0x2\n"
+ "b 87f\n"
+ "85:" // Oddments: Load input (4, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 86f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 87f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "b 87f\n"
+ "86:" // Oddments: Load input (4, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "87:" // Oddments: Load input (4, 5): Bit 2: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xf0]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v13.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 89f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 88f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 91f\n"
+ "88:" // Oddments: Load input (5, 0): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 91f\n"
+ "89:" // Oddments: Load input (5, 0): Bit 2: Unset
+ "tbz %x[n_channels], #1, 90f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 91f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 91f\n"
+ "90:" // Oddments: Load input (5, 0): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "91:" // Oddments: Load input (5, 0): Bit 2: End
+ "ldr x20, [x15, #0xf8]\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 93f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 92f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 95f\n"
+ "92:" // Oddments: Load input (5, 1): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 95f\n"
+ "93:" // Oddments: Load input (5, 1): Bit 2: Unset
+ "tbz %x[n_channels], #1, 94f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 95f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 95f\n"
+ "94:" // Oddments: Load input (5, 1): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "95:" // Oddments: Load input (5, 1): Bit 2: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x100]\n"
+ "fmla v31.8h, v0.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v5.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 97f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 96f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 99f\n"
+ "96:" // Oddments: Load input (5, 2): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 99f\n"
+ "97:" // Oddments: Load input (5, 2): Bit 2: Unset
+ "tbz %x[n_channels], #1, 98f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 99f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 99f\n"
+ "98:" // Oddments: Load input (5, 2): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "99:" // Oddments: Load input (5, 2): Bit 2: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v31.8h, v1.8h, v9.8h\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v6.8h\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 101f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 100f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[6], [x20], #0x2\n"
+ "b 103f\n"
+ "100:" // Oddments: Load input (5, 3): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[4], [x20], #0x2\n"
+ "b 103f\n"
+ "101:" // Oddments: Load input (5, 3): Bit 2: Unset
+ "tbz %x[n_channels], #1, 102f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 103f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 103f\n"
+ "102:" // Oddments: Load input (5, 3): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "103:" // Oddments: Load input (5, 3): Bit 2: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x110]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v8.8h\n"
+ "fmla v30.8h, v3.8h, v11.8h\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #2, 105f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 104f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[6], [x20], #0x2\n"
+ "b 107f\n"
+ "104:" // Oddments: Load input (5, 4): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[4], [x20], #0x2\n"
+ "b 107f\n"
+ "105:" // Oddments: Load input (5, 4): Bit 2: Unset
+ "tbz %x[n_channels], #1, 106f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 107f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 107f\n"
+ "106:" // Oddments: Load input (5, 4): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "107:" // Oddments: Load input (5, 4): Bit 2: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v31.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "fmla v29.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #2, 109f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 108f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v9.h }[6], [x20], #0x2\n"
+ "b 111f\n"
+ "108:" // Oddments: Load input (5, 5): Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v9.h }[4], [x20], #0x2\n"
+ "b 111f\n"
+ "109:" // Oddments: Load input (5, 5): Bit 2: Unset
+ "tbz %x[n_channels], #1, 110f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 111f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "b 111f\n"
+ "110:" // Oddments: Load input (5, 5): Bit 2: Unset: Bit 1: Unset
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "111:" // Oddments: Load input (5, 5): Bit 2: End
+ "fmla v31.8h, v4.8h, v9.8h\n"
+ "fmax v28.8h, v28.8h, v27.8h\n"
+ "fmax v29.8h, v29.8h, v27.8h\n"
+ "fmax v30.8h, v30.8h, v27.8h\n"
+ "fmax v31.8h, v31.8h, v27.8h\n"
+ "fmin v28.8h, v28.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v15.8h\n"
+ "fmin v30.8h, v30.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v15.8h\n"
+ "tbz %x[n_channels], #2, 113f\n"
+ "st1 { v28.d }[0], [x14], #0x8\n"
+ "st1 { v29.d }[0], [x13], #0x8\n"
+ "st1 { v30.d }[0], [x12], #0x8\n"
+ "st1 { v31.d }[0], [x11], #0x8\n"
+ "tbz %x[n_channels], #1, 112f\n"
+ "st1 { v28.s }[2], [x14], #0x4\n"
+ "st1 { v29.s }[2], [x13], #0x4\n"
+ "st1 { v30.s }[2], [x12], #0x4\n"
+ "st1 { v31.s }[2], [x11], #0x4\n"
+ "tbz %x[n_channels], #0, 115f\n"
+ "st1 { v28.h }[6], [x14], #0x2\n"
+ "st1 { v29.h }[6], [x13], #0x2\n"
+ "st1 { v30.h }[6], [x12], #0x2\n"
+ "st1 { v31.h }[6], [x11], #0x2\n"
+ "b 115f\n"
+ "112:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 115f\n"
+ "st1 { v28.h }[4], [x14], #0x2\n"
+ "st1 { v29.h }[4], [x13], #0x2\n"
+ "st1 { v30.h }[4], [x12], #0x2\n"
+ "st1 { v31.h }[4], [x11], #0x2\n"
+ "b 115f\n"
+ "113:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 114f\n"
+ "st1 { v28.s }[0], [x14], #0x4\n"
+ "st1 { v29.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v31.s }[0], [x11], #0x4\n"
+ "tbz %x[n_channels], #0, 115f\n"
+ "st1 { v28.h }[2], [x14], #0x2\n"
+ "st1 { v29.h }[2], [x13], #0x2\n"
+ "st1 { v30.h }[2], [x12], #0x2\n"
+ "st1 { v31.h }[2], [x11], #0x2\n"
+ "b 115f\n"
+ "114:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "st1 { v28.h }[0], [x14], #0x2\n"
+ "st1 { v29.h }[0], [x13], #0x2\n"
+ "st1 { v30.h }[0], [x12], #0x2\n"
+ "st1 { v31.h }[0], [x11], #0x2\n"
+ "115:" // Oddments: Store: Bit 2: End
+ "116:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..b7608af721
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16);  // asm kernel entry point (inptrs, outptrs, params, bias, n_points, n_channels, act_min, act_max); defined in generic.cpp
+
+class a64_fp16_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<__fp16, __fp16, __fp16, __fp16>  // strategy wrapper selecting the FP16 NHWC generic 9-output MLA depthfirst kernel
+{
+  KernelType kernel = a64_fp16_nhwc_generic_output9_mla_depthfirst_impl;
+
+  public:
+  a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<__fp16, __fp16, __fp16, __fp16>(9, arm_gemm::VLType::None) {}  // 9 output points per call; VLType::None = fixed-width NEON, not scalable vectors
+
+  KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..08f40b785f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,520 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ const void *bias,
+ const unsigned int n_points,
+ const unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ const __fp16 minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v2.8h }, [%x[minmax_vals]]\n"
+ "lsr x9, %x[n_channels], #0x3\n"
+ "add x20, %x[minmax_vals], #0x2\n"
+ "ld1r { v1.8h }, [x20]\n"
+ "mov x11, #0x0\n"
+ "cbz x9, 5f\n"
+ "1:" // Channel loop
+ "movi v23.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q23, [%x[bias], x11]\n"
+ "2:" // Channel loop: Load bias: Done
+ "ldr q0, [%x[params], #0x0]\n"
+ "mov x26, %x[inptrs]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "subs x25, %x[n_points], #0x1\n"
+ "ldr q14, [x21, x11]\n"
+ "ldr q15, [x20, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q16, [x21, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr q17, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr q18, [x21, x11]\n"
+ "ldr q19, [x20, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "ldr q20, [x21, x11]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x20, x24, [x26], #0x10\n"
+ "ldp x23, x22, [x26], #0x10\n"
+ "subs x25, x25, #0x1\n"
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "ldr q14, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "ldr q15, [x24, x11]\n"
+ "ldr q16, [x23, x11]\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "ldr q17, [x22, x11]\n"
+ "ldr q18, [x21, x11]\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "ldr q19, [x20, x11]\n"
+ "ldp x21, x20, [x26], #0x10\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "ldr q20, [x21, x11]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldr q21, [x20, x11]\n"
+ "ldr x20, [x26], #0x8\n"
+ "ldr q22, [x20, x11]\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v2.8h\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v2.8h\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "fmax v25.8h, v25.8h, v2.8h\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "fmax v26.8h, v26.8h, v2.8h\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v2.8h\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmax v28.8h, v28.8h, v2.8h\n"
+ "fmax v29.8h, v29.8h, v2.8h\n"
+ "fmax v30.8h, v30.8h, v2.8h\n"
+ "fmax v31.8h, v31.8h, v2.8h\n"
+ "fmin v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v1.8h\n"
+ "str q23, [x28, x11]\n"
+ "fmin v25.8h, v25.8h, v1.8h\n"
+ "fmin v26.8h, v26.8h, v1.8h\n"
+ "str q24, [x27, x11]\n"
+ "fmin v27.8h, v27.8h, v1.8h\n"
+ "fmin v28.8h, v28.8h, v1.8h\n"
+ "str q25, [x26, x11]\n"
+ "fmin v29.8h, v29.8h, v1.8h\n"
+ "fmin v30.8h, v30.8h, v1.8h\n"
+ "str q26, [x25, x11]\n"
+ "fmin v31.8h, v31.8h, v1.8h\n"
+ "str q27, [x24, x11]\n"
+ "str q28, [x23, x11]\n"
+ "str q29, [x22, x11]\n"
+ "str q30, [x21, x11]\n"
+ "str q31, [x20, x11]\n"
+ "add x11, x11, #0x10\n"
+ "cmp x11, x9, LSL #4\n"
+ "blt 1b\n"
+ "5:" // Oddments
+ "tst %x[n_channels], #0x7\n"
+ "beq 25f\n"
+ "movi v23.16b, #0x0\n"
+ "cbz %x[bias], 10f\n"
+ "add x20, %x[bias], x11\n"
+ "tbz %x[n_channels], #2, 7f\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v23.h }[6], [x20], #0x2\n"
+ "b 9f\n"
+ "6:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v23.h }[4], [x20], #0x2\n"
+ "b 9f\n"
+ "7:" // Oddments: Load bias: Bit 2: Unset
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "9:" // Oddments: Load bias: Bit 2: End
+ "10:" // Oddments: Load bias: Done
+ "ldr q0, [%x[params], #0x0]\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "add %x[params], %x[params], #0x10\n"
+ "tbz %x[n_channels], #2, 12f\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v15.h }[6], [x28], #0x2\n"
+ "ld1 { v16.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "ld1 { v18.h }[6], [x25], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v21.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
+ "b 14f\n"
+ "11:" // Oddments: Load: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v15.h }[4], [x28], #0x2\n"
+ "ld1 { v16.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "ld1 { v18.h }[4], [x25], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v21.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
+ "b 14f\n"
+ "12:" // Oddments: Load: Bit 2: Unset
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v15.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "ld1 { v18.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
+ "b 14f\n"
+ "13:" // Oddments: Load: Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "14:" // Oddments: Load: Bit 2: End
+ "subs x20, %x[n_points], #0x1\n"
+ "ble 20f\n"
+ "15:" // Oddments: Planar loop
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "ldr x21, [x10], #0x8\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "add x9, x9, x11\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "ldr q0, [%x[params], #0x0]\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "add %x[params], %x[params], #0x10\n"
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d15, [x28], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d17, [x26], #0x8\n"
+ "ldr d18, [x25], #0x8\n"
+ "ldr d19, [x24], #0x8\n"
+ "ldr d20, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "ldr d22, [x21], #0x8\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v15.s }[2], [x28], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v17.s }[2], [x26], #0x4\n"
+ "ld1 { v18.s }[2], [x25], #0x4\n"
+ "ld1 { v19.s }[2], [x24], #0x4\n"
+ "ld1 { v20.s }[2], [x23], #0x4\n"
+ "ld1 { v21.s }[2], [x22], #0x4\n"
+ "ld1 { v22.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v15.h }[6], [x28], #0x2\n"
+ "ld1 { v16.h }[6], [x27], #0x2\n"
+ "ld1 { v17.h }[6], [x26], #0x2\n"
+ "ld1 { v18.h }[6], [x25], #0x2\n"
+ "ld1 { v19.h }[6], [x24], #0x2\n"
+ "ld1 { v20.h }[6], [x23], #0x2\n"
+ "ld1 { v21.h }[6], [x22], #0x2\n"
+ "ld1 { v22.h }[6], [x21], #0x2\n"
+ "b 19f\n"
+ "16:" // Oddments: Planar loop: Load: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v15.h }[4], [x28], #0x2\n"
+ "ld1 { v16.h }[4], [x27], #0x2\n"
+ "ld1 { v17.h }[4], [x26], #0x2\n"
+ "ld1 { v18.h }[4], [x25], #0x2\n"
+ "ld1 { v19.h }[4], [x24], #0x2\n"
+ "ld1 { v20.h }[4], [x23], #0x2\n"
+ "ld1 { v21.h }[4], [x22], #0x2\n"
+ "ld1 { v22.h }[4], [x21], #0x2\n"
+ "b 19f\n"
+ "17:" // Oddments: Planar loop: Load: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s15, [x28], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s17, [x26], #0x4\n"
+ "ldr s18, [x25], #0x4\n"
+ "ldr s19, [x24], #0x4\n"
+ "ldr s20, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v15.h }[2], [x28], #0x2\n"
+ "ld1 { v16.h }[2], [x27], #0x2\n"
+ "ld1 { v17.h }[2], [x26], #0x2\n"
+ "ld1 { v18.h }[2], [x25], #0x2\n"
+ "ld1 { v19.h }[2], [x24], #0x2\n"
+ "ld1 { v20.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v22.h }[2], [x21], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Planar loop: Load: Bit 2: Unset: Bit 1: Unset
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "19:" // Oddments: Planar loop: Load: Bit 2: End
+ "subs x20, x20, #0x1\n"
+ "bgt 15b\n"
+ "20:" // Oddments: Planar tail
+ "fmla v23.8h, v14.8h, v0.8h\n"
+ "fmla v24.8h, v15.8h, v0.8h\n"
+ "fmax v23.8h, v23.8h, v2.8h\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "fmla v25.8h, v16.8h, v0.8h\n"
+ "fmla v26.8h, v17.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v2.8h\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "fmla v27.8h, v18.8h, v0.8h\n"
+ "fmla v28.8h, v19.8h, v0.8h\n"
+ "fmax v25.8h, v25.8h, v2.8h\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "fmla v29.8h, v20.8h, v0.8h\n"
+ "fmla v30.8h, v21.8h, v0.8h\n"
+ "fmax v26.8h, v26.8h, v2.8h\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmla v31.8h, v22.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v2.8h\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "fmax v28.8h, v28.8h, v2.8h\n"
+ "fmax v29.8h, v29.8h, v2.8h\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "fmax v30.8h, v30.8h, v2.8h\n"
+ "fmax v31.8h, v31.8h, v2.8h\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "fmin v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v1.8h\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "fmin v25.8h, v25.8h, v1.8h\n"
+ "fmin v26.8h, v26.8h, v1.8h\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "fmin v27.8h, v27.8h, v1.8h\n"
+ "fmin v28.8h, v28.8h, v1.8h\n"
+ "fmin v29.8h, v29.8h, v1.8h\n"
+ "fmin v30.8h, v30.8h, v1.8h\n"
+ "fmin v31.8h, v31.8h, v1.8h\n"
+ "tbz %x[n_channels], #2, 22f\n"
+ "st1 { v23.d }[0], [x28], #0x8\n"
+ "st1 { v24.d }[0], [x27], #0x8\n"
+ "st1 { v25.d }[0], [x26], #0x8\n"
+ "st1 { v26.d }[0], [x25], #0x8\n"
+ "st1 { v27.d }[0], [x24], #0x8\n"
+ "st1 { v28.d }[0], [x23], #0x8\n"
+ "st1 { v29.d }[0], [x22], #0x8\n"
+ "st1 { v30.d }[0], [x21], #0x8\n"
+ "st1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "st1 { v23.s }[2], [x28], #0x4\n"
+ "st1 { v24.s }[2], [x27], #0x4\n"
+ "st1 { v25.s }[2], [x26], #0x4\n"
+ "st1 { v26.s }[2], [x25], #0x4\n"
+ "st1 { v27.s }[2], [x24], #0x4\n"
+ "st1 { v28.s }[2], [x23], #0x4\n"
+ "st1 { v29.s }[2], [x22], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
+ "st1 { v31.s }[2], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v23.h }[6], [x28], #0x2\n"
+ "st1 { v24.h }[6], [x27], #0x2\n"
+ "st1 { v25.h }[6], [x26], #0x2\n"
+ "st1 { v26.h }[6], [x25], #0x2\n"
+ "st1 { v27.h }[6], [x24], #0x2\n"
+ "st1 { v28.h }[6], [x23], #0x2\n"
+ "st1 { v29.h }[6], [x22], #0x2\n"
+ "st1 { v30.h }[6], [x21], #0x2\n"
+ "st1 { v31.h }[6], [x20], #0x2\n"
+ "b 24f\n"
+ "21:" // Oddments: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v23.h }[4], [x28], #0x2\n"
+ "st1 { v24.h }[4], [x27], #0x2\n"
+ "st1 { v25.h }[4], [x26], #0x2\n"
+ "st1 { v26.h }[4], [x25], #0x2\n"
+ "st1 { v27.h }[4], [x24], #0x2\n"
+ "st1 { v28.h }[4], [x23], #0x2\n"
+ "st1 { v29.h }[4], [x22], #0x2\n"
+ "st1 { v30.h }[4], [x21], #0x2\n"
+ "st1 { v31.h }[4], [x20], #0x2\n"
+ "b 24f\n"
+ "22:" // Oddments: Store: Bit 2: Unset
+ "tbz %x[n_channels], #1, 23f\n"
+ "st1 { v23.s }[0], [x28], #0x4\n"
+ "st1 { v24.s }[0], [x27], #0x4\n"
+ "st1 { v25.s }[0], [x26], #0x4\n"
+ "st1 { v26.s }[0], [x25], #0x4\n"
+ "st1 { v27.s }[0], [x24], #0x4\n"
+ "st1 { v28.s }[0], [x23], #0x4\n"
+ "st1 { v29.s }[0], [x22], #0x4\n"
+ "st1 { v30.s }[0], [x21], #0x4\n"
+ "st1 { v31.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "st1 { v23.h }[2], [x28], #0x2\n"
+ "st1 { v24.h }[2], [x27], #0x2\n"
+ "st1 { v25.h }[2], [x26], #0x2\n"
+ "st1 { v26.h }[2], [x25], #0x2\n"
+ "st1 { v27.h }[2], [x24], #0x2\n"
+ "st1 { v28.h }[2], [x23], #0x2\n"
+ "st1 { v29.h }[2], [x22], #0x2\n"
+ "st1 { v30.h }[2], [x21], #0x2\n"
+ "st1 { v31.h }[2], [x20], #0x2\n"
+ "b 24f\n"
+ "23:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
+ "24:" // Oddments: Store: Bit 2: End
+ "25:" // End
+ : [params] "+&r" (params)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..3646c18b04
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<__fp16, __fp16, __fp16, __fp16>  // Strategy glue: fp16 in/out, fp16 weights and accumulators.
+{
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<__fp16, __fp16, __fp16, __fp16>;  // Shorthand for the base strategy type used below.
+ a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)  // 2x8 output tile (per the kernel name); VLType::None => fixed-width NEON, no SVE vector-length dependence.
+ {
+ }
+ Parent::KernelType kernel = a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;  // Bound to the hand-written asm implementation in generic.cpp.
+ Parent::KernelType get_kernel(void) const override { return kernel; }  // Framework hook: expose the kernel function pointer.
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..cee3fb59c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1044 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const __fp16 *weights,
+ const __fp16 *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ const __fp16 minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v8.8h }, [%x[minmax_vals]]\n"
+ "lsr x11, %x[n_output_channels], #0x3\n"
+ "add x20, %x[minmax_vals], #0x2\n"
+ "ld1r { v7.8h }, [x20]\n"
+ "mov x10, #0x0\n"
+ "cbz x11, 8f\n"
+ "1:" // Output channel loop
+ "movi v31.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x10, #0x1\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "ldr q6, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q1, [x21, #0x0]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x23, 6f\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "beq 4f\n"
+ "3:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q1, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q5, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 3b\n"
+ "4:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 5f\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "lsl x28, x10, #0x1\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "b 7f\n"
+ "5:" // Output channel loop: Odd tail
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "ldp x20, x9, [x22], #0x10\n"
+ "lsl x28, x10, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q1, [%x[weights], #0x0]\n"
+ "ldr q0, [x9, #0x0]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmla v16.8h, v1.8h, v2.h[0]\n"
+ "fmla v17.8h, v1.8h, v2.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmla v18.8h, v1.8h, v2.h[2]\n"
+ "fmla v19.8h, v1.8h, v2.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmla v20.8h, v1.8h, v2.h[4]\n"
+ "fmla v21.8h, v1.8h, v2.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmla v22.8h, v1.8h, v2.h[6]\n"
+ "fmla v23.8h, v1.8h, v2.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmla v24.8h, v1.8h, v0.h[0]\n"
+ "fmla v25.8h, v1.8h, v0.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmla v26.8h, v1.8h, v0.h[2]\n"
+ "fmla v27.8h, v1.8h, v0.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmla v28.8h, v1.8h, v0.h[4]\n"
+ "fmla v29.8h, v1.8h, v0.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmla v30.8h, v1.8h, v0.h[6]\n"
+ "fmla v31.8h, v1.8h, v0.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "lsl x28, x10, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "str q16, [x27, x28]\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "str q17, [x26, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "str q18, [x25, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "str q19, [x24, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "str q20, [x23, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "str q21, [x22, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "str q22, [x21, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "str q23, [x20, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "7:" // Output channel loop: Done
+ "add x10, x10, #0x8\n"
+ "cmp x10, x11, LSL #3\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x7\n"
+ "beq 23f\n"
+ "8:" // Output channel oddments
+ "movi v31.16b, #0x0\n"
+ "cbz %x[bias], 13f\n"
+ "add x20, %x[bias], x10, LSL #1\n"
+ "tbz %x[n_output_channels], #2, 10f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #1, 9f\n"
+ "ld1 { v31.s }[2], [x20], #0x4\n"
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v31.h }[6], [x20]\n"
+ "b 12f\n"
+ "9:" // Output channel oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v31.h }[4], [x20]\n"
+ "b 12f\n"
+ "10:" // Output channel oddments: Load bias: Bit 2: Unset
+ "tbz %x[n_output_channels], #1, 11f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz %x[n_output_channels], #0, 12f\n"
+ "ld1 { v31.h }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Output channel oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "ld1 { v31.h }[0], [x20]\n"
+ "12:" // Output channel oddments: Load bias: Bit 2: End
+ "13:" // Output channel oddments: Load bias: Done
+ "ldr q6, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q1, [x21, #0x0]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x23, 17f\n"
+ "ldr q5, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q4, [x21, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "beq 15f\n"
+ "14:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q1, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q0, [x20, #0x0]\n"
+ "ldr q6, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q5, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 14b\n"
+ "15:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 16f\n"
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "b 18f\n"
+ "16:" // Output channel oddments: Odd tail
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "ldr q2, [x21, #0x0]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [%x[weights], #0x0]\n"
+ "fmla v16.8h, v5.8h, v4.h[0]\n"
+ "fmla v17.8h, v5.8h, v4.h[1]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v18.8h, v5.8h, v4.h[2]\n"
+ "fmla v19.8h, v5.8h, v4.h[3]\n"
+ "fmla v20.8h, v5.8h, v4.h[4]\n"
+ "fmla v21.8h, v5.8h, v4.h[5]\n"
+ "fmla v22.8h, v5.8h, v4.h[6]\n"
+ "fmla v23.8h, v5.8h, v4.h[7]\n"
+ "fmla v24.8h, v5.8h, v3.h[0]\n"
+ "fmla v25.8h, v5.8h, v3.h[1]\n"
+ "fmla v26.8h, v5.8h, v3.h[2]\n"
+ "fmla v27.8h, v5.8h, v3.h[3]\n"
+ "fmla v28.8h, v5.8h, v3.h[4]\n"
+ "fmla v29.8h, v5.8h, v3.h[5]\n"
+ "fmla v30.8h, v5.8h, v3.h[6]\n"
+ "fmla v31.8h, v5.8h, v3.h[7]\n"
+ "fmla v16.8h, v0.8h, v2.h[0]\n"
+ "fmla v17.8h, v0.8h, v2.h[1]\n"
+ "fmla v18.8h, v0.8h, v2.h[2]\n"
+ "fmla v19.8h, v0.8h, v2.h[3]\n"
+ "fmla v20.8h, v0.8h, v2.h[4]\n"
+ "fmla v21.8h, v0.8h, v2.h[5]\n"
+ "fmla v22.8h, v0.8h, v2.h[6]\n"
+ "fmla v23.8h, v0.8h, v2.h[7]\n"
+ "fmla v24.8h, v0.8h, v1.h[0]\n"
+ "fmla v25.8h, v0.8h, v1.h[1]\n"
+ "fmla v26.8h, v0.8h, v1.h[2]\n"
+ "fmla v27.8h, v0.8h, v1.h[3]\n"
+ "fmla v28.8h, v0.8h, v1.h[4]\n"
+ "fmla v29.8h, v0.8h, v1.h[5]\n"
+ "fmla v30.8h, v0.8h, v1.h[6]\n"
+ "fmla v31.8h, v0.8h, v1.h[7]\n"
+ "b 18f\n"
+ "17:" // Output channel oddments: Single kernel point
+ "fmla v16.8h, v6.8h, v1.h[0]\n"
+ "fmla v17.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v1.h[2]\n"
+ "fmla v19.8h, v6.8h, v1.h[3]\n"
+ "fmla v20.8h, v6.8h, v1.h[4]\n"
+ "fmla v21.8h, v6.8h, v1.h[5]\n"
+ "fmla v22.8h, v6.8h, v1.h[6]\n"
+ "fmla v23.8h, v6.8h, v1.h[7]\n"
+ "fmla v24.8h, v6.8h, v0.h[0]\n"
+ "fmla v25.8h, v6.8h, v0.h[1]\n"
+ "fmla v26.8h, v6.8h, v0.h[2]\n"
+ "fmla v27.8h, v6.8h, v0.h[3]\n"
+ "fmla v28.8h, v6.8h, v0.h[4]\n"
+ "fmla v29.8h, v6.8h, v0.h[5]\n"
+ "fmla v30.8h, v6.8h, v0.h[6]\n"
+ "fmla v31.8h, v6.8h, v0.h[7]\n"
+ "18:" // Output channel oddments: Done
+ "fmin v16.8h, v16.8h, v7.8h\n"
+ "fmin v17.8h, v17.8h, v7.8h\n"
+ "fmin v18.8h, v18.8h, v7.8h\n"
+ "fmin v19.8h, v19.8h, v7.8h\n"
+ "fmin v20.8h, v20.8h, v7.8h\n"
+ "fmin v21.8h, v21.8h, v7.8h\n"
+ "fmin v22.8h, v22.8h, v7.8h\n"
+ "fmin v23.8h, v23.8h, v7.8h\n"
+ "fmin v24.8h, v24.8h, v7.8h\n"
+ "fmin v25.8h, v25.8h, v7.8h\n"
+ "fmin v26.8h, v26.8h, v7.8h\n"
+ "fmin v27.8h, v27.8h, v7.8h\n"
+ "fmin v28.8h, v28.8h, v7.8h\n"
+ "fmin v29.8h, v29.8h, v7.8h\n"
+ "fmin v30.8h, v30.8h, v7.8h\n"
+ "fmin v31.8h, v31.8h, v7.8h\n"
+ "fmax v16.8h, v16.8h, v8.8h\n"
+ "fmax v17.8h, v17.8h, v8.8h\n"
+ "fmax v18.8h, v18.8h, v8.8h\n"
+ "fmax v19.8h, v19.8h, v8.8h\n"
+ "fmax v20.8h, v20.8h, v8.8h\n"
+ "fmax v21.8h, v21.8h, v8.8h\n"
+ "fmax v22.8h, v22.8h, v8.8h\n"
+ "fmax v23.8h, v23.8h, v8.8h\n"
+ "fmax v24.8h, v24.8h, v8.8h\n"
+ "fmax v25.8h, v25.8h, v8.8h\n"
+ "fmax v26.8h, v26.8h, v8.8h\n"
+ "fmax v27.8h, v27.8h, v8.8h\n"
+ "fmax v28.8h, v28.8h, v8.8h\n"
+ "fmax v29.8h, v29.8h, v8.8h\n"
+ "fmax v30.8h, v30.8h, v8.8h\n"
+ "fmax v31.8h, v31.8h, v8.8h\n"
+ "tbz %x[n_output_channels], #2, 20f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.d }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.d }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.d }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.d }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v24.d }[0], [x27]\n"
+ "st1 { v25.d }[0], [x26]\n"
+ "st1 { v26.d }[0], [x25]\n"
+ "st1 { v27.d }[0], [x24]\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_output_channels], #1, 19f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.s }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "add x10, x10, #0x2\n"
+ "st1 { v24.s }[2], [x27]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.h }[6], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[6], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[6], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[6], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[6], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[6], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[6], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[6], [x27]\n"
+ "st1 { v25.h }[6], [x26]\n"
+ "st1 { v26.h }[6], [x25]\n"
+ "st1 { v27.h }[6], [x24]\n"
+ "st1 { v28.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x22]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "st1 { v31.h }[6], [x20]\n"
+ "b 22f\n"
+ "19:" // Output channel oddments: Done: Store: Bit 2: Bit 1: Unset
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.h }[4], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[4], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[4], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[4], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[4], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[4], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[4], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[4], [x27]\n"
+ "st1 { v25.h }[4], [x26]\n"
+ "st1 { v26.h }[4], [x25]\n"
+ "st1 { v27.h }[4], [x24]\n"
+ "st1 { v28.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x22]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "st1 { v31.h }[4], [x20]\n"
+ "b 22f\n"
+ "20:" // Output channel oddments: Done: Store: Bit 2: Unset
+ "tbz %x[n_output_channels], #1, 21f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.s }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.s }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.s }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.s }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "add x10, x10, #0x2\n"
+ "st1 { v24.s }[0], [x27]\n"
+ "st1 { v25.s }[0], [x26]\n"
+ "st1 { v26.s }[0], [x25]\n"
+ "st1 { v27.s }[0], [x24]\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 22f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.h }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[2], [x27]\n"
+ "st1 { v25.h }[2], [x26]\n"
+ "st1 { v26.h }[2], [x25]\n"
+ "st1 { v27.h }[2], [x24]\n"
+ "st1 { v28.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x22]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "st1 { v31.h }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Output channel oddments: Done: Store: Bit 2: Unset: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "add x26, x26, x10, LSL #1\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "add x24, x24, x10, LSL #1\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "add x22, x22, x10, LSL #1\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #1\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #1\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #1\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #1\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #1\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #1\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #1\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #1\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "22:" // Output channel oddments: Done: Store: Bit 2: End
+ "23:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..5d3db974f0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..fd8686c15e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x23, #0x0\n"
+ "mov x22, #0x0\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x2\n"
+ "mov x26, #0x2\n"
+ "str x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x23, x25\n" // offset = tile_i * ld_input_row
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x23, x24\n" // offset = tile_i * ld_output_row
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x22, x15, x21\n" // offset += tile_j * ld_input_col
+ "ldr x13, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x15, x15, #0x2\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x22, x14, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x2\n"
+ "add x11, x15, x15\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x13, x13, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x9, x13, x25, LSL #2\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x28, x9, x25, LSL #2\n"
+ "add x12, x12, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x27, x28, x25, LSL #2\n"
+ "add x26, x11, x15\n"
+ "add x25, x12, x24, LSL #2\n"
+ "lsl x14, x14, #0x2\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "add x10, x10, #0xa0\n"
+ "ldr q9, [x9, x15]\n"
+ "ld1 { v10.4s }, [x13]\n"
+ "ldr q11, [x13, x26]\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q13, [x28, x15]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "add x23, x23, #0x10\n"
+ "cmp x23, x22, LSL #4\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ld1 { v18.4s }, [x27]\n"
+ "ldr q25, [x10, #0x0]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ld1 { v17.4s }, [x9]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
+ "ldr q4, [x10, #0x50]\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x28]\n"
+ "ldr q1, [x10, #0x20]\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
+ "ldr q0, [x10, #0x10]\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "ldr q2, [x10, #0x30]\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "ldr q13, [x28, x15]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x27, x15]\n"
+ "ldr q3, [x10, #0x40]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "ldr q5, [x10, #0x60]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "ldr q11, [x13, x26]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "ldr q9, [x9, x15]\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "ld1 { v10.4s }, [x13]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "ldr q12, [x9, x11]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
+ "ldr q8, [x10, #0x90]\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "st1 { v24.4s }, [x12]\n"
+ "add x10, x10, #0xa0\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q21, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+ "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "ld1 { v18.4s }, [x27]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "ldr q20, [x28, x11]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x27, x26]\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x11]\n"
+ "fmla v21.4s, v3.4s, v13.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v24.4s, v7.4s, v13.4s\n"
+ "fmla v23.4s, v6.4s, v13.4s\n"
+ "fmla v22.4s, v4.4s, v13.4s\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "ld1 { v17.4s }, [x9]\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x9, x26]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v22.4s, v5.4s, v20.4s\n"
+ "fmla v21.4s, v4.4s, v20.4s\n"
+ "fmla v24.4s, v2.4s, v18.4s\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x28]\n"
+ "fmla v22.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v8.4s, v20.4s\n"
+ "fmla v23.4s, v7.4s, v20.4s\n"
+ "ldr q18, [x28, x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "fmla v24.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x27, x15]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v22.4s, v7.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmax v24.4s, v24.4s, v27.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v23.4s, v23.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmin v24.4s, v24.4s, v26.4s\n"
+ "fmin v23.4s, v23.4s, v26.4s\n"
+ "st1 { v24.4s }, [x12]\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "str q23, [x12, x14]\n"
+ "add x12, x12, #0x10\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q21, [x25, x14]\n"
+ "add x25, x25, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 31f\n"
+ "ldr q25, [x10, #0x0]\n"
+ "ldr q0, [x10, #0x10]\n"
+ "add x24, x9, x15\n"
+ "add x23, x13, XZR\n"
+ "ldr q1, [x10, #0x20]\n"
+ "ldr q2, [x10, #0x30]\n"
+ "add x22, x13, x26\n"
+ "add x21, x9, x11\n"
+ "ldr q3, [x10, #0x40]\n"
+ "ldr q4, [x10, #0x50]\n"
+ "add x20, x28, x15\n"
+ "ldr q5, [x10, #0x60]\n"
+ "ldr q6, [x10, #0x70]\n"
+ "ldr q7, [x10, #0x80]\n"
+ "ldr q8, [x10, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x24]\n"
+ "ld1 { v10.s }[2], [x23]\n"
+ "ld1 { v11.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x21]\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+ "ldr s9, [x24, #0x0]\n"
+ "ldr s10, [x23, #0x0]\n"
+ "ldr s11, [x22, #0x0]\n"
+ "ldr s12, [x21, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+ "mov v28.16b, v25.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+ "add x20, x27, XZR\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v30.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x27, x26\n"
+ "fmla v29.4s, v6.4s, v13.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x13, x15\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "add x20, x13, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "add x20, x28, x11\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x9, XZR\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x9, x26\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "add x20, x28, XZR\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v28.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "add x20, x28, x26\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v29.4s, v8.4s, v10.4s\n"
+ "fmla v31.4s, v5.4s, v10.4s\n"
+ "add x20, x27, x15\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v30.4s, v7.4s, v11.4s\n"
+ "fmla v31.4s, v6.4s, v11.4s\n"
+ "add x20, x27, x11\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v30.4s, v8.4s, v12.4s\n"
+ "fmla v31.4s, v7.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v26.4s\n"
+ "fmin v29.4s, v29.4s, v26.4s\n"
+ "fmin v30.4s, v30.4s, v26.4s\n"
+ "fmin v31.4s, v31.4s, v26.4s\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.d }[0], [x21], x14\n"
+ "st1 { v30.d }[0], [x20], x14\n"
+ "add x12, x12, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[2], [x21], x14\n"
+ "st1 { v30.s }[2], [x20], x14\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x21, x12\n"
+ "mov x20, x25\n"
+ "st1 { v28.s }[0], [x21], x14\n"
+ "st1 { v30.s }[0], [x20], x14\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "30:" // Tile loop: Oddments: Store: Bit 1: End
+ "31:" // Tile loop: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x22, x22, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x22, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x22, x22, XZR, LT\n"
+ "cmp x23, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..7dedfd972a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,513 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)  // fp32 NHWC 3x3 stride-1 depthwise kernel, 2x2 output tile, inputs supplied via an indirection-pointer array
+{
+  struct Args  // argument block read by the asm below via compile-time offsetof() operands
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[16];  // input pointers copied from input_ptrs in the order the asm consumes them (see ctor)
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[5];  // fixed permutation of the 16 input pointers — presumably matches the kernel's load schedule; TODO confirm against the asm
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[3];
+      inptrs[3] = input_ptrs[6];
+      inptrs[4] = input_ptrs[9];
+      inptrs[5] = input_ptrs[12];
+      inptrs[6] = input_ptrs[15];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[2];
+      inptrs[9] = input_ptrs[10];
+      inptrs[10] = input_ptrs[4];
+      inptrs[11] = input_ptrs[7];
+      inptrs[12] = input_ptrs[8];
+      inptrs[13] = input_ptrs[11];
+      inptrs[14] = input_ptrs[13];
+      inptrs[15] = input_ptrs[14];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "mov x16, #0x10\n"  // cntb _, ALL, #1
+    "lsr x15, %x[n_channels], #0x2\n"  // x15 = n_channels / 4 = number of full 4-float vector iterations
+    "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x20, %x[params_struct], %[offsetof_args_min]\n"
+    "ld1r { v27.4s }, [x20]\n"  // broadcast activation_min into all four lanes
+    "add x20, %x[params_struct], %[offsetof_args_max]\n"
+    "ld1r { v26.4s }, [x20]\n"  // broadcast activation_max
+    "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ldp x12, x11, [x21, #0x0]\n"  // x12,x11,x10,x9 = the four output pointers (2x2 tile)
+    "ldp x10, x9, [x21, #0x10]\n"
+    "mov x28, #0x0\n"
+    "sub x27, XZR, x16\n"
+    "cbz x15, 3f\n"  // no full 4-channel vectors: go straight to the oddments tail
+    "ldr q25, [x14, #0x0]\n"  // accumulator initial value (first entry of the packed params block)
+    "ldr q0, [x14, #0x10]\n"  // q0-q8: the nine filter values of the 3x3 kernel
+    "cmp x16, x15, LSL #4\n"
+    "ldr q1, [x14, #0x20]\n"
+    "ldr q2, [x14, #0x30]\n"
+    "ldr q3, [x14, #0x40]\n"
+    "ldr q4, [x14, #0x50]\n"
+    "ldr q5, [x14, #0x60]\n"
+    "ldr q6, [x14, #0x70]\n"
+    "ldr q7, [x14, #0x80]\n"
+    "ldr q8, [x14, #0x90]\n"
+    "add x14, x14, #0xa0\n"
+    "ldp x21, x20, [x13, #0x0]\n"
+    "ldr q9, [x21, x28]\n"
+    "ldr q10, [x20, x28]\n"
+    "ldp x21, x20, [x13, #0x10]\n"
+    "ldr q11, [x21, x28]\n"
+    "ldr q12, [x20, x28]\n"
+    "ldr x20, [x13, #0x20]\n"
+    "ldr q13, [x20, x28]\n"
+    "bge 2f\n"
+    "1:"  // Channel loop
+    "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+    "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+    "ldr x21, [x13, #0x28]\n"
+    "ldr x20, [x13, #0x30]\n"
+    "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+    "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+    "ldr q18, [x21, x28]\n"
+    "ldr q25, [x14, #0x0]\n"
+    "fmla v24.4s, v0.4s, v10.4s\n"
+    "fmla v23.4s, v2.4s, v11.4s\n"
+    "ldr q17, [x20, x28]\n"
+    "ldr x21, [x13, #0x38]\n"
+    "fmla v22.4s, v2.4s, v12.4s\n"
+    "fmla v21.4s, v1.4s, v12.4s\n"
+    "ldr x20, [x13, #0x48]\n"
+    "ldr q20, [x20, x28]\n"
+    "fmla v24.4s, v5.4s, v12.4s\n"
+    "fmla v23.4s, v4.4s, v12.4s\n"
+    "ldr q16, [x21, x28]\n"
+    "ldr x20, [x13, #0x40]\n"
+    "fmla v22.4s, v6.4s, v18.4s\n"
+    "ldr q18, [x20, x28]\n"
+    "fmla v21.4s, v3.4s, v13.4s\n"
+    "ldr x20, [x13, #0x50]\n"
+    "fmla v24.4s, v7.4s, v13.4s\n"
+    "fmla v23.4s, v6.4s, v13.4s\n"
+    "ldr x22, [x13, #0x58]\n"
+    "ldr x21, [x13, #0x60]\n"
+    "fmla v22.4s, v4.4s, v13.4s\n"
+    "fmla v21.4s, v8.4s, v17.4s\n"
+    "ldr q17, [x20, x28]\n"
+    "ldr x20, [x13, #0x68]\n"
+    "fmla v24.4s, v1.4s, v16.4s\n"
+    "fmla v23.4s, v0.4s, v16.4s\n"
+    "ldr q16, [x22, x28]\n"
+    "ldr x26, [x13, #0x70]\n"
+    "fmla v22.4s, v5.4s, v20.4s\n"
+    "fmla v21.4s, v4.4s, v20.4s\n"
+    "ldr q4, [x14, #0x50]\n"
+    "ldr x25, [x13, #0x78]\n"
+    "fmla v24.4s, v2.4s, v18.4s\n"
+    "fmla v23.4s, v1.4s, v18.4s\n"
+    "ldr q19, [x21, x28]\n"
+    "ldr q1, [x14, #0x20]\n"
+    "fmla v22.4s, v0.4s, v17.4s\n"
+    "ldr q0, [x14, #0x10]\n"
+    "fmla v21.4s, v2.4s, v16.4s\n"
+    "ldr q2, [x14, #0x30]\n"
+    "fmla v24.4s, v8.4s, v20.4s\n"
+    "fmla v23.4s, v7.4s, v20.4s\n"
+    "ldr q18, [x20, x28]\n"
+    "ldp x24, x23, [x13, #0x0]\n"
+    "fmla v22.4s, v3.4s, v19.4s\n"
+    "fmla v21.4s, v5.4s, v18.4s\n"
+    "ldp x22, x21, [x13, #0x10]\n"
+    "ldr x20, [x13, #0x20]\n"
+    "ldr q13, [x20, x16]\n"
+    "fmla v24.4s, v3.4s, v17.4s\n"
+    "ldr q17, [x26, x28]\n"
+    "fmla v23.4s, v5.4s, v16.4s\n"
+    "ldr q16, [x25, x28]\n"
+    "ldr q3, [x14, #0x40]\n"
+    "fmla v22.4s, v7.4s, v17.4s\n"
+    "fmla v21.4s, v6.4s, v17.4s\n"
+    "ldr q11, [x22, x16]\n"
+    "ldr q5, [x14, #0x60]\n"
+    "fmla v24.4s, v6.4s, v19.4s\n"
+    "fmla v23.4s, v8.4s, v18.4s\n"
+    "ldr q9, [x24, x16]\n"
+    "ldr q10, [x23, x16]\n"
+    "fmla v22.4s, v8.4s, v16.4s\n"
+    "fmla v21.4s, v7.4s, v16.4s\n"
+    "ldr q12, [x21, x16]\n"
+    "ldr q6, [x14, #0x70]\n"
+    "fmax v24.4s, v24.4s, v27.4s\n"  // clamp the four accumulators to [activation_min, activation_max]
+    "fmax v23.4s, v23.4s, v27.4s\n"
+    "ldr q7, [x14, #0x80]\n"
+    "ldr q8, [x14, #0x90]\n"
+    "fmax v22.4s, v22.4s, v27.4s\n"
+    "fmax v21.4s, v21.4s, v27.4s\n"
+    "add x16, x16, #0x10\n"
+    "add x27, x27, #0x10\n"
+    "fmin v24.4s, v24.4s, v26.4s\n"
+    "fmin v23.4s, v23.4s, v26.4s\n"
+    "cmp x16, x15, LSL #4\n"
+    "fmin v22.4s, v22.4s, v26.4s\n"
+    "fmin v21.4s, v21.4s, v26.4s\n"
+    "add x28, x28, #0x10\n"
+    "str q24, [x12, x27]\n"  // store one 4-channel vector to each of the 2x2 output positions
+    "add x14, x14, #0xa0\n"
+    "str q23, [x11, x27]\n"
+    "str q22, [x10, x27]\n"
+    "str q21, [x9, x27]\n"
+    "blt 1b\n"
+    "2:"  // Channel tail
+    "mov v24.16b, v25.16b\n fmla v24.4s, v4.4s, v9.4s\n"
+    "mov v23.16b, v25.16b\n fmla v23.4s, v3.4s, v9.4s\n"
+    "ldr x21, [x13, #0x28]\n"
+    "ldr x20, [x13, #0x30]\n"
+    "mov v22.16b, v25.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+    "mov v21.16b, v25.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+    "ldr q18, [x21, x28]\n"
+    "ldr x21, [x13, #0x38]\n"
+    "fmla v24.4s, v0.4s, v10.4s\n"
+    "fmla v23.4s, v2.4s, v11.4s\n"
+    "ldr q17, [x20, x28]\n"
+    "ldr x20, [x13, #0x48]\n"
+    "ldr q20, [x20, x28]\n"
+    "fmla v22.4s, v2.4s, v12.4s\n"
+    "fmla v21.4s, v1.4s, v12.4s\n"
+    "ldr x20, [x13, #0x40]\n"
+    "fmla v24.4s, v5.4s, v12.4s\n"
+    "fmla v23.4s, v4.4s, v12.4s\n"
+    "ldr q16, [x21, x28]\n"
+    "ldr x21, [x13, #0x50]\n"
+    "fmla v22.4s, v6.4s, v18.4s\n"
+    "ldr q18, [x20, x28]\n"
+    "fmla v21.4s, v3.4s, v13.4s\n"
+    "ldr x20, [x13, #0x58]\n"
+    "fmla v24.4s, v7.4s, v13.4s\n"
+    "fmla v23.4s, v6.4s, v13.4s\n"
+    "ldr x23, [x13, #0x60]\n"
+    "ldr x22, [x13, #0x68]\n"
+    "fmla v22.4s, v4.4s, v13.4s\n"
+    "fmla v21.4s, v8.4s, v17.4s\n"
+    "ldr q17, [x21, x28]\n"
+    "ldr x21, [x13, #0x70]\n"
+    "fmla v24.4s, v1.4s, v16.4s\n"
+    "fmla v23.4s, v0.4s, v16.4s\n"
+    "ldr q16, [x20, x28]\n"
+    "ldr x20, [x13, #0x78]\n"
+    "fmla v22.4s, v5.4s, v20.4s\n"
+    "fmla v21.4s, v4.4s, v20.4s\n"
+    "add x27, x27, #0x10\n"
+    "fmla v24.4s, v2.4s, v18.4s\n"
+    "fmla v23.4s, v1.4s, v18.4s\n"
+    "ldr q19, [x23, x28]\n"
+    "fmla v22.4s, v0.4s, v17.4s\n"
+    "fmla v21.4s, v2.4s, v16.4s\n"
+    "fmla v24.4s, v8.4s, v20.4s\n"
+    "fmla v23.4s, v7.4s, v20.4s\n"
+    "ldr q18, [x22, x28]\n"
+    "fmla v22.4s, v3.4s, v19.4s\n"
+    "fmla v21.4s, v5.4s, v18.4s\n"
+    "fmla v24.4s, v3.4s, v17.4s\n"
+    "ldr q17, [x21, x28]\n"
+    "fmla v23.4s, v5.4s, v16.4s\n"
+    "ldr q16, [x20, x28]\n"
+    "fmla v22.4s, v7.4s, v17.4s\n"
+    "fmla v21.4s, v6.4s, v17.4s\n"
+    "add x28, x28, #0x10\n"
+    "fmla v24.4s, v6.4s, v19.4s\n"
+    "fmla v23.4s, v8.4s, v18.4s\n"
+    "fmax v24.4s, v24.4s, v27.4s\n"
+    "fmla v22.4s, v8.4s, v16.4s\n"
+    "fmla v21.4s, v7.4s, v16.4s\n"
+    "fmax v23.4s, v23.4s, v27.4s\n"
+    "fmax v22.4s, v22.4s, v27.4s\n"
+    "fmax v21.4s, v21.4s, v27.4s\n"
+    "fmin v24.4s, v24.4s, v26.4s\n"
+    "fmin v23.4s, v23.4s, v26.4s\n"
+    "str q24, [x12, x27]\n"
+    "fmin v22.4s, v22.4s, v26.4s\n"
+    "fmin v21.4s, v21.4s, v26.4s\n"
+    "str q23, [x11, x27]\n"
+    "str q22, [x10, x27]\n"
+    "str q21, [x9, x27]\n"
+    "3:"  // Oddments
+    "tst %x[n_channels], #0x3\n"  // any remainder channels (n_channels % 4)?
+    "beq 30f\n"
+    "ldr q25, [x14, #0x0]\n"
+    "ldr q0, [x14, #0x10]\n"
+    "mov x20, x28\n"
+    "add x12, x12, x20\n"
+    "ldr q1, [x14, #0x20]\n"
+    "ldr q2, [x14, #0x30]\n"
+    "add x11, x11, x20\n"
+    "add x10, x10, x20\n"
+    "ldr q3, [x14, #0x40]\n"
+    "ldr q4, [x14, #0x50]\n"
+    "add x9, x9, x20\n"
+    "ldr q5, [x14, #0x60]\n"
+    "ldr q6, [x14, #0x70]\n"
+    "ldr q7, [x14, #0x80]\n"
+    "ldr q8, [x14, #0x90]\n"
+    "ldr x24, [x13, #0x0]\n"
+    "ldr x23, [x13, #0x8]\n"
+    "add x24, x24, x28\n"
+    "add x23, x23, x28\n"
+    "ldr x22, [x13, #0x10]\n"
+    "ldr x21, [x13, #0x18]\n"
+    "add x22, x22, x28\n"
+    "add x21, x21, x28\n"
+    "ldr x20, [x13, #0x20]\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 4f\n"  // bit tests on n_channels select 1-, 2- or 3-lane partial loads/stores below
+    "ld1 { v9.d }[0], [x24], #0x8\n"
+    "ld1 { v10.d }[0], [x23], #0x8\n"
+    "ld1 { v11.d }[0], [x22], #0x8\n"
+    "ld1 { v12.d }[0], [x21], #0x8\n"
+    "ld1 { v13.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 5f\n"
+    "ld1 { v9.s }[2], [x24], #0x4\n"
+    "ld1 { v10.s }[2], [x23], #0x4\n"
+    "ld1 { v11.s }[2], [x22], #0x4\n"
+    "ld1 { v12.s }[2], [x21], #0x4\n"
+    "ld1 { v13.s }[2], [x20], #0x4\n"
+    "b 5f\n"
+    "4:"  // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+    "ld1 { v9.s }[0], [x24], #0x4\n"
+    "ld1 { v10.s }[0], [x23], #0x4\n"
+    "ld1 { v11.s }[0], [x22], #0x4\n"
+    "ld1 { v12.s }[0], [x21], #0x4\n"
+    "ld1 { v13.s }[0], [x20], #0x4\n"
+    "5:"  // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+    "mov v28.16b, v25.16b\n fmla v28.4s, v4.4s, v9.4s\n"
+    "mov v29.16b, v25.16b\n fmla v29.4s, v3.4s, v9.4s\n"
+    "ldr x20, [x13, #0x28]\n"
+    "add x20, x20, x28\n"
+    "mov v30.16b, v25.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+    "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v9.4s\n"
+    "fmla v28.4s, v0.4s, v10.4s\n"
+    "fmla v29.4s, v2.4s, v11.4s\n"
+    "fmla v28.4s, v5.4s, v12.4s\n"
+    "fmla v29.4s, v4.4s, v12.4s\n"
+    "fmla v30.4s, v2.4s, v12.4s\n"
+    "fmla v31.4s, v1.4s, v12.4s\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v9.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v9.s }[2], [x20], #0x4\n"
+    "b 7f\n"
+    "6:"  // Oddments: Load input (3, 0): Bit 1: Unset
+    "ld1 { v9.s }[0], [x20], #0x4\n"
+    "7:"  // Oddments: Load input (3, 0): Bit 1: End
+    "fmla v30.4s, v6.4s, v9.4s\n"
+    "ldr x20, [x13, #0x30]\n"
+    "fmla v28.4s, v7.4s, v13.4s\n"
+    "add x20, x20, x28\n"
+    "fmla v29.4s, v6.4s, v13.4s\n"
+    "fmla v30.4s, v4.4s, v13.4s\n"
+    "fmla v31.4s, v3.4s, v13.4s\n"
+    "tbz %x[n_channels], #1, 8f\n"
+    "ld1 { v11.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 9f\n"
+    "ld1 { v11.s }[2], [x20], #0x4\n"
+    "b 9f\n"
+    "8:"  // Oddments: Load input (3, 3): Bit 1: Unset
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "9:"  // Oddments: Load input (3, 3): Bit 1: End
+    "ldr x20, [x13, #0x38]\n"
+    "fmla v31.4s, v8.4s, v11.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ld1 { v12.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v12.s }[2], [x20], #0x4\n"
+    "b 11f\n"
+    "10:"  // Oddments: Load input (0, 1): Bit 1: Unset
+    "ld1 { v12.s }[0], [x20], #0x4\n"
+    "11:"  // Oddments: Load input (0, 1): Bit 1: End
+    "ldr x20, [x13, #0x40]\n"
+    "fmla v28.4s, v1.4s, v12.4s\n"
+    "fmla v29.4s, v0.4s, v12.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ld1 { v9.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v9.s }[2], [x20], #0x4\n"
+    "b 13f\n"
+    "12:"  // Oddments: Load input (0, 2): Bit 1: Unset
+    "ld1 { v9.s }[0], [x20], #0x4\n"
+    "13:"  // Oddments: Load input (0, 2): Bit 1: End
+    "ldr x20, [x13, #0x48]\n"
+    "fmla v28.4s, v2.4s, v9.4s\n"
+    "fmla v29.4s, v1.4s, v9.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 14f\n"
+    "ld1 { v10.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 15f\n"
+    "ld1 { v10.s }[2], [x20], #0x4\n"
+    "b 15f\n"
+    "14:"  // Oddments: Load input (2, 2): Bit 1: Unset
+    "ld1 { v10.s }[0], [x20], #0x4\n"
+    "15:"  // Oddments: Load input (2, 2): Bit 1: End
+    "ldr x20, [x13, #0x50]\n"
+    "fmla v28.4s, v8.4s, v10.4s\n"
+    "fmla v29.4s, v7.4s, v10.4s\n"
+    "add x20, x20, x28\n"
+    "fmla v30.4s, v5.4s, v10.4s\n"
+    "fmla v31.4s, v4.4s, v10.4s\n"
+    "tbz %x[n_channels], #1, 16f\n"
+    "ld1 { v11.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 17f\n"
+    "ld1 { v11.s }[2], [x20], #0x4\n"
+    "b 17f\n"
+    "16:"  // Oddments: Load input (1, 0): Bit 1: Unset
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "17:"  // Oddments: Load input (1, 0): Bit 1: End
+    "ldr x20, [x13, #0x58]\n"
+    "fmla v28.4s, v3.4s, v11.4s\n"
+    "fmla v30.4s, v0.4s, v11.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v12.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 19f\n"
+    "ld1 { v12.s }[2], [x20], #0x4\n"
+    "b 19f\n"
+    "18:"  // Oddments: Load input (1, 3): Bit 1: Unset
+    "ld1 { v12.s }[0], [x20], #0x4\n"
+    "19:"  // Oddments: Load input (1, 3): Bit 1: End
+    "ldr x20, [x13, #0x60]\n"
+    "fmla v29.4s, v5.4s, v12.4s\n"
+    "fmla v31.4s, v2.4s, v12.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 20f\n"
+    "ld1 { v9.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 21f\n"
+    "ld1 { v9.s }[2], [x20], #0x4\n"
+    "b 21f\n"
+    "20:"  // Oddments: Load input (2, 0): Bit 1: Unset
+    "ld1 { v9.s }[0], [x20], #0x4\n"
+    "21:"  // Oddments: Load input (2, 0): Bit 1: End
+    "ldr x20, [x13, #0x68]\n"
+    "fmla v28.4s, v6.4s, v9.4s\n"
+    "fmla v30.4s, v3.4s, v9.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 22f\n"
+    "ld1 { v10.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "ld1 { v10.s }[2], [x20], #0x4\n"
+    "b 23f\n"
+    "22:"  // Oddments: Load input (2, 3): Bit 1: Unset
+    "ld1 { v10.s }[0], [x20], #0x4\n"
+    "23:"  // Oddments: Load input (2, 3): Bit 1: End
+    "ldr x20, [x13, #0x70]\n"
+    "fmla v29.4s, v8.4s, v10.4s\n"
+    "fmla v31.4s, v5.4s, v10.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 24f\n"
+    "ld1 { v11.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 25f\n"
+    "ld1 { v11.s }[2], [x20], #0x4\n"
+    "b 25f\n"
+    "24:"  // Oddments: Load input (3, 1): Bit 1: Unset
+    "ld1 { v11.s }[0], [x20], #0x4\n"
+    "25:"  // Oddments: Load input (3, 1): Bit 1: End
+    "ldr x20, [x13, #0x78]\n"
+    "fmla v30.4s, v7.4s, v11.4s\n"
+    "fmla v31.4s, v6.4s, v11.4s\n"
+    "add x20, x20, x28\n"
+    "tbz %x[n_channels], #1, 26f\n"
+    "ld1 { v12.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 27f\n"
+    "ld1 { v12.s }[2], [x20], #0x4\n"
+    "b 27f\n"
+    "26:"  // Oddments: Load input (3, 2): Bit 1: Unset
+    "ld1 { v12.s }[0], [x20], #0x4\n"
+    "27:"  // Oddments: Load input (3, 2): Bit 1: End
+    "fmla v30.4s, v8.4s, v12.4s\n"
+    "fmla v31.4s, v7.4s, v12.4s\n"
+    "fmax v28.4s, v28.4s, v27.4s\n"  // apply the activation clamp to the remainder lanes too
+    "fmax v29.4s, v29.4s, v27.4s\n"
+    "fmax v30.4s, v30.4s, v27.4s\n"
+    "fmax v31.4s, v31.4s, v27.4s\n"
+    "fmin v28.4s, v28.4s, v26.4s\n"
+    "fmin v29.4s, v29.4s, v26.4s\n"
+    "fmin v30.4s, v30.4s, v26.4s\n"
+    "fmin v31.4s, v31.4s, v26.4s\n"
+    "tbz %x[n_channels], #1, 28f\n"
+    "st1 { v28.d }[0], [x12], #0x8\n"
+    "st1 { v29.d }[0], [x11], #0x8\n"
+    "st1 { v30.d }[0], [x10], #0x8\n"
+    "st1 { v31.d }[0], [x9], #0x8\n"
+    "tbz %x[n_channels], #0, 29f\n"
+    "st1 { v28.s }[2], [x12], #0x4\n"
+    "st1 { v29.s }[2], [x11], #0x4\n"
+    "st1 { v30.s }[2], [x10], #0x4\n"
+    "st1 { v31.s }[2], [x9], #0x4\n"
+    "b 29f\n"
+    "28:"  // Oddments: Store: Bit 1: Unset
+    "st1 { v28.s }[0], [x12], #0x4\n"
+    "st1 { v29.s }[0], [x11], #0x4\n"
+    "st1 { v30.s }[0], [x10], #0x4\n"
+    "st1 { v31.s }[0], [x9], #0x4\n"
+    "29:"  // Oddments: Store: Bit 1: End
+    "30:"  // End
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..c2d86615e3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>  // strategy descriptor binding the two kernel entry points declared above
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;  // variant taking an array of input pointers
+  Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;  // variant taking a base pointer plus row/col strides
+
+  public:
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::None;  // VLType::None: fixed-width NEON (not a scalable-vector kernel)
+
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 filter
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 3;  // each kernel invocation produces a 3x3 output tile
+  constexpr static unsigned int output_cols = 3;
+
+  a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)  // CPUInfo is unused; the ctor just forwards the static geometry to the parent
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..9bfcd9cd3c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,828 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x24, #0x0\n"
+ "mov x23, #0x0\n"
+ "1:" // Tile loop
+ "str x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x27, #0x3\n"
+ "mov x26, #0x3\n"
+ "str x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x24, x25\n" // offset = tile_i * ld_input_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x24, x22\n" // offset = tile_i * ld_output_row
+ "mov x24, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x23, x8, x21\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x8, x8, #0x2\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x23, x17, x20\n" // offset += tile_j * ld_output_col
+ "lsl x17, x17, #0x2\n"
+ "lsr x23, %x[n_channels], #0x2\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x27\n" // offset *= kernel_stride * output_size
+ "add x16, x16, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x13, x16, x25, LSL #2\n"
+ "mul x20, x20, x26\n" // offset *= output_tile_size
+ "add x12, x13, x25, LSL #2\n"
+ "add x11, x8, x8\n"
+ "add x15, x15, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x10, x12, x25, LSL #2\n"
+ "add x9, x11, x8\n"
+ "add x28, x15, x22, LSL #2\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x27, x10, x25, LSL #2\n"
+ "add x26, x9, x8\n"
+ "add x25, x28, x22, LSL #2\n"
+ "add x22, x17, x17\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x24\n"
+ "cbz x23, 4f\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x24, x23, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldr q9, [x12, x11]\n"
+ "ld1 { v10.4s }, [x16]\n"
+ "ldr q11, [x16, x26]\n"
+ "ld1 { v12.4s }, [x27]\n"
+ "ldr q13, [x13, x11]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "add x24, x24, #0x10\n"
+ "cmp x24, x23, LSL #4\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v1.4s, v9.4s\n"
+ "ldr q31, [x14, #0x0]\n"
+ "fmla v29.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "fmla v20.4s, v0.4s, v18.4s\n"
+ "fmla v26.4s, v4.4s, v18.4s\n"
+ "fmla v25.4s, v3.4s, v18.4s\n"
+ "fmla v22.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x13]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ld1 { v18.4s }, [x10]\n"
+ "fmla v24.4s, v4.4s, v23.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v29.4s, v8.4s, v23.4s\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v25.4s, v5.4s, v23.4s\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.4s, v0.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v18.4s\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v4.4s, v17.4s\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v19.4s\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "fmla v24.4s, v6.4s, v17.4s\n"
+ "fmla v21.4s, v5.4s, v19.4s\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v8.4s, v17.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.4s, v8.4s, v17.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
+ "add x16, x16, #0x10\n"
+ "ld1 { v10.4s }, [x16]\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "ldr q4, [x14, #0x50]\n"
+ "fmla v26.4s, v7.4s, v17.4s\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.4s, v7.4s, v19.4s\n"
+ "add x12, x12, #0x10\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v20.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v0.4s, v18.4s\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v21.4s, v2.4s, v17.4s\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v25.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmla v28.4s, v6.4s, v18.4s\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "ldr q3, [x14, #0x40]\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "ldr q11, [x16, x26]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "ldr q7, [x14, #0x80]\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "ldr q13, [x13, x11]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "add x27, x27, #0x10\n"
+ "ld1 { v12.4s }, [x27]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "add x14, x14, #0xa0\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "st1 { v28.4s }, [x15]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q27, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "st1 { v26.4s }, [x28]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.4s, v7.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr q23, [x12, x9]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x12, x8]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "ldr q17, [x16, x8]\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x27, x26]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x16, x9]\n"
+ "fmla v28.4s, v7.4s, v18.4s\n"
+ "fmla v20.4s, v0.4s, v18.4s\n"
+ "fmla v26.4s, v4.4s, v18.4s\n"
+ "fmla v25.4s, v3.4s, v18.4s\n"
+ "fmla v22.4s, v1.4s, v18.4s\n"
+ "ld1 { v19.4s }, [x13]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ld1 { v18.4s }, [x10]\n"
+ "fmla v24.4s, v4.4s, v23.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x13, x26]\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v29.4s, v8.4s, v23.4s\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v25.4s, v5.4s, v23.4s\n"
+ "ldr q17, [x10, x11]\n"
+ "fmla v26.4s, v0.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v18.4s\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v4.4s, v17.4s\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v19.4s\n"
+ "ldr q19, [x10, x26]\n"
+ "fmla v27.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x27, x8]\n"
+ "fmla v26.4s, v6.4s, v18.4s\n"
+ "ldr q18, [x13, x8]\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "fmla v24.4s, v6.4s, v17.4s\n"
+ "fmla v21.4s, v5.4s, v19.4s\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v8.4s, v17.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q17, [x27, x9]\n"
+ "fmla v29.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x13, x9]\n"
+ "fmla v20.4s, v8.4s, v17.4s\n"
+ "add x13, x13, #0x10\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "ldr q19, [x10, x9]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q17, [x10, x8]\n"
+ "fmla v29.4s, v5.4s, v16.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v26.4s, v7.4s, v17.4s\n"
+ "fmla v25.4s, v6.4s, v17.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v1.4s, v16.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v16.4s\n"
+ "ldr q17, [x12, x26]\n"
+ "fmla v24.4s, v7.4s, v19.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmla v20.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v0.4s, v18.4s\n"
+ "add x12, x12, #0x10\n"
+ "fmla v21.4s, v2.4s, v17.4s\n"
+ "fmla v25.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x27, x11]\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmla v28.4s, v6.4s, v18.4s\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "add x27, x27, #0x10\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "fmax v20.4s, v20.4s, v15.4s\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "st1 { v28.4s }, [x15]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q29, [x15, x17]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q27, [x15, x22]\n"
+ "add x15, x15, #0x10\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "st1 { v26.4s }, [x28]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q25, [x28, x17]\n"
+ "str q24, [x28, x22]\n"
+ "add x28, x28, #0x10\n"
+ "st1 { v22.4s }, [x25]\n"
+ "str q20, [x25, x17]\n"
+ "str q21, [x25, x22]\n"
+ "add x25, x25, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 49f\n"
+ "ldr q31, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "add x24, x12, x11\n"
+ "add x23, x16, XZR\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x22, x16, x26\n"
+ "add x21, x27, XZR\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x20, x13, x11\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x24], #0x8\n"
+ "ldr d10, [x23], #0x8\n"
+ "ldr d11, [x22], #0x8\n"
+ "ldr d12, [x21], #0x8\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x24]\n"
+ "ld1 { v10.s }[2], [x23]\n"
+ "ld1 { v11.s }[2], [x22]\n"
+ "ld1 { v12.s }[2], [x21]\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+ "ldr s9, [x24, #0x0]\n"
+ "ldr s10, [x23, #0x0]\n"
+ "ldr s11, [x22, #0x0]\n"
+ "ldr s12, [x21, #0x0]\n"
+ "ldr s13, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+ "mov v23.16b, v31.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "add x20, x27, x26\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v24.4s, v4.4s, v13.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "add x20, x12, x8\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "add x20, x16, x8\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v23.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "add x20, x16, x9\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "add x20, x12, x9\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v24.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "add x20, x13, XZR\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v23.4s, v3.4s, v11.4s\n"
+ "fmla v26.4s, v0.4s, v11.4s\n"
+ "add x20, x13, x26\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v25.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "add x20, x10, XZR\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "add x20, x10, x11\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "add x20, x10, x26\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x27, x8\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "add x20, x13, x8\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "add x20, x13, x9\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
+ "add x20, x27, x9\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v30.4s, v8.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "add x20, x10, x8\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v26.4s, v7.4s, v12.4s\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
+ "add x20, x16, x11\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "add x20, x10, x9\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x12, XZR\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v3.4s, v12.4s\n"
+ "add x20, x12, x26\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "add x20, x27, x11\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v29.4s, v8.4s, v13.4s\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.d }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.d }[0], [x21], x17\n"
+ "add x15, x15, #0x8\n"
+ "st1 { v29.d }[0], [x20], x17\n"
+ "add x28, x28, #0x8\n"
+ "add x25, x25, #0x8\n"
+ "st1 { v24.d }[0], [x22], x17\n"
+ "st1 { v27.d }[0], [x21], x17\n"
+ "st1 { v30.d }[0], [x20], x17\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "st1 { v28.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[2], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[2], [x21], x17\n"
+ "st1 { v29.s }[2], [x20], x17\n"
+ "st1 { v24.s }[2], [x22], x17\n"
+ "st1 { v27.s }[2], [x21], x17\n"
+ "st1 { v30.s }[2], [x20], x17\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x22, x15\n"
+ "mov x21, x28\n"
+ "st1 { v23.s }[0], [x22], x17\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[0], [x21], x17\n"
+ "st1 { v29.s }[0], [x20], x17\n"
+ "st1 { v24.s }[0], [x22], x17\n"
+ "st1 { v27.s }[0], [x21], x17\n"
+ "st1 { v30.s }[0], [x20], x17\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "st1 { v28.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "48:" // Tile loop: Oddments: Store: Bit 1: End
+ "49:" // Tile loop: End
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x23, x23, #0x1\n"
+ "add x21, x24, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x23, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x24, x24, x21, LT\n"
+ "csel x23, x23, XZR, LT\n"
+ "cmp x24, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..972f7eb535
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,905 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "mov x7, #0x10\n" // cntb _, ALL, #1
+ "lsr x8, %x[n_channels], #0x2\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x14, #0x0\n"
+ "sub x13, XZR, x7\n"
+ "cbz x8, 3f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x7, x8, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr q10, [x20, x14]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x15, #0x30]\n"
+ "ldr x23, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr x22, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x26, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v6.4s, v17.4s\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "ldr q18, [x23, x14]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
+ "ldr q31, [x16, #0x0]\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "ldr x23, [x15, #0x68]\n"
+ "fmla v28.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla v26.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr x21, [x15, #0x80]\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.4s, v1.4s, v18.4s\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v26.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "fmla v27.4s, v7.4s, v19.4s\n"
+ "fmla v22.4s, v1.4s, v19.4s\n"
+ "ldr q19, [x23, x14]\n"
+ "fmla v23.4s, v3.4s, v16.4s\n"
+ "ldr x24, [x15, #0xa8]\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v25.4s, v7.4s, v19.4s\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v29.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.4s, v5.4s, v20.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v23.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr x22, [x15, #0xb0]\n"
+ "ldr x21, [x15, #0xb8]\n"
+ "fmla v26.4s, v8.4s, v19.4s\n"
+ "fmla v24.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v28.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v28.4s, v5.4s, v17.4s\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v2.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x24, x14]\n"
+ "fmla v29.4s, v2.4s, v17.4s\n"
+ "fmla v26.4s, v7.4s, v18.4s\n"
+ "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v23.4s, v4.4s, v18.4s\n"
+ "fmla v21.4s, v3.4s, v18.4s\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v22.4s, v4.4s, v16.4s\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v27.4s, v0.4s, v17.4s\n"
+ "ldr q17, [x21, x14]\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.4s, v0.4s, v18.4s\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v22.4s, v2.4s, v17.4s\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v22.4s, v6.4s, v16.4s\n"
+ "ldr q13, [x20, x7]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q9, [x21, x7]\n"
+ "ldr q10, [x20, x7]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q11, [x21, x7]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "ldr q12, [x20, x7]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q28, [x9, x13]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "str q27, [x28, x13]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "str q26, [x27, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x7, x7, #0x10\n"
+ "str q25, [x24, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "cmp x7, x8, LSL #4\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "add x14, x14, #0x10\n"
+ "str q24, [x23, x13]\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "str q23, [x22, x13]\n"
+ "add x16, x16, #0xa0\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v7.4s, v9.4s\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v6.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ldr q19, [x20, x14]\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "ldr x20, [x15, #0x40]\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v4.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v3.4s, v9.4s\n"
+ "ldr x25, [x15, #0x50]\n"
+ "ldr x24, [x15, #0x58]\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "ldr q17, [x23, x14]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "ldr x23, [x15, #0x60]\n"
+ "fmla v29.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v6.4s, v17.4s\n"
+ "ldr x12, [x15, #0x70]\n"
+ "ldr x11, [x15, #0x88]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "fmla v27.4s, v3.4s, v13.4s\n"
+ "ldr x10, [x17, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v25.4s, v1.4s, v13.4s\n"
+ "ldr x9, [x17, #0x8]\n"
+ "ldr x28, [x17, #0x10]\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "ldr q18, [x22, x14]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v1.4s, v9.4s\n"
+ "fmla v29.4s, v7.4s, v17.4s\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v28.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x14]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v26.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v3.4s, v17.4s\n"
+ "ldr x27, [x17, #0x18]\n"
+ "fmla v21.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v4.4s, v19.4s\n"
+ "fmla v23.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v29.4s, v1.4s, v18.4s\n"
+ "ldr q20, [x24, x14]\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v27.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "ldr x26, [x15, #0x90]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "ldr x25, [x15, #0xa0]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "fmla v26.4s, v0.4s, v17.4s\n"
+ "fmla v24.4s, v2.4s, v20.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "fmla v27.4s, v7.4s, v19.4s\n"
+ "fmla v22.4s, v1.4s, v19.4s\n"
+ "ldr q19, [x22, x14]\n"
+ "fmla v23.4s, v3.4s, v16.4s\n"
+ "ldr x23, [x15, #0xa8]\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q18, [x20, x14]\n"
+ "fmla v25.4s, v7.4s, v19.4s\n"
+ "ldr x22, [x15, #0xc0]\n"
+ "fmla v24.4s, v6.4s, v19.4s\n"
+ "fmla v21.4s, v4.4s, v19.4s\n"
+ "fmla v29.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x14]\n"
+ "fmla v27.4s, v5.4s, v20.4s\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v23.4s, v5.4s, v19.4s\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v26.4s, v8.4s, v19.4s\n"
+ "fmla v24.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v28.4s, v3.4s, v18.4s\n"
+ "fmla v25.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v5.4s, v17.4s\n"
+ "ldr q17, [x11, x14]\n"
+ "fmla v23.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x26, x14]\n"
+ "fmla v29.4s, v4.4s, v18.4s\n"
+ "fmla v26.4s, v1.4s, v18.4s\n"
+ "ldr q18, [x24, x14]\n"
+ "fmla v28.4s, v5.4s, v17.4s\n"
+ "fmla v27.4s, v4.4s, v17.4s\n"
+ "fmla v25.4s, v2.4s, v17.4s\n"
+ "fmla v24.4s, v1.4s, v17.4s\n"
+ "ldr q17, [x25, x14]\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "fmla v22.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x23, x14]\n"
+ "fmla v29.4s, v2.4s, v17.4s\n"
+ "fmla v26.4s, v7.4s, v18.4s\n"
+ "fmla v25.4s, v6.4s, v18.4s\n"
+ "fmla v23.4s, v4.4s, v18.4s\n"
+ "fmla v21.4s, v3.4s, v18.4s\n"
+ "ldr q18, [x21, x14]\n"
+ "fmla v22.4s, v4.4s, v16.4s\n"
+ "fmla v28.4s, v1.4s, v17.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmla v27.4s, v0.4s, v17.4s\n"
+ "ldr q17, [x20, x14]\n"
+ "fmla v29.4s, v6.4s, v18.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmla v24.4s, v7.4s, v16.4s\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q29, [x10, x13]\n"
+ "fmla v23.4s, v0.4s, v18.4s\n"
+ "fmla v22.4s, v2.4s, v17.4s\n"
+ "ldr x20, [x17, #0x20]\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x22, x14]\n"
+ "fmla v26.4s, v3.4s, v18.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmla v27.4s, v8.4s, v17.4s\n"
+ "fmla v24.4s, v5.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "str q28, [x9, x13]\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "fmla v21.4s, v7.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "ldr x23, [x17, #0x28]\n"
+ "fmla v22.4s, v6.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "str q27, [x28, x13]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q26, [x27, x13]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "str q25, [x20, x13]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmax v21.4s, v21.4s, v15.4s\n"
+ "fmax v22.4s, v22.4s, v15.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "str q24, [x23, x13]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "str q23, [x22, x13]\n"
+ "str q21, [x21, x13]\n"
+ "str q22, [x20, x13]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 48f\n"
+ "ldr q31, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x13, x14\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "ldr x24, [x15, #0x0]\n"
+ "ldr x23, [x15, #0x8]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "ldr x22, [x15, #0x10]\n"
+ "ldr x21, [x15, #0x18]\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "ldr x20, [x15, #0x20]\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x24], #0x8\n"
+ "ld1 { v10.d }[0], [x23], #0x8\n"
+ "ld1 { v11.d }[0], [x22], #0x8\n"
+ "ld1 { v12.d }[0], [x21], #0x8\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x24], #0x4\n"
+ "ld1 { v10.s }[2], [x23], #0x4\n"
+ "ld1 { v11.s }[2], [x22], #0x4\n"
+ "ld1 { v12.s }[2], [x21], #0x4\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v12.s }[0], [x21], #0x4\n"
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+ "mov v23.16b, v31.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v25.16b, v31.16b\n fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "add x20, x20, x14\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v7.4s, v9.4s\n"
+ "mov v26.16b, v31.16b\n fmla v26.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v31.16b\n fmla v27.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "fmla v25.4s, v2.4s, v11.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v13.4s\n"
+ "fmla v24.4s, v4.4s, v13.4s\n"
+ "fmla v25.4s, v3.4s, v13.4s\n"
+ "fmla v26.4s, v2.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v0.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (4, 4): Bit 1: End
+ "ldr x20, [x15, #0x30]\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (2, 1): Bit 1: End
+ "ldr x20, [x15, #0x38]\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v26.4s, v4.4s, v11.4s\n"
+ "fmla v27.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (0, 1): Bit 1: End
+ "ldr x20, [x15, #0x40]\n"
+ "fmla v23.4s, v1.4s, v13.4s\n"
+ "fmla v24.4s, v0.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (0, 3): Bit 1: End
+ "ldr x20, [x15, #0x48]\n"
+ "fmla v24.4s, v2.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (2, 3): Bit 1: End
+ "ldr x20, [x15, #0x50]\n"
+ "fmla v24.4s, v8.4s, v10.4s\n"
+ "fmla v25.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v27.4s, v5.4s, v10.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (1, 0): Bit 1: End
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v23.4s, v3.4s, v11.4s\n"
+ "fmla v26.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (1, 4): Bit 1: End
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v25.4s, v5.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (3, 0): Bit 1: End
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v26.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (3, 2): Bit 1: End
+ "ldr x20, [x15, #0x70]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "fmla v29.4s, v5.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (3, 4): Bit 1: End
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (4, 1): Bit 1: End
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v29.4s, v7.4s, v13.4s\n"
+ "fmla v30.4s, v6.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (1, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "29:" // Oddments: Load input (1, 1): Bit 1: End
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x15, #0x90]\n"
+ "fmla v24.4s, v5.4s, v11.4s\n"
+ "fmla v25.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v27.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "33:" // Oddments: Load input (4, 3): Bit 1: End
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v30.4s, v8.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla v26.4s, v7.4s, v12.4s\n"
+ "fmla v27.4s, v6.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v29.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "37:" // Oddments: Load input (0, 2): Bit 1: End
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "39:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla v27.4s, v8.4s, v13.4s\n"
+ "fmla v28.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v30.4s, v5.4s, v13.4s\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "41:" // Oddments: Load input (2, 0): Bit 1: End
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v3.4s, v12.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "43:" // Oddments: Load input (2, 4): Bit 1: End
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "add x20, x20, x14\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "45:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v29.4s, v8.4s, v13.4s\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v15.4s\n"
+ "fmax v25.4s, v25.4s, v15.4s\n"
+ "fmax v26.4s, v26.4s, v15.4s\n"
+ "fmax v27.4s, v27.4s, v15.4s\n"
+ "fmax v28.4s, v28.4s, v15.4s\n"
+ "fmax v29.4s, v29.4s, v15.4s\n"
+ "fmax v30.4s, v30.4s, v15.4s\n"
+ "fmax v31.4s, v31.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Store: Bit 1: Unset
+ "ldr x20, [x17, #0x0]\n"
+ "add x20, x20, x13\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x23, [x17, #0x8]\n"
+ "ldr x22, [x17, #0x10]\n"
+ "ldr x21, [x17, #0x18]\n"
+ "add x23, x23, x13\n"
+ "add x22, x22, x13\n"
+ "ldr x20, [x17, #0x20]\n"
+ "add x21, x21, x13\n"
+ "add x20, x20, x13\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x23, [x17, #0x28]\n"
+ "ldr x22, [x17, #0x30]\n"
+ "add x23, x23, x13\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x17, #0x38]\n"
+ "add x22, x22, x13\n"
+ "add x21, x21, x13\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x17, #0x40]\n"
+ "add x20, x20, x13\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "47:" // Oddments: Store: Bit 1: End
+ "48:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..8a198c1818
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+// Strategy descriptor for an A64 (NEON) FP32 NHWC depthwise convolution:
+// 3x3 kernel, stride 1, producing a 4x4 output tile per invocation.
+// It only exposes the tile geometry and the two kernel entry points
+// (declared above); the actual compute lives in the generated assembly
+// implementations.
+class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  // Entry point taking an array of input pointers (indirect addressing).
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  // Entry point taking a base pointer plus row/column strides (direct, tiled addressing).
+  Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = float;
+  // VLType::None: fixed-width 128-bit vectors (no scalable-vector variant).
+  constexpr static auto vl_type = arm_gemm::VLType::None;
+
+  // Filter geometry: 3x3 window.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  // Unit stride in both spatial dimensions.
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Each kernel call computes a 4x4 block of output points.
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
+  // CPUInfo is unused: this strategy has no CPU-specific specialization to select.
+  a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  // Accessors used by the depthfirst driver to dispatch to the generated kernels.
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..3adf8b0d9f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1232 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x23, #0x4\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x27, x22\n" // offset = tile_i * ld_output_row
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "madd x21, x26, x4, x21\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "lsl x4, x4, #0x2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "madd x20, x26, x5, x20\n" // offset += tile_j * ld_output_col
+ "lsl x5, x5, #0x2\n"
+ "add x17, x4, x4\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x7, x7, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x15, x7, x24, LSL #2\n"
+ "mul x20, x20, x23\n" // offset *= output_tile_size
+ "add x14, x15, x24, LSL #2\n"
+ "add x8, x8, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "lsr x13, %x[n_channels], #0x2\n"
+ "add x12, x14, x24, LSL #2\n"
+ "add x11, x17, x4\n"
+ "add x10, x8, x22, LSL #2\n"
+ "add x9, x12, x24, LSL #2\n"
+ "add x28, x11, x4\n"
+ "add x27, x10, x22, LSL #2\n"
+ "add x23, x5, x5\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x26, x9, x24, LSL #2\n"
+ "add x25, x28, x4\n"
+ "add x24, x27, x22, LSL #2\n"
+ "add x22, x23, x5\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x6\n"
+ "cbz x13, 4f\n"
+ "ldr q14, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x6, x13, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "add x16, x16, #0xa0\n"
+ "ldr q9, [x14, x17]\n"
+ "ld1 { v10.4s }, [x7]\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q12, [x14, x11]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v26.16b, v14.16b\n fmla v26.4s, v4.4s, v9.4s\n"
+ "mov v28.16b, v14.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x13, LSL #4\n"
+ "mov v16.16b, v14.16b\n fmla v16.4s, v3.4s, v9.4s\n"
+ "mov v22.16b, v14.16b\n fmla v22.4s, v1.4s, v9.4s\n"
+ "add x20, x20, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v0.4s, v9.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v7.4s, v9.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v6.4s, v9.4s\n"
+ "mov v31.16b, v14.16b\n fmla v31.4s, v5.4s, v9.4s\n"
+ "mov v20.16b, v14.16b\n fmla v20.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x12, x17]\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ld1 { v30.4s }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q27, [x26, x25]\n"
+ "fmla v16.4s, v4.4s, v12.4s\n"
+ "fmla v22.4s, v2.4s, v12.4s\n"
+ "fmla v23.4s, v1.4s, v12.4s\n"
+ "mov v21.16b, v14.16b\n fmla v21.4s, v6.4s, v30.4s\n"
+ "ldr q10, [x12, x11]\n"
+ "fmla v26.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v8.4s, v12.4s\n"
+ "fmla v17.4s, v7.4s, v12.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "mov v24.16b, v14.16b\n fmla v24.4s, v3.4s, v12.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x7, x4]\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v8.4s, v27.4s\n"
+ "ldr q12, [x7, x28]\n"
+ "fmla v16.4s, v6.4s, v9.4s\n"
+ "fmla v22.4s, v4.4s, v9.4s\n"
+ "fmla v23.4s, v3.4s, v9.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "ldr q14, [x16, #0x0]\n"
+ "fmla v31.4s, v8.4s, v9.4s\n"
+ "fmla v20.4s, v5.4s, v9.4s\n"
+ "fmla v21.4s, v2.4s, v9.4s\n"
+ "ld1 { v9.4s }, [x15]\n"
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v25.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x15, x25]\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "ld1 { v12.4s }, [x9]\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v24.4s, v6.4s, v10.4s\n"
+ "fmla v22.4s, v5.4s, v10.4s\n"
+ "fmla v23.4s, v4.4s, v10.4s\n"
+ "fmla v19.4s, v3.4s, v10.4s\n"
+ "fmla v27.4s, v2.4s, v10.4s\n"
+ "fmla v18.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x15, x17]\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v20.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x9, x25]\n"
+ "fmla v26.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v2.4s, v11.4s\n"
+ "ldr q11, [x15, x11]\n"
+ "fmla v25.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v5.4s, v12.4s\n"
+ "ldr q9, [x26, x4]\n"
+ "fmla v31.4s, v2.4s, v10.4s\n"
+ "fmla v26.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v10.4s\n"
+ "ldr q10, [x14, x4]\n"
+ "fmla v25.4s, v5.4s, v11.4s\n"
+ "fmla v17.4s, v4.4s, v11.4s\n"
+ "fmla v29.4s, v3.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "fmla v24.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x14, x28]\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "fmla v27.4s, v6.4s, v9.4s\n"
+ "ldr q12, [x26, x28]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v26.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v7.4s, v10.4s\n"
+ "fmla v25.4s, v6.4s, v10.4s\n"
+ "ldr q10, [x7, x17]\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "ldr q9, [x12, x4]\n"
+ "fmla v17.4s, v8.4s, v11.4s\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "fmla v16.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v23.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "ldr q12, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v31.4s, v7.4s, v9.4s\n"
+ "fmla v26.4s, v6.4s, v9.4s\n"
+ "fmla v20.4s, v4.4s, v9.4s\n"
+ "fmla v22.4s, v3.4s, v9.4s\n"
+ "fmla v21.4s, v1.4s, v9.4s\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x12, x28]\n"
+ "fmla v28.4s, v2.4s, v10.4s\n"
+ "fmla v25.4s, v1.4s, v10.4s\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x14]\n"
+ "fmla v18.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v12.4s\n"
+ "fmla v31.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v0.4s, v10.4s\n"
+ "fmla v16.4s, v8.4s, v9.4s\n"
+ "fmla v24.4s, v7.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v9.4s\n"
+ "fmla v19.4s, v4.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v9.4s\n"
+ "ldr q11, [x9, x17]\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v17.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x17]\n"
+ "fmla v28.4s, v6.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x12]\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v18.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v24.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v2.4s, v12.4s\n"
+ "ldr q12, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x26, x17]\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmla v18.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v8.4s, v11.4s\n"
+ "fmla v22.4s, v7.4s, v11.4s\n"
+ "fmla v23.4s, v6.4s, v11.4s\n"
+ "fmla v21.4s, v5.4s, v11.4s\n"
+ "ldr q11, [x9, x11]\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v11.4s\n"
+ "fmla v18.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "fmla v24.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x26, x11]\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "ldr q10, [x15, x4]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "add x26, x26, #0x10\n"
+ "fmla v19.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x15, x28]\n"
+ "fmla v27.4s, v8.4s, v12.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v30.4s, v6.4s, v12.4s\n"
+ "ldr q12, [x9, x4]\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "fmla v25.4s, v3.4s, v10.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v26.4s, v0.4s, v10.4s\n"
+ "ldr q10, [x9, x28]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "fmla v17.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "add x9, x9, #0x10\n"
+ "fmla v16.4s, v2.4s, v11.4s\n"
+ "ldr q2, [x16, #0x30]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q11, [x7, x25]\n"
+ "ldr q1, [x16, #0x20]\n"
+ "fmla v20.4s, v7.4s, v12.4s\n"
+ "fmla v22.4s, v6.4s, v12.4s\n"
+ "ldr q6, [x16, #0x70]\n"
+ "fmla v21.4s, v4.4s, v12.4s\n"
+ "fmla v27.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x14, x11]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "fmla v23.4s, v8.4s, v10.4s\n"
+ "ldr q8, [x16, #0x90]\n"
+ "fmla v19.4s, v7.4s, v10.4s\n"
+ "ldr q7, [x16, #0x80]\n"
+ "fmla v18.4s, v5.4s, v10.4s\n"
+ "ldr q5, [x16, #0x60]\n"
+ "fmla v30.4s, v4.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x7]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "add x16, x16, #0xa0\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "st1 { v28.4s }, [x8]\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q25, [x8, x5]\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "str q17, [x8, x23]\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v24.4s, v24.4s, v15.4s\n"
+ "str q29, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "st1 { v31.4s }, [x10]\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "str q26, [x10, x5]\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "str q16, [x10, x23]\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "str q24, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v20.4s }, [x27]\n"
+ "str q22, [x27, x5]\n"
+ "str q23, [x27, x23]\n"
+ "str q19, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v21.4s }, [x24]\n"
+ "str q27, [x24, x5]\n"
+ "str q18, [x24, x23]\n"
+ "str q30, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v16.16b, v14.16b\n fmla v16.4s, v4.4s, v9.4s\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v8.4s, v9.4s\n"
+ "mov v31.16b, v14.16b\n fmla v31.4s, v3.4s, v9.4s\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "fmla v16.4s, v5.4s, v12.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v6.4s, v9.4s\n"
+ "mov v28.16b, v14.16b\n fmla v28.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v2.4s, v9.4s\n"
+ "ldr q24, [x12, x17]\n"
+ "fmla v23.4s, v0.4s, v10.4s\n"
+ "ld1 { v21.4s }, [x26]\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v2.4s, v11.4s\n"
+ "ldr q20, [x26, x25]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "mov v26.16b, v14.16b\n fmla v26.4s, v6.4s, v21.4s\n"
+ "ldr q9, [x12, x11]\n"
+ "fmla v16.4s, v7.4s, v24.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "fmla v29.4s, v6.4s, v12.4s\n"
+ "mov v11.16b, v14.16b\n fmla v11.4s, v3.4s, v12.4s\n"
+ "mov v10.16b, v14.16b\n fmla v10.4s, v0.4s, v12.4s\n"
+ "ldr q22, [x7, x4]\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v8.4s, v20.4s\n"
+ "ldr q21, [x7, x28]\n"
+ "fmla v31.4s, v6.4s, v24.4s\n"
+ "fmla v30.4s, v4.4s, v24.4s\n"
+ "fmla v18.4s, v3.4s, v24.4s\n"
+ "mov v12.16b, v14.16b\n fmla v12.4s, v1.4s, v24.4s\n"
+ "fmla v14.4s, v0.4s, v24.4s\n"
+ "fmla v28.4s, v8.4s, v24.4s\n"
+ "fmla v27.4s, v5.4s, v24.4s\n"
+ "fmla v26.4s, v2.4s, v24.4s\n"
+ "ld1 { v24.4s }, [x15]\n"
+ "fmla v16.4s, v8.4s, v9.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v17.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x15, x25]\n"
+ "fmla v19.4s, v2.4s, v21.4s\n"
+ "fmla v29.4s, v1.4s, v21.4s\n"
+ "ld1 { v20.4s }, [x9]\n"
+ "fmla v31.4s, v7.4s, v9.4s\n"
+ "fmla v11.4s, v6.4s, v9.4s\n"
+ "fmla v30.4s, v5.4s, v9.4s\n"
+ "fmla v18.4s, v4.4s, v9.4s\n"
+ "fmla v10.4s, v3.4s, v9.4s\n"
+ "fmla v12.4s, v2.4s, v9.4s\n"
+ "fmla v14.4s, v1.4s, v9.4s\n"
+ "fmla v25.4s, v0.4s, v9.4s\n"
+ "ldr q21, [x15, x17]\n"
+ "fmla v28.4s, v0.4s, v24.4s\n"
+ "fmla v27.4s, v6.4s, v20.4s\n"
+ "fmla v26.4s, v3.4s, v20.4s\n"
+ "ldr q20, [x9, x25]\n"
+ "fmla v16.4s, v1.4s, v21.4s\n"
+ "fmla v23.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v5.4s, v22.4s\n"
+ "fmla v11.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x15, x11]\n"
+ "fmla v17.4s, v4.4s, v21.4s\n"
+ "fmla v19.4s, v3.4s, v21.4s\n"
+ "fmla v31.4s, v0.4s, v21.4s\n"
+ "fmla v10.4s, v8.4s, v20.4s\n"
+ "fmla v25.4s, v5.4s, v20.4s\n"
+ "ldr q20, [x26, x4]\n"
+ "fmla v28.4s, v2.4s, v21.4s\n"
+ "fmla v16.4s, v2.4s, v22.4s\n"
+ "fmla v23.4s, v5.4s, v21.4s\n"
+ "ldr q21, [x14, x4]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v19.4s, v4.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v11.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x14, x28]\n"
+ "fmla v26.4s, v7.4s, v20.4s\n"
+ "fmla v12.4s, v6.4s, v20.4s\n"
+ "ldr q20, [x26, x28]\n"
+ "fmla v28.4s, v4.4s, v21.4s\n"
+ "fmla v16.4s, v3.4s, v21.4s\n"
+ "fmla v27.4s, v1.4s, v21.4s\n"
+ "fmla v30.4s, v0.4s, v21.4s\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "fmla v17.4s, v6.4s, v21.4s\n"
+ "ldr q21, [x7, x17]\n"
+ "fmla v14.4s, v8.4s, v20.4s\n"
+ "fmla v25.4s, v7.4s, v20.4s\n"
+ "ldr q20, [x12, x4]\n"
+ "fmla v19.4s, v8.4s, v22.4s\n"
+ "fmla v29.4s, v7.4s, v22.4s\n"
+ "fmla v31.4s, v5.4s, v22.4s\n"
+ "fmla v11.4s, v4.4s, v22.4s\n"
+ "fmla v18.4s, v2.4s, v22.4s\n"
+ "fmla v10.4s, v1.4s, v22.4s\n"
+ "ldr q22, [x7, x11]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v28.4s, v7.4s, v20.4s\n"
+ "fmla v16.4s, v6.4s, v20.4s\n"
+ "fmla v27.4s, v4.4s, v20.4s\n"
+ "fmla v30.4s, v3.4s, v20.4s\n"
+ "fmla v26.4s, v1.4s, v20.4s\n"
+ "fmla v12.4s, v0.4s, v20.4s\n"
+ "ldr q20, [x12, x28]\n"
+ "fmla v23.4s, v2.4s, v21.4s\n"
+ "fmla v17.4s, v1.4s, v21.4s\n"
+ "fmla v19.4s, v0.4s, v21.4s\n"
+ "ld1 { v21.4s }, [x14]\n"
+ "fmla v14.4s, v2.4s, v20.4s\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "fmla v28.4s, v3.4s, v21.4s\n"
+ "fmla v27.4s, v0.4s, v21.4s\n"
+ "fmla v31.4s, v8.4s, v20.4s\n"
+ "fmla v11.4s, v7.4s, v20.4s\n"
+ "fmla v18.4s, v5.4s, v20.4s\n"
+ "fmla v10.4s, v4.4s, v20.4s\n"
+ "fmla v25.4s, v1.4s, v20.4s\n"
+ "ldr q24, [x9, x17]\n"
+ "fmla v17.4s, v2.4s, v22.4s\n"
+ "fmla v19.4s, v1.4s, v22.4s\n"
+ "ldr q20, [x14, x25]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v23.4s, v6.4s, v21.4s\n"
+ "ld1 { v21.4s }, [x12]\n"
+ "fmla v12.4s, v4.4s, v24.4s\n"
+ "fmla v14.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v8.4s, v20.4s\n"
+ "fmla v11.4s, v5.4s, v20.4s\n"
+ "fmla v10.4s, v2.4s, v20.4s\n"
+ "ldr q20, [x12, x25]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.4s, v6.4s, v21.4s\n"
+ "fmla v27.4s, v3.4s, v21.4s\n"
+ "fmla v26.4s, v0.4s, v21.4s\n"
+ "ldr q22, [x26, x17]\n"
+ "fmla v25.4s, v2.4s, v20.4s\n"
+ "fmla v12.4s, v7.4s, v22.4s\n"
+ "fmla v14.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v8.4s, v24.4s\n"
+ "fmla v30.4s, v7.4s, v24.4s\n"
+ "fmla v18.4s, v6.4s, v24.4s\n"
+ "fmla v26.4s, v5.4s, v24.4s\n"
+ "ldr q21, [x9, x11]\n"
+ "fmla v10.4s, v5.4s, v20.4s\n"
+ "fmla v12.4s, v5.4s, v21.4s\n"
+ "fmla v14.4s, v4.4s, v21.4s\n"
+ "fmla v25.4s, v3.4s, v21.4s\n"
+ "fmla v11.4s, v8.4s, v20.4s\n"
+ "ldr q20, [x26, x11]\n"
+ "fmla v26.4s, v8.4s, v22.4s\n"
+ "ldr q9, [x15, x4]\n"
+ "fmla v30.4s, v8.4s, v21.4s\n"
+ "fmla v18.4s, v7.4s, v21.4s\n"
+ "add x26, x26, #0x10\n"
+ "fmla v10.4s, v6.4s, v21.4s\n"
+ "ldr q21, [x15, x28]\n"
+ "fmla v12.4s, v8.4s, v20.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmla v14.4s, v7.4s, v20.4s\n"
+ "fmla v25.4s, v6.4s, v20.4s\n"
+ "ldr q24, [x9, x4]\n"
+ "fmla v23.4s, v4.4s, v9.4s\n"
+ "fmla v17.4s, v3.4s, v9.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v9.4s\n"
+ "ldr q0, [x9, x28]\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v19.4s, v5.4s, v21.4s\n"
+ "fmla v29.4s, v4.4s, v21.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.4s, v2.4s, v21.4s\n"
+ "fmla v11.4s, v1.4s, v21.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmla v27.4s, v7.4s, v24.4s\n"
+ "fmla v30.4s, v6.4s, v24.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v26.4s, v4.4s, v24.4s\n"
+ "fmla v12.4s, v3.4s, v24.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v18.4s, v8.4s, v0.4s\n"
+ "fmla v10.4s, v7.4s, v0.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmla v14.4s, v5.4s, v0.4s\n"
+ "fmla v25.4s, v4.4s, v0.4s\n"
+ "fmax v11.4s, v11.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v12.4s, v12.4s, v13.4s\n"
+ "fmax v14.4s, v14.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "st1 { v23.4s }, [x8]\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q17, [x8, x5]\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "str q19, [x8, x23]\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v11.4s, v11.4s, v15.4s\n"
+ "str q29, [x8, x22]\n"
+ "add x8, x8, #0x10\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "st1 { v28.4s }, [x10]\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v10.4s, v10.4s, v15.4s\n"
+ "str q16, [x10, x5]\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "fmin v12.4s, v12.4s, v15.4s\n"
+ "str q31, [x10, x23]\n"
+ "fmin v14.4s, v14.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "str q11, [x10, x22]\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v27.4s }, [x27]\n"
+ "str q30, [x27, x5]\n"
+ "str q18, [x27, x23]\n"
+ "str q10, [x27, x22]\n"
+ "add x27, x27, #0x10\n"
+ "st1 { v26.4s }, [x24]\n"
+ "str q12, [x24, x5]\n"
+ "str q14, [x24, x23]\n"
+ "str q25, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 73f\n"
+ "ldr q14, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "add x23, x14, x17\n"
+ "add x22, x7, XZR\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x21, x7, x25\n"
+ "add x20, x14, x11\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "ldr q5, [x16, #0x60]\n"
+ "ldr q6, [x16, #0x70]\n"
+ "ldr q7, [x16, #0x80]\n"
+ "ldr q8, [x16, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x23], #0x8\n"
+ "ldr d10, [x22], #0x8\n"
+ "ldr d11, [x21], #0x8\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x23]\n"
+ "ld1 { v10.s }[2], [x22]\n"
+ "ld1 { v11.s }[2], [x21]\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+ "ldr s9, [x23, #0x0]\n"
+ "ldr s10, [x22, #0x0]\n"
+ "ldr s11, [x21, #0x0]\n"
+ "ldr s12, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+ "mov v16.16b, v14.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v14.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "add x20, x26, XZR\n"
+ "mov v18.16b, v14.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v14.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v14.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v14.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v14.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v14.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v14.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v14.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "fmla v22.4s, v4.4s, v12.4s\n"
+ "mov v23.16b, v14.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "mov v27.16b, v14.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+ "mov v28.16b, v14.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "add x20, x26, x25\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+ "mov v31.16b, v14.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x12, x17\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v20.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "add x20, x7, x4\n"
+ "fmla v22.4s, v6.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "fmla v25.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "mov v29.16b, v14.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "mov v30.16b, v14.16b\n fmla v30.4s, v0.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v16.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v0.4s, v12.4s\n"
+ "add x20, x7, x28\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: End
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "add x20, x12, x11\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "add x20, x15, XZR\n"
+ "fmla v23.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v0.4s, v9.4s\n"
+ "add x20, x15, x25\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: End
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "add x20, x9, XZR\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x15, x17\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+ "fmla v16.4s, v5.4s, v10.4s\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "add x20, x9, x25\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v2.4s, v10.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+ "fmla v27.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x15, x11\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v17.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "add x20, x26, x4\n"
+ "fmla v19.4s, v3.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v29.4s, v6.4s, v11.4s\n"
+ "add x20, x14, x4\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "add x20, x26, x28\n"
+ "fmla v20.4s, v4.4s, v10.4s\n"
+ "fmla v21.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "add x20, x14, x28\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "add x20, x7, x17\n"
+ "fmla v22.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v16.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "add x20, x12, x4\n"
+ "fmla v18.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v20.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "add x20, x7, x11\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "add x20, x14, XZR\n"
+ "fmla v19.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v16.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "add x20, x12, x28\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "add x20, x14, x25\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 50f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 50f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "50:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "add x20, x12, XZR\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v20.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v3.4s, v10.4s\n"
+ "add x20, x9, x17\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 54f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 54f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "54:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v24.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v11.4s\n"
+ "add x20, x12, x25\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "add x20, x26, x17\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 58f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 58f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x9, x11\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "add x20, x26, x11\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 62f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 62f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "62:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "add x20, x15, x4\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "add x20, x15, x28\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 66f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 66f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "66:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "add x20, x9, x4\n"
+ "fmla v22.4s, v2.4s, v11.4s\n"
+ "fmla v23.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "add x20, x9, x28\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 70f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 70f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "70:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v15.4s\n"
+ "fmin v17.4s, v17.4s, v15.4s\n"
+ "fmin v18.4s, v18.4s, v15.4s\n"
+ "fmin v19.4s, v19.4s, v15.4s\n"
+ "fmin v20.4s, v20.4s, v15.4s\n"
+ "fmin v21.4s, v21.4s, v15.4s\n"
+ "fmin v22.4s, v22.4s, v15.4s\n"
+ "fmin v23.4s, v23.4s, v15.4s\n"
+ "fmin v24.4s, v24.4s, v15.4s\n"
+ "fmin v25.4s, v25.4s, v15.4s\n"
+ "fmin v26.4s, v26.4s, v15.4s\n"
+ "fmin v27.4s, v27.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "tbz %x[n_channels], #1, 71f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.d }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.d }[0], [x22], x5\n"
+ "st1 { v24.d }[0], [x21], x5\n"
+ "add x8, x8, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v28.d }[0], [x20], x5\n"
+ "add x27, x27, #0x8\n"
+ "add x24, x24, #0x8\n"
+ "st1 { v17.d }[0], [x23], x5\n"
+ "st1 { v21.d }[0], [x22], x5\n"
+ "st1 { v25.d }[0], [x21], x5\n"
+ "st1 { v29.d }[0], [x20], x5\n"
+ "st1 { v18.d }[0], [x23], x5\n"
+ "st1 { v22.d }[0], [x22], x5\n"
+ "st1 { v26.d }[0], [x21], x5\n"
+ "st1 { v30.d }[0], [x20], x5\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "st1 { v23.d }[0], [x22]\n"
+ "st1 { v27.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[2], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[2], [x22], x5\n"
+ "st1 { v24.s }[2], [x21], x5\n"
+ "st1 { v28.s }[2], [x20], x5\n"
+ "st1 { v17.s }[2], [x23], x5\n"
+ "st1 { v21.s }[2], [x22], x5\n"
+ "st1 { v25.s }[2], [x21], x5\n"
+ "st1 { v29.s }[2], [x20], x5\n"
+ "st1 { v18.s }[2], [x23], x5\n"
+ "st1 { v22.s }[2], [x22], x5\n"
+ "st1 { v26.s }[2], [x21], x5\n"
+ "st1 { v30.s }[2], [x20], x5\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v23.s }[2], [x22]\n"
+ "st1 { v27.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x23, x8\n"
+ "mov x22, x10\n"
+ "st1 { v16.s }[0], [x23], x5\n"
+ "mov x21, x27\n"
+ "mov x20, x24\n"
+ "st1 { v20.s }[0], [x22], x5\n"
+ "st1 { v24.s }[0], [x21], x5\n"
+ "st1 { v28.s }[0], [x20], x5\n"
+ "st1 { v17.s }[0], [x23], x5\n"
+ "st1 { v21.s }[0], [x22], x5\n"
+ "st1 { v25.s }[0], [x21], x5\n"
+ "st1 { v29.s }[0], [x20], x5\n"
+ "st1 { v18.s }[0], [x23], x5\n"
+ "st1 { v22.s }[0], [x22], x5\n"
+ "st1 { v26.s }[0], [x21], x5\n"
+ "st1 { v30.s }[0], [x20], x5\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "st1 { v27.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "72:" // Tile loop: Oddments: Store: Bit 1: End
+ "73:" // Tile loop: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..76045f30d6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "mov x6, #0x10\n" // cntb _, ALL, #1
+ "lsr x7, %x[n_channels], #0x2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "sub x14, XZR, x6\n"
+ "cbz x7, 3f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x6, x7, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr q10, [x20, x15]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x15]\n"
+ "ldr q12, [x20, x15]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v23.16b, v30.16b\n fmla v23.4s, v4.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v3.4s, v9.4s\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v1.4s, v9.4s\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v0.4s, v9.4s\n"
+ "mov v16.16b, v30.16b\n fmla v16.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x20, [x16, #0x48]\n"
+ "mov v15.16b, v30.16b\n fmla v15.4s, v6.4s, v9.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x21, [x16, #0x58]\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v5.4s, v9.4s\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v10.16b, v30.16b\n fmla v10.4s, v2.4s, v11.4s\n"
+ "ldr q18, [x23, x15]\n"
+ "fmla v25.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v20.4s, v1.4s, v12.4s\n"
+ "fmla v16.4s, v8.4s, v12.4s\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v15.4s, v7.4s, v12.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v6.4s, v22.4s\n"
+ "ldr q22, [x20, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v23.4s, v7.4s, v9.4s\n"
+ "fmla v10.4s, v6.4s, v12.4s\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v3.4s, v12.4s\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v24.16b, v30.16b\n fmla v24.4s, v8.4s, v18.4s\n"
+ "ldr q12, [x26, x15]\n"
+ "fmla v25.4s, v6.4s, v9.4s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v28.4s, v4.4s, v9.4s\n"
+ "fmla v20.4s, v3.4s, v9.4s\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v1.4s, v9.4s\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v0.4s, v9.4s\n"
+ "ldr q30, [x17, #0x0]\n"
+ "fmla v27.4s, v8.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x25, x15]\n"
+ "fmla v17.4s, v1.4s, v11.4s\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v16.4s, v0.4s, v11.4s\n"
+ "ldr q11, [x21, x15]\n"
+ "fmla v15.4s, v2.4s, v12.4s\n"
+ "ldr x21, [x16, #0x98]\n"
+ "fmla v23.4s, v8.4s, v22.4s\n"
+ "fmla v10.4s, v1.4s, v12.4s\n"
+ "ldr q12, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.4s, v7.4s, v22.4s\n"
+ "fmla v21.4s, v6.4s, v22.4s\n"
+ "fmla v28.4s, v5.4s, v22.4s\n"
+ "fmla v20.4s, v4.4s, v22.4s\n"
+ "fmla v19.4s, v3.4s, v22.4s\n"
+ "fmla v26.4s, v2.4s, v22.4s\n"
+ "fmla v18.4s, v1.4s, v22.4s\n"
+ "fmla v24.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.4s, v3.4s, v9.4s\n"
+ "fmla v27.4s, v0.4s, v9.4s\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "ldr q9, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v16.4s, v4.4s, v22.4s\n"
+ "fmla v15.4s, v3.4s, v22.4s\n"
+ "fmla v23.4s, v1.4s, v22.4s\n"
+ "fmla v10.4s, v5.4s, v11.4s\n"
+ "fmla v21.4s, v2.4s, v11.4s\n"
+ "ldr q12, [x22, x15]\n"
+ "fmla v25.4s, v0.4s, v22.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.4s, v8.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "ldr q11, [x20, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v27.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v16.4s, v5.4s, v12.4s\n"
+ "fmla v15.4s, v4.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "fmla v10.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v1.4s, v12.4s\n"
+ "fmla v21.4s, v0.4s, v12.4s\n"
+ "ldr q9, [x21, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v29.4s, v7.4s, v11.4s\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "ldr q12, [x27, x15]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla v17.4s, v7.4s, v22.4s\n"
+ "fmla v16.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v4.4s, v22.4s\n"
+ "fmla v23.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v28.4s, v0.4s, v22.4s\n"
+ "ldr q11, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v15.4s, v8.4s, v9.4s\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "ldr q12, [x25, x15]\n"
+ "fmla v19.4s, v1.4s, v9.4s\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v10.4s, v7.4s, v9.4s\n"
+ "fmla v25.4s, v5.4s, v9.4s\n"
+ "fmla v21.4s, v4.4s, v9.4s\n"
+ "fmla v20.4s, v2.4s, v9.4s\n"
+ "ldr q9, [x24, x15]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v17.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v1.4s, v11.4s\n"
+ "fmla v15.4s, v0.4s, v11.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v27.4s, v7.4s, v12.4s\n"
+ "ldr x25, [x16, #0xf8]\n"
+ "fmla v23.4s, v6.4s, v12.4s\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v12.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v26.4s, v0.4s, v12.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v16.4s, v2.4s, v9.4s\n"
+ "fmla v15.4s, v1.4s, v9.4s\n"
+ "fmla v10.4s, v0.4s, v9.4s\n"
+ "ldr q9, [x20, x15]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v17.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "ldr x22, [x16, #0x110]\n"
+ "fmla v21.4s, v7.4s, v11.4s\n"
+ "fmla v20.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q12, [x28, x15]\n"
+ "fmla v19.4s, v2.4s, v9.4s\n"
+ "ldr x21, [x16, #0x118]\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "fmla v26.4s, v4.4s, v12.4s\n"
+ "fmla v18.4s, v3.4s, v12.4s\n"
+ "fmla v10.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v5.4s, v9.4s\n"
+ "ldr q11, [x27, x15]\n"
+ "fmla v27.4s, v6.4s, v22.4s\n"
+ "fmla v31.4s, v3.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v28.4s, v7.4s, v12.4s\n"
+ "fmla v20.4s, v6.4s, v12.4s\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "fmla v19.4s, v5.4s, v11.4s\n"
+ "fmla v24.4s, v2.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v22.4s\n"
+ "fmla v18.4s, v6.4s, v22.4s\n"
+ "fmla v31.4s, v8.4s, v12.4s\n"
+ "ldr q12, [x24, x15]\n"
+ "fmla v29.4s, v8.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v28.4s, v8.4s, v12.4s\n"
+ "fmla v20.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v26.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "fmla v24.4s, v3.4s, v12.4s\n"
+ "ldr q12, [x20, x15]\n"
+ "ldp x20, x24, [x16, #0x0]\n"
+ "ldr q9, [x20, x6]\n"
+ "fmla v21.4s, v8.4s, v11.4s\n"
+ "ldr q11, [x25, x15]\n"
+ "fmla v17.4s, v4.4s, v22.4s\n"
+ "fmla v16.4s, v3.4s, v22.4s\n"
+ "fmla v15.4s, v5.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v10.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v8.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v18.4s, v7.4s, v11.4s\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "ldr q11, [x22, x15]\n"
+ "fmax v15.4s, v15.4s, v13.4s\n"
+ "fmla v27.4s, v1.4s, v22.4s\n"
+ "fmla v23.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v21.4s, v1.4s, v12.4s\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "fmla v28.4s, v6.4s, v11.4s\n"
+ "ldr q6, [x17, #0x70]\n"
+ "fmla v20.4s, v8.4s, v22.4s\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v19.4s, v7.4s, v22.4s\n"
+ "ldr q7, [x17, #0x80]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "str q17, [x12, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmin v15.4s, v15.4s, v14.4s\n"
+ "fmin v10.4s, v10.4s, v14.4s\n"
+ "str q16, [x11, x14]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "str q15, [x10, x14]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "str q10, [x9, x14]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v26.4s, v3.4s, v11.4s\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmla v18.4s, v5.4s, v22.4s\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v24.4s, v4.4s, v22.4s\n"
+ "ldr q10, [x24, x6]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "str q27, [x23, x14]\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "str q23, [x22, x14]\n"
+ "ldr x25, [x8, #0x40]\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "str q25, [x21, x14]\n"
+ "ldr x23, [x8, #0x48]\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "str q21, [x20, x14]\n"
+ "ldr x22, [x8, #0x50]\n"
+ "ldr x24, [x8, #0x58]\n"
+ "ldp x21, x20, [x16, #0x10]\n"
+ "ldr q11, [x21, x6]\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "ldr q12, [x20, x6]\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "str q31, [x25, x14]\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "str q28, [x23, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "str q20, [x22, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "str q19, [x24, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x6, x6, #0x10\n"
+ "cmp x6, x7, LSL #4\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "add x15, x15, #0x10\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "str q29, [x23, x14]\n"
+ "add x17, x17, #0xa0\n"
+ "str q26, [x22, x14]\n"
+ "str q18, [x21, x14]\n"
+ "str q24, [x20, x14]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v30.16b\n fmla v31.4s, v4.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v8.4s, v9.4s\n"
+ "ldr x27, [x16, #0x20]\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v15.16b, v30.16b\n fmla v15.4s, v3.4s, v9.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "ldr x23, [x16, #0x28]\n"
+ "ldr x22, [x16, #0x38]\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v0.4s, v9.4s\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v7.4s, v9.4s\n"
+ "ldr x26, [x16, #0x40]\n"
+ "ldr x21, [x16, #0x48]\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v6.4s, v9.4s\n"
+ "fmla v31.4s, v5.4s, v12.4s\n"
+ "ldr x25, [x16, #0x50]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v5.4s, v9.4s\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v2.4s, v9.4s\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr x13, [x16, #0x70]\n"
+ "fmla v17.4s, v0.4s, v10.4s\n"
+ "ldr q22, [x27, x15]\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v2.4s, v11.4s\n"
+ "ldr q16, [x23, x15]\n"
+ "fmla v15.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v2.4s, v12.4s\n"
+ "ldr x24, [x16, #0x60]\n"
+ "ldr x23, [x16, #0x68]\n"
+ "fmla v19.4s, v1.4s, v12.4s\n"
+ "fmla v20.4s, v8.4s, v12.4s\n"
+ "ldr x12, [x8, #0x0]\n"
+ "ldr x11, [x8, #0x8]\n"
+ "fmla v21.4s, v7.4s, v12.4s\n"
+ "mov v10.16b, v30.16b\n fmla v10.4s, v6.4s, v22.4s\n"
+ "ldr q22, [x21, x15]\n"
+ "ldr x28, [x16, #0x88]\n"
+ "fmla v31.4s, v7.4s, v24.4s\n"
+ "fmla v28.4s, v6.4s, v12.4s\n"
+ "ldr x10, [x8, #0x10]\n"
+ "ldr x9, [x8, #0x18]\n"
+ "mov v9.16b, v30.16b\n fmla v9.4s, v3.4s, v12.4s\n"
+ "mov v11.16b, v30.16b\n fmla v11.4s, v0.4s, v12.4s\n"
+ "ldr q23, [x22, x15]\n"
+ "ldr x22, [x16, #0x78]\n"
+ "mov v12.16b, v30.16b\n fmla v12.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x26, x15]\n"
+ "fmla v15.4s, v6.4s, v24.4s\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla v29.4s, v4.4s, v24.4s\n"
+ "fmla v19.4s, v3.4s, v24.4s\n"
+ "add x14, x14, #0x10\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v1.4s, v24.4s\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v0.4s, v24.4s\n"
+ "fmla v18.4s, v8.4s, v24.4s\n"
+ "fmla v27.4s, v5.4s, v24.4s\n"
+ "fmla v10.4s, v2.4s, v24.4s\n"
+ "ldr q24, [x25, x15]\n"
+ "fmla v17.4s, v1.4s, v23.4s\n"
+ "ldr x27, [x16, #0x90]\n"
+ "fmla v20.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x20, x15]\n"
+ "fmla v21.4s, v2.4s, v16.4s\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v31.4s, v8.4s, v22.4s\n"
+ "fmla v28.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x24, x15]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v15.4s, v7.4s, v22.4s\n"
+ "fmla v9.4s, v6.4s, v22.4s\n"
+ "fmla v29.4s, v5.4s, v22.4s\n"
+ "fmla v19.4s, v4.4s, v22.4s\n"
+ "fmla v11.4s, v3.4s, v22.4s\n"
+ "fmla v26.4s, v2.4s, v22.4s\n"
+ "fmla v25.4s, v1.4s, v22.4s\n"
+ "fmla v12.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v17.4s, v3.4s, v24.4s\n"
+ "fmla v18.4s, v0.4s, v24.4s\n"
+ "fmla v27.4s, v6.4s, v16.4s\n"
+ "fmla v10.4s, v3.4s, v16.4s\n"
+ "ldr q16, [x13, x15]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v20.4s, v4.4s, v22.4s\n"
+ "fmla v21.4s, v3.4s, v22.4s\n"
+ "fmla v31.4s, v1.4s, v22.4s\n"
+ "fmla v28.4s, v5.4s, v23.4s\n"
+ "fmla v9.4s, v2.4s, v23.4s\n"
+ "ldr q23, [x22, x15]\n"
+ "fmla v15.4s, v0.4s, v22.4s\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v11.4s, v8.4s, v16.4s\n"
+ "fmla v12.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x21, x15]\n"
+ "ldr x22, [x16, #0xc0]\n"
+ "fmla v17.4s, v5.4s, v22.4s\n"
+ "fmla v18.4s, v2.4s, v22.4s\n"
+ "ldr q22, [x28, x15]\n"
+ "ldr x21, [x16, #0xc8]\n"
+ "fmla v20.4s, v5.4s, v23.4s\n"
+ "fmla v21.4s, v4.4s, v23.4s\n"
+ "fmla v31.4s, v2.4s, v23.4s\n"
+ "fmla v28.4s, v3.4s, v23.4s\n"
+ "fmla v15.4s, v1.4s, v23.4s\n"
+ "fmla v9.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x20, x15]\n"
+ "ldr x28, [x16, #0xd8]\n"
+ "fmla v10.4s, v7.4s, v16.4s\n"
+ "fmla v26.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x27, x15]\n"
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v17.4s, v7.4s, v22.4s\n"
+ "fmla v20.4s, v6.4s, v22.4s\n"
+ "fmla v18.4s, v4.4s, v22.4s\n"
+ "fmla v31.4s, v3.4s, v22.4s\n"
+ "fmla v27.4s, v1.4s, v22.4s\n"
+ "fmla v29.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "ldr x27, [x16, #0xe0]\n"
+ "fmla v21.4s, v8.4s, v23.4s\n"
+ "fmla v25.4s, v8.4s, v16.4s\n"
+ "fmla v12.4s, v7.4s, v16.4s\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v11.4s, v1.4s, v23.4s\n"
+ "ldr x26, [x16, #0xe8]\n"
+ "fmla v28.4s, v7.4s, v23.4s\n"
+ "fmla v15.4s, v5.4s, v23.4s\n"
+ "fmla v9.4s, v4.4s, v23.4s\n"
+ "fmla v19.4s, v2.4s, v23.4s\n"
+ "ldr q23, [x24, x15]\n"
+ "ldr x25, [x16, #0xf0]\n"
+ "fmla v17.4s, v2.4s, v22.4s\n"
+ "fmla v20.4s, v1.4s, v22.4s\n"
+ "fmla v21.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x23, x15]\n"
+ "fmla v18.4s, v7.4s, v16.4s\n"
+ "ldr x24, [x16, #0xf8]\n"
+ "fmla v31.4s, v6.4s, v16.4s\n"
+ "fmla v27.4s, v4.4s, v16.4s\n"
+ "fmla v29.4s, v3.4s, v16.4s\n"
+ "fmla v10.4s, v1.4s, v16.4s\n"
+ "fmla v26.4s, v0.4s, v16.4s\n"
+ "ldr q16, [x22, x15]\n"
+ "fmla v11.4s, v4.4s, v16.4s\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v20.4s, v2.4s, v23.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v28.4s, v0.4s, v23.4s\n"
+ "ldr q23, [x21, x15]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla v17.4s, v6.4s, v22.4s\n"
+ "fmla v18.4s, v3.4s, v22.4s\n"
+ "fmla v27.4s, v0.4s, v22.4s\n"
+ "ldr q22, [x20, x15]\n"
+ "fmla v15.4s, v8.4s, v16.4s\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla v9.4s, v7.4s, v16.4s\n"
+ "fmla v19.4s, v5.4s, v16.4s\n"
+ "fmla v12.4s, v1.4s, v16.4s\n"
+ "ldr q16, [x28, x15]\n"
+ "fmla v11.4s, v2.4s, v23.4s\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v10.4s, v0.4s, v22.4s\n"
+ "fmla v26.4s, v4.4s, v16.4s\n"
+ "fmla v25.4s, v3.4s, v16.4s\n"
+ "fmla v28.4s, v8.4s, v23.4s\n"
+ "fmla v9.4s, v5.4s, v23.4s\n"
+ "ldr q23, [x27, x15]\n"
+ "fmla v18.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v3.4s, v22.4s\n"
+ "ldr q22, [x26, x15]\n"
+ "fmla v29.4s, v7.4s, v16.4s\n"
+ "fmla v19.4s, v6.4s, v16.4s\n"
+ "fmla v10.4s, v5.4s, v16.4s\n"
+ "fmla v11.4s, v5.4s, v23.4s\n"
+ "fmla v12.4s, v2.4s, v23.4s\n"
+ "fmla v26.4s, v7.4s, v22.4s\n"
+ "fmla v25.4s, v6.4s, v22.4s\n"
+ "fmla v27.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x25, x15]\n"
+ "fmla v10.4s, v8.4s, v22.4s\n"
+ "ldr q30, [x23, x15]\n"
+ "fmla v29.4s, v8.4s, v16.4s\n"
+ "fmla v19.4s, v7.4s, v16.4s\n"
+ "fmla v11.4s, v6.4s, v16.4s\n"
+ "fmla v26.4s, v5.4s, v16.4s\n"
+ "fmla v25.4s, v4.4s, v16.4s\n"
+ "fmla v12.4s, v3.4s, v16.4s\n"
+ "ldr q24, [x22, x15]\n"
+ "fmla v9.4s, v8.4s, v23.4s\n"
+ "ldr q16, [x24, x15]\n"
+ "fmla v17.4s, v4.4s, v30.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmla v20.4s, v3.4s, v30.4s\n"
+ "fmla v21.4s, v5.4s, v24.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v24.4s\n"
+ "fmla v26.4s, v8.4s, v16.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmla v25.4s, v7.4s, v16.4s\n"
+ "fmla v12.4s, v6.4s, v16.4s\n"
+ "ldr q23, [x21, x15]\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmla v18.4s, v1.4s, v30.4s\n"
+ "fmla v31.4s, v0.4s, v30.4s\n"
+ "ldr q16, [x20, x15]\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmla v15.4s, v2.4s, v24.4s\n"
+ "fmla v9.4s, v1.4s, v24.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "str q17, [x12, x14]\n"
+ "fmla v27.4s, v7.4s, v23.4s\n"
+ "fmla v29.4s, v6.4s, v23.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "str q20, [x11, x14]\n"
+ "fmla v19.4s, v8.4s, v16.4s\n"
+ "fmla v11.4s, v7.4s, v16.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "str q21, [x10, x14]\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "str q28, [x9, x14]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "fmax v15.4s, v15.4s, v13.4s\n"
+ "fmax v9.4s, v9.4s, v13.4s\n"
+ "ldr x22, [x8, #0x28]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "fmla v10.4s, v4.4s, v23.4s\n"
+ "fmla v26.4s, v3.4s, v23.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmla v25.4s, v5.4s, v16.4s\n"
+ "fmla v12.4s, v4.4s, v16.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "str q18, [x23, x14]\n"
+ "fmin v15.4s, v15.4s, v14.4s\n"
+ "fmin v9.4s, v9.4s, v14.4s\n"
+ "str q31, [x22, x14]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "str q15, [x21, x14]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v11.4s, v11.4s, v13.4s\n"
+ "str q9, [x20, x14]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "str q27, [x23, x14]\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v11.4s, v11.4s, v14.4s\n"
+ "str q29, [x22, x14]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "fmax v10.4s, v10.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "str q19, [x21, x14]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v12.4s, v12.4s, v13.4s\n"
+ "str q11, [x20, x14]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "fmin v10.4s, v10.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "str q10, [x23, x14]\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v12.4s, v12.4s, v14.4s\n"
+ "str q26, [x22, x14]\n"
+ "add x15, x15, #0x10\n"
+ "str q25, [x21, x14]\n"
+ "str q12, [x20, x14]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 72f\n"
+ "ldr q30, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "mov x14, x15\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "ldr x23, [x16, #0x0]\n"
+ "ldr x22, [x16, #0x8]\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "ldr x21, [x16, #0x10]\n"
+ "ldr x20, [x16, #0x18]\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x23], #0x8\n"
+ "ld1 { v10.d }[0], [x22], #0x8\n"
+ "ld1 { v11.d }[0], [x21], #0x8\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x23], #0x4\n"
+ "ld1 { v10.s }[2], [x22], #0x4\n"
+ "ld1 { v11.s }[2], [x21], #0x4\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+ "ld1 { v9.s }[0], [x23], #0x4\n"
+ "ld1 { v10.s }[0], [x22], #0x4\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+ "mov v16.16b, v30.16b\n fmla v16.4s, v8.4s, v9.4s\n"
+ "mov v17.16b, v30.16b\n fmla v17.4s, v7.4s, v9.4s\n"
+ "ldr x20, [x16, #0x20]\n"
+ "add x20, x20, x15\n"
+ "mov v18.16b, v30.16b\n fmla v18.4s, v6.4s, v9.4s\n"
+ "mov v21.16b, v30.16b\n fmla v21.4s, v4.4s, v9.4s\n"
+ "mov v22.16b, v30.16b\n fmla v22.4s, v3.4s, v9.4s\n"
+ "mov v25.16b, v30.16b\n fmla v25.4s, v1.4s, v9.4s\n"
+ "mov v26.16b, v30.16b\n fmla v26.4s, v0.4s, v9.4s\n"
+ "mov v19.16b, v30.16b\n fmla v19.4s, v2.4s, v11.4s\n"
+ "mov v20.16b, v30.16b\n fmla v20.4s, v5.4s, v9.4s\n"
+ "mov v24.16b, v30.16b\n fmla v24.4s, v2.4s, v9.4s\n"
+ "fmla v16.4s, v0.4s, v10.4s\n"
+ "fmla v17.4s, v8.4s, v12.4s\n"
+ "fmla v18.4s, v7.4s, v12.4s\n"
+ "fmla v19.4s, v6.4s, v12.4s\n"
+ "fmla v21.4s, v5.4s, v12.4s\n"
+ "fmla v22.4s, v4.4s, v12.4s\n"
+ "mov v23.16b, v30.16b\n fmla v23.4s, v3.4s, v12.4s\n"
+ "fmla v25.4s, v2.4s, v12.4s\n"
+ "fmla v26.4s, v1.4s, v12.4s\n"
+ "mov v27.16b, v30.16b\n fmla v27.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (5, 0): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (5, 0): Bit 1: End
+ "ldr x20, [x16, #0x28]\n"
+ "mov v28.16b, v30.16b\n fmla v28.4s, v6.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (5, 5): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (5, 5): Bit 1: End
+ "ldr x20, [x16, #0x30]\n"
+ "mov v31.16b, v30.16b\n fmla v31.4s, v8.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (3, 2): Bit 1: End
+ "ldr x20, [x16, #0x38]\n"
+ "fmla v20.4s, v8.4s, v9.4s\n"
+ "fmla v21.4s, v7.4s, v9.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v22.4s, v6.4s, v9.4s\n"
+ "fmla v24.4s, v5.4s, v9.4s\n"
+ "fmla v25.4s, v4.4s, v9.4s\n"
+ "fmla v26.4s, v3.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "mov v29.16b, v30.16b\n fmla v29.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v0.4s, v9.4s\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (0, 1): Bit 1: End
+ "ldr x20, [x16, #0x40]\n"
+ "fmla v16.4s, v1.4s, v12.4s\n"
+ "fmla v17.4s, v0.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (0, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (0, 4): Bit 1: End
+ "ldr x20, [x16, #0x48]\n"
+ "fmla v18.4s, v2.4s, v11.4s\n"
+ "fmla v19.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr x20, [x16, #0x50]\n"
+ "fmla v21.4s, v8.4s, v10.4s\n"
+ "fmla v22.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v23.4s, v6.4s, v10.4s\n"
+ "fmla v25.4s, v5.4s, v10.4s\n"
+ "fmla v26.4s, v4.4s, v10.4s\n"
+ "fmla v27.4s, v3.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v10.4s\n"
+ "fmla v31.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (1, 0): Bit 1: End
+ "ldr x20, [x16, #0x58]\n"
+ "fmla v16.4s, v3.4s, v9.4s\n"
+ "fmla v20.4s, v0.4s, v9.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (1, 5): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (1, 5): Bit 1: End
+ "ldr x20, [x16, #0x60]\n"
+ "fmla v19.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v2.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (4, 0): Bit 1: End
+ "ldr x20, [x16, #0x68]\n"
+ "fmla v24.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (1, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (1, 2): Bit 1: End
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v16.4s, v5.4s, v10.4s\n"
+ "fmla v17.4s, v4.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v18.4s, v3.4s, v10.4s\n"
+ "fmla v20.4s, v2.4s, v10.4s\n"
+ "fmla v21.4s, v1.4s, v10.4s\n"
+ "fmla v22.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 5): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (4, 5): Bit 1: End
+ "ldr x20, [x16, #0x78]\n"
+ "fmla v27.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v5.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "29:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x16, #0x80]\n"
+ "fmla v17.4s, v5.4s, v12.4s\n"
+ "fmla v18.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v19.4s, v3.4s, v12.4s\n"
+ "fmla v21.4s, v2.4s, v12.4s\n"
+ "fmla v22.4s, v1.4s, v12.4s\n"
+ "fmla v23.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (5, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (5, 1): Bit 1: End
+ "ldr x20, [x16, #0x88]\n"
+ "fmla v28.4s, v7.4s, v11.4s\n"
+ "fmla v29.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "33:" // Oddments: Load input (2, 1): Bit 1: End
+ "ldr x20, [x16, #0x90]\n"
+ "fmla v16.4s, v7.4s, v10.4s\n"
+ "fmla v17.4s, v6.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v20.4s, v4.4s, v10.4s\n"
+ "fmla v21.4s, v3.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v10.4s\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (5, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (5, 4): Bit 1: End
+ "ldr x20, [x16, #0x98]\n"
+ "fmla v30.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "37:" // Oddments: Load input (2, 4): Bit 1: End
+ "ldr x20, [x16, #0xa0]\n"
+ "fmla v18.4s, v8.4s, v12.4s\n"
+ "fmla v19.4s, v7.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v22.4s, v5.4s, v12.4s\n"
+ "fmla v23.4s, v4.4s, v12.4s\n"
+ "fmla v26.4s, v2.4s, v12.4s\n"
+ "fmla v27.4s, v1.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "39:" // Oddments: Load input (0, 2): Bit 1: End
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla v16.4s, v2.4s, v10.4s\n"
+ "fmla v17.4s, v1.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v18.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "41:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v20.4s, v7.4s, v11.4s\n"
+ "fmla v21.4s, v6.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v24.4s, v4.4s, v11.4s\n"
+ "fmla v25.4s, v3.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (0, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "43:" // Oddments: Load input (0, 3): Bit 1: End
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla v17.4s, v2.4s, v12.4s\n"
+ "fmla v18.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v19.4s, v0.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "45:" // Oddments: Load input (2, 0): Bit 1: End
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla v16.4s, v6.4s, v10.4s\n"
+ "fmla v20.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "47:" // Oddments: Load input (3, 4): Bit 1: End
+ "ldr x20, [x16, #0xc8]\n"
+ "fmla v22.4s, v8.4s, v11.4s\n"
+ "fmla v23.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v26.4s, v5.4s, v11.4s\n"
+ "fmla v27.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v11.4s\n"
+ "fmla v31.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 49f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 49f\n"
+ "48:" // Oddments: Load input (2, 5): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "49:" // Oddments: Load input (2, 5): Bit 1: End
+ "ldr x20, [x16, #0xd0]\n"
+ "fmla v19.4s, v8.4s, v12.4s\n"
+ "fmla v23.4s, v5.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v27.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "51:" // Oddments: Load input (3, 0): Bit 1: End
+ "ldr x20, [x16, #0xd8]\n"
+ "fmla v20.4s, v6.4s, v10.4s\n"
+ "fmla v24.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 53f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 53f\n"
+ "52:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "53:" // Oddments: Load input (4, 2): Bit 1: End
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla v24.4s, v8.4s, v11.4s\n"
+ "fmla v25.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v26.4s, v6.4s, v11.4s\n"
+ "fmla v28.4s, v5.4s, v11.4s\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (3, 5): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "55:" // Oddments: Load input (3, 5): Bit 1: End
+ "ldr x20, [x16, #0xe8]\n"
+ "fmla v23.4s, v8.4s, v12.4s\n"
+ "fmla v27.4s, v5.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v31.4s, v2.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 57f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 57f\n"
+ "56:" // Oddments: Load input (5, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "57:" // Oddments: Load input (5, 2): Bit 1: End
+ "ldr x20, [x16, #0xf0]\n"
+ "fmla v28.4s, v8.4s, v10.4s\n"
+ "fmla v29.4s, v7.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v30.4s, v6.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "59:" // Oddments: Load input (4, 3): Bit 1: End
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla v25.4s, v8.4s, v11.4s\n"
+ "fmla v26.4s, v7.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v27.4s, v6.4s, v11.4s\n"
+ "fmla v29.4s, v5.4s, v11.4s\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 61f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 61f\n"
+ "60:" // Oddments: Load input (5, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "61:" // Oddments: Load input (5, 3): Bit 1: End
+ "ldr x20, [x16, #0x100]\n"
+ "fmla v29.4s, v8.4s, v12.4s\n"
+ "fmla v30.4s, v7.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v31.4s, v6.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (1, 1): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "63:" // Oddments: Load input (1, 1): Bit 1: End
+ "ldr x20, [x16, #0x108]\n"
+ "fmla v16.4s, v4.4s, v10.4s\n"
+ "fmla v17.4s, v3.4s, v10.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v20.4s, v1.4s, v10.4s\n"
+ "fmla v21.4s, v0.4s, v10.4s\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 65f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 65f\n"
+ "64:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "65:" // Oddments: Load input (1, 4): Bit 1: End
+ "ldr x20, [x16, #0x110]\n"
+ "fmla v18.4s, v5.4s, v11.4s\n"
+ "fmla v19.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v22.4s, v2.4s, v11.4s\n"
+ "fmla v23.4s, v1.4s, v11.4s\n"
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "67:" // Oddments: Load input (4, 1): Bit 1: End
+ "ldr x20, [x16, #0x118]\n"
+ "fmla v24.4s, v7.4s, v12.4s\n"
+ "fmla v25.4s, v6.4s, v12.4s\n"
+ "add x20, x20, x15\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 69f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 69f\n"
+ "68:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "69:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v26.4s, v8.4s, v10.4s\n"
+ "fmla v27.4s, v7.4s, v10.4s\n"
+ "fmax v16.4s, v16.4s, v13.4s\n"
+ "fmla v30.4s, v5.4s, v10.4s\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmax v17.4s, v17.4s, v13.4s\n"
+ "fmax v18.4s, v18.4s, v13.4s\n"
+ "fmax v19.4s, v19.4s, v13.4s\n"
+ "fmax v20.4s, v20.4s, v13.4s\n"
+ "fmax v21.4s, v21.4s, v13.4s\n"
+ "fmax v22.4s, v22.4s, v13.4s\n"
+ "fmax v23.4s, v23.4s, v13.4s\n"
+ "fmax v24.4s, v24.4s, v13.4s\n"
+ "fmax v25.4s, v25.4s, v13.4s\n"
+ "fmax v26.4s, v26.4s, v13.4s\n"
+ "fmax v27.4s, v27.4s, v13.4s\n"
+ "fmax v28.4s, v28.4s, v13.4s\n"
+ "fmax v29.4s, v29.4s, v13.4s\n"
+ "fmax v30.4s, v30.4s, v13.4s\n"
+ "fmax v31.4s, v31.4s, v13.4s\n"
+ "fmin v16.4s, v16.4s, v14.4s\n"
+ "fmin v17.4s, v17.4s, v14.4s\n"
+ "fmin v18.4s, v18.4s, v14.4s\n"
+ "fmin v19.4s, v19.4s, v14.4s\n"
+ "fmin v20.4s, v20.4s, v14.4s\n"
+ "fmin v21.4s, v21.4s, v14.4s\n"
+ "fmin v22.4s, v22.4s, v14.4s\n"
+ "fmin v23.4s, v23.4s, v14.4s\n"
+ "fmin v24.4s, v24.4s, v14.4s\n"
+ "fmin v25.4s, v25.4s, v14.4s\n"
+ "fmin v26.4s, v26.4s, v14.4s\n"
+ "fmin v27.4s, v27.4s, v14.4s\n"
+ "fmin v28.4s, v28.4s, v14.4s\n"
+ "fmin v29.4s, v29.4s, v14.4s\n"
+ "fmin v30.4s, v30.4s, v14.4s\n"
+ "fmin v31.4s, v31.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 70f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.d }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.d }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.d }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.d }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "add x14, x14, #0x8\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.s }[2], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.s }[2], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.s }[2], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Store: Bit 1: Unset
+ "ldr x23, [x8, #0x0]\n"
+ "ldr x22, [x8, #0x8]\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "ldr x21, [x8, #0x10]\n"
+ "ldr x20, [x8, #0x18]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "st1 { v16.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x20]\n"
+ "add x23, x23, x14\n"
+ "st1 { v17.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x28]\n"
+ "add x22, x22, x14\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x30]\n"
+ "add x21, x21, x14\n"
+ "st1 { v19.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x38]\n"
+ "add x20, x20, x14\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x40]\n"
+ "add x23, x23, x14\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x48]\n"
+ "add x22, x22, x14\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x50]\n"
+ "add x21, x21, x14\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "add x20, x20, x14\n"
+ "st1 { v24.s }[0], [x23]\n"
+ "ldr x23, [x8, #0x60]\n"
+ "add x23, x23, x14\n"
+ "st1 { v25.s }[0], [x22]\n"
+ "ldr x22, [x8, #0x68]\n"
+ "add x22, x22, x14\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x21, [x8, #0x70]\n"
+ "add x21, x21, x14\n"
+ "st1 { v27.s }[0], [x20]\n"
+ "ldr x20, [x8, #0x78]\n"
+ "add x20, x20, x14\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "71:" // Oddments: Store: Bit 1: End
+ "72:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f727efea80
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>  // Strategy for the FP32 NHWC 3x3 stride-2 depthwise kernel that emits a 2x2 output tile per call.
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;  // Variant driven by arrays of input/output row pointers.
+  Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;  // Variant that walks a tiled tensor via row/column strides.
+
+  public:
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::None;  // Fixed-length NEON; presumably no SVE dependence -- confirm against VLType semantics.
+
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter.
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;  // Stride 2 in both spatial dimensions.
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;  // Each invocation computes a 2x2 block of output points.
+  constexpr static unsigned int output_cols = 2;
+
+  a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}  // CPUInfo is accepted for interface uniformity but unused here.
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..5ab61fad4c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,615 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x23, #0x0\n"
+ "mov x27, #0x0\n"
+ "1:" // Tile loop
+ "str x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "mov x25, #0x2\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x23, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x27, x6, x22\n" // offset += tile_j * ld_input_col
+ "ldr x7, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x6, x6, #0x2\n"
+ "mul x20, x23, x21\n" // offset = tile_i * ld_output_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x22, x22, x26\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x16, x8, x24, LSL #2\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x27, x7, x20\n" // offset += tile_j * ld_output_col
+ "lsr x22, %x[n_channels], #0x2\n"
+ "add x14, x16, x24, LSL #2\n"
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "add x13, x6, x6\n"
+ "add x12, x14, x24, LSL #2\n"
+ "add x11, x13, x6\n"
+ "add x17, x17, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x10, x12, x24, LSL #2\n"
+ "add x9, x11, x6\n"
+ "add x28, x17, x21, LSL #2\n"
+ "lsl x7, x7, #0x2\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q31, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldr q9, [x14, x13]\n"
+ "ld1 { v10.4s }, [x8]\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q12, [x8, x11]\n"
+ "ldr q13, [x8, x9]\n"
+ "ld1 { v14.4s }, [x16]\n"
+ "ldr q15, [x16, x6]\n"
+ "ldr q16, [x8, x13]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
+ "add x23, x23, #0x10\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "ld1 { v10.4s }, [x8]\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q21, [x16, x9]\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ld1 { v20.4s }, [x12]\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.4s, v4.4s, v15.4s\n"
+ "ld1 { v25.4s }, [x14]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x12, x6]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "ldr q18, [x14, x6]\n"
+ "fmla v28.4s, v5.4s, v21.4s\n"
+ "ldr q24, [x14, x11]\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v2.4s, v9.4s\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v0.4s, v9.4s\n"
+ "ldr q31, [x15, #0x0]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v29.4s, v5.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x11]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v23.4s, v3.4s, v20.4s\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v22.4s, v4.4s, v17.4s\n"
+ "ldr q21, [x10, x6]\n"
+ "fmla v23.4s, v0.4s, v25.4s\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v22.4s, v1.4s, v24.4s\n"
+ "add x21, x21, #0x10\n"
+ "fmla v23.4s, v4.4s, v19.4s\n"
+ "ldr q20, [x14, x9]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v22.4s, v5.4s, v16.4s\n"
+ "ldr q19, [x10, x11]\n"
+ "fmla v29.4s, v6.4s, v25.4s\n"
+ "ld1 { v17.4s }, [x10]\n"
+ "fmla v23.4s, v1.4s, v18.4s\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v22.4s, v2.4s, v20.4s\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v29.4s, v7.4s, v18.4s\n"
+ "ldr q16, [x12, x13]\n"
+ "fmla v23.4s, v6.4s, v17.4s\n"
+ "ldr q18, [x10, x13]\n"
+ "fmla v22.4s, v3.4s, v16.4s\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "ldr q13, [x8, x9]\n"
+ "fmla v22.4s, v7.4s, v19.4s\n"
+ "ld1 { v14.4s }, [x16]\n"
+ "fmla v28.4s, v7.4s, v24.4s\n"
+ "ldr q12, [x8, x11]\n"
+ "fmla v23.4s, v5.4s, v16.4s\n"
+ "ldr q16, [x8, x13]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v22.4s, v6.4s, v18.4s\n"
+ "fmla v28.4s, v8.4s, v20.4s\n"
+ "ldr q17, [x10, x9]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmla v23.4s, v8.4s, v18.4s\n"
+ "fmla v22.4s, v8.4s, v17.4s\n"
+ "ldr q11, [x8, x6]\n"
+ "ldr q15, [x16, x6]\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "add x14, x14, #0x10\n"
+ "ldr q9, [x14, x13]\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v23.4s, v23.4s, v27.4s\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "add x12, x12, #0x10\n"
+ "add x10, x10, #0x10\n"
+ "st1 { v29.4s }, [x17]\n"
+ "add x15, x15, #0xa0\n"
+ "str q28, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "st1 { v23.4s }, [x28]\n"
+ "str q22, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v29.16b, v31.16b\n fmla v29.4s, v8.4s, v9.4s\n"
+ "mov v28.16b, v31.16b\n fmla v28.4s, v6.4s, v9.4s\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.4s, v0.4s, v10.4s\n"
+ "fmla v28.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x16, x9]\n"
+ "fmla v29.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x16, x11]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "ldr q17, [x16, x13]\n"
+ "fmla v29.4s, v3.4s, v14.4s\n"
+ "ld1 { v19.4s }, [x12]\n"
+ "fmla v28.4s, v0.4s, v16.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.4s, v4.4s, v15.4s\n"
+ "ld1 { v25.4s }, [x14]\n"
+ "fmla v28.4s, v4.4s, v18.4s\n"
+ "ldr q18, [x12, x6]\n"
+ "fmla v29.4s, v2.4s, v16.4s\n"
+ "ldr q24, [x14, x6]\n"
+ "fmla v28.4s, v5.4s, v20.4s\n"
+ "ldr q23, [x14, x11]\n"
+ "mov v22.16b, v31.16b\n fmla v22.4s, v2.4s, v9.4s\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v5.4s, v17.4s\n"
+ "fmla v28.4s, v3.4s, v17.4s\n"
+ "ldr q17, [x12, x11]\n"
+ "fmla v22.4s, v3.4s, v19.4s\n"
+ "ldr q16, [x12, x9]\n"
+ "fmla v21.4s, v4.4s, v17.4s\n"
+ "ldr q20, [x10, x6]\n"
+ "fmla v22.4s, v0.4s, v25.4s\n"
+ "fmla v21.4s, v1.4s, v23.4s\n"
+ "fmla v22.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x14, x9]\n"
+ "fmla v21.4s, v5.4s, v16.4s\n"
+ "ldr q18, [x10, x11]\n"
+ "fmla v29.4s, v6.4s, v25.4s\n"
+ "ld1 { v17.4s }, [x10]\n"
+ "fmla v22.4s, v1.4s, v24.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmla v21.4s, v2.4s, v19.4s\n"
+ "fmla v29.4s, v7.4s, v24.4s\n"
+ "ldr q16, [x12, x13]\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmla v22.4s, v6.4s, v17.4s\n"
+ "ldr q17, [x10, x13]\n"
+ "fmla v21.4s, v3.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmla v22.4s, v7.4s, v20.4s\n"
+ "fmla v21.4s, v7.4s, v18.4s\n"
+ "st1 { v29.4s }, [x17]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.4s, v7.4s, v23.4s\n"
+ "fmla v22.4s, v5.4s, v16.4s\n"
+ "fmla v21.4s, v6.4s, v17.4s\n"
+ "fmla v28.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x10, x9]\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmla v22.4s, v8.4s, v17.4s\n"
+ "fmla v21.4s, v8.4s, v16.4s\n"
+ "fmax v22.4s, v22.4s, v26.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "str q28, [x17, x7]\n"
+ "add x17, x17, #0x10\n"
+ "fmin v22.4s, v22.4s, v27.4s\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "st1 { v22.4s }, [x28]\n"
+ "str q21, [x28, x7]\n"
+ "add x28, x28, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 43f\n"
+ "ldr q31, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "add x27, x14, x13\n"
+ "add x26, x8, XZR\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x25, x8, x6\n"
+ "add x24, x8, x11\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x23, x8, x9\n"
+ "add x22, x16, XZR\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "add x21, x16, x6\n"
+ "add x20, x8, x13\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d9, [x27], #0x8\n"
+ "ldr d10, [x26], #0x8\n"
+ "ldr d11, [x25], #0x8\n"
+ "ldr d12, [x24], #0x8\n"
+ "ldr d13, [x23], #0x8\n"
+ "ldr d14, [x22], #0x8\n"
+ "ldr d15, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.s }[2], [x27]\n"
+ "ld1 { v10.s }[2], [x26]\n"
+ "ld1 { v11.s }[2], [x25]\n"
+ "ld1 { v12.s }[2], [x24]\n"
+ "ld1 { v13.s }[2], [x23]\n"
+ "ld1 { v14.s }[2], [x22]\n"
+ "ld1 { v15.s }[2], [x21]\n"
+ "ld1 { v16.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+ "ldr s9, [x27, #0x0]\n"
+ "ldr s10, [x26, #0x0]\n"
+ "ldr s11, [x25, #0x0]\n"
+ "ldr s12, [x24, #0x0]\n"
+ "ldr s13, [x23, #0x0]\n"
+ "ldr s14, [x22, #0x0]\n"
+ "ldr s15, [x21, #0x0]\n"
+ "ldr s16, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "add x20, x16, x11\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "add x20, x16, x9\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "add x20, x16, x13\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "add x20, x12, XZR\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s14, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v30.4s, v3.4s, v14.4s\n"
+ "add x20, x14, XZR\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr s15, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "fmla v30.4s, v0.4s, v15.4s\n"
+ "add x20, x12, x6\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "add x20, x14, x6\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v16.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s16, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "add x20, x12, x11\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "add x20, x14, x11\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "add x20, x12, x9\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s14, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v31.4s, v5.4s, v14.4s\n"
+ "add x20, x10, XZR\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr s15, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v30.4s, v6.4s, v15.4s\n"
+ "add x20, x14, x9\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "add x20, x10, x6\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "add x20, x12, x13\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v16.s }[2], [x20]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s16, [x20, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v31.4s, v3.4s, v16.4s\n"
+ "add x20, x10, x11\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s14, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v31.4s, v7.4s, v14.4s\n"
+ "add x20, x10, x13\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d15, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s15, [x20, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v30.4s, v8.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "add x20, x10, x9\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v30.4s, v30.4s, v26.4s\n"
+ "fmax v31.4s, v31.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v27.4s\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.d }[0], [x21], x7\n"
+ "st1 { v30.d }[0], [x20], x7\n"
+ "add x17, x17, #0x8\n"
+ "add x28, x28, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[2], [x21], x7\n"
+ "st1 { v30.s }[2], [x20], x7\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x21, x17\n"
+ "mov x20, x28\n"
+ "st1 { v28.s }[0], [x21], x7\n"
+ "st1 { v30.s }[0], [x20], x7\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "42:" // Tile loop: Oddments: Store: Bit 1: End
+ "43:" // Tile loop: End
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x27, x27, #0x1\n"
+ "add x21, x23, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x23, x23, x21, LT\n"
+ "csel x27, x27, XZR, LT\n"
+ "cmp x23, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..24fe255dfb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,629 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x25, #0x10\n" // cntb _, ALL, #1
+ "lsr x24, %x[n_channels], #0x2\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x13, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "mov x28, #0x0\n"
+ "sub x22, XZR, x25\n"
+ "cbz x24, 3f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "cmp x25, x24, LSL #4\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "add x23, x23, #0xa0\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q10, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr q11, [x21, x28]\n"
+ "ldr q12, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x28]\n"
+ "ldr q14, [x20, x28]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "ldr q15, [x21, x28]\n"
+ "ldr q16, [x20, x28]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v24.16b, v31.16b\n fmla v24.4s, v8.4s, v9.4s\n"
+ "mov v23.16b, v31.16b\n fmla v23.4s, v6.4s, v9.4s\n"
+ "ldr x21, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v24.4s, v0.4s, v10.4s\n"
+ "fmla v23.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v24.4s, v1.4s, v11.4s\n"
+ "ldr q19, [x21, x28]\n"
+ "fmla v23.4s, v2.4s, v13.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v24.4s, v3.4s, v14.4s\n"
+ "fmla v23.4s, v0.4s, v16.4s\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.4s, v4.4s, v15.4s\n"
+ "fmla v23.4s, v4.4s, v19.4s\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q22, [x20, x28]\n"
+ "fmla v24.4s, v2.4s, v16.4s\n"
+ "fmla v23.4s, v5.4s, v20.4s\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q21, [x20, x28]\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v2.4s, v9.4s\n"
+ "mov v19.16b, v31.16b\n fmla v19.4s, v0.4s, v9.4s\n"
+ "ldr q31, [x23, #0x0]\n"
+ "fmla v24.4s, v5.4s, v18.4s\n"
+ "fmla v23.4s, v3.4s, v18.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v20.4s, v3.4s, v17.4s\n"
+ "fmla v19.4s, v4.4s, v16.4s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v20.4s, v0.4s, v22.4s\n"
+ "ldr q0, [x23, #0x10]\n"
+ "fmla v19.4s, v1.4s, v21.4s\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v20.4s, v4.4s, v18.4s\n"
+ "fmla v19.4s, v5.4s, v16.4s\n"
+ "ldr q4, [x23, #0x50]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v24.4s, v6.4s, v22.4s\n"
+ "fmla v20.4s, v1.4s, v17.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q1, [x23, #0x20]\n"
+ "fmla v19.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v7.4s, v17.4s\n"
+ "ldr q2, [x23, #0x30]\n"
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v23.4s, v7.4s, v21.4s\n"
+ "fmla v23.4s, v8.4s, v16.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.4s, v6.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v19.4s, v3.4s, v17.4s\n"
+ "fmax v23.4s, v23.4s, v26.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q3, [x23, #0x40]\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "fmla v20.4s, v5.4s, v17.4s\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "add x22, x22, #0x10\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v19.4s, v7.4s, v16.4s\n"
+ "fmin v23.4s, v23.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "fmla v19.4s, v6.4s, v16.4s\n"
+ "fmla v20.4s, v8.4s, v16.4s\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v27.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v19.4s, v8.4s, v16.4s\n"
+ "ldr q8, [x23, #0x90]\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "ldp x21, x20, [x13, #0x0]\n"
+ "ldr q9, [x21, x25]\n"
+ "fmin v19.4s, v19.4s, v27.4s\n"
+ "add x28, x28, #0x10\n"
+ "ldr q10, [x20, x25]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "str q24, [x12, x22]\n"
+ "add x23, x23, #0xa0\n"
+ "ldr q11, [x21, x25]\n"
+ "ldr q12, [x20, x25]\n"
+ "str q23, [x11, x22]\n"
+ "ldp x21, x20, [x13, #0x20]\n"
+ "ldr q13, [x21, x25]\n"
+ "str q20, [x10, x22]\n"
+ "ldr q14, [x20, x25]\n"
+ "ldp x21, x20, [x13, #0x30]\n"
+ "str q19, [x9, x22]\n"
+ "ldr q15, [x21, x25]\n"
+ "ldr q16, [x20, x25]\n"
+ "add x25, x25, #0x10\n"
+ "cmp x25, x24, LSL #4\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v25.16b, v31.16b\n fmla v25.4s, v8.4s, v9.4s\n"
+ "mov v24.16b, v31.16b\n fmla v24.4s, v6.4s, v9.4s\n"
+ "ldr x21, [x13, #0x40]\n"
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v25.4s, v0.4s, v10.4s\n"
+ "fmla v24.4s, v1.4s, v12.4s\n"
+ "ldr q20, [x20, x28]\n"
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v25.4s, v1.4s, v11.4s\n"
+ "ldr q18, [x21, x28]\n"
+ "fmla v24.4s, v2.4s, v13.4s\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v25.4s, v3.4s, v14.4s\n"
+ "fmla v24.4s, v0.4s, v16.4s\n"
+ "ldr x20, [x13, #0x58]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v25.4s, v4.4s, v15.4s\n"
+ "fmla v24.4s, v4.4s, v18.4s\n"
+ "ldr x21, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "ldr q23, [x20, x28]\n"
+ "fmla v25.4s, v2.4s, v16.4s\n"
+ "fmla v24.4s, v5.4s, v20.4s\n"
+ "ldr x20, [x13, #0x80]\n"
+ "ldr q22, [x20, x28]\n"
+ "mov v21.16b, v31.16b\n fmla v21.4s, v2.4s, v9.4s\n"
+ "mov v20.16b, v31.16b\n fmla v20.4s, v0.4s, v9.4s\n"
+ "ldr x20, [x13, #0x68]\n"
+ "ldr q18, [x20, x28]\n"
+ "fmla v25.4s, v5.4s, v19.4s\n"
+ "fmla v24.4s, v3.4s, v19.4s\n"
+ "ldr q16, [x21, x28]\n"
+ "fmla v21.4s, v3.4s, v17.4s\n"
+ "fmla v20.4s, v4.4s, v16.4s\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v0.4s, v23.4s\n"
+ "fmla v20.4s, v1.4s, v22.4s\n"
+ "ldr x20, [x13, #0x70]\n"
+ "ldr q17, [x20, x28]\n"
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v21.4s, v4.4s, v18.4s\n"
+ "ldr q19, [x20, x28]\n"
+ "fmla v20.4s, v5.4s, v16.4s\n"
+ "fmla v25.4s, v6.4s, v23.4s\n"
+ "ldr x20, [x13, #0x90]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v1.4s, v17.4s\n"
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v20.4s, v2.4s, v19.4s\n"
+ "fmla v25.4s, v7.4s, v17.4s\n"
+ "ldr q18, [x20, x28]\n"
+ "ldr x20, [x13, #0xa0]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v21.4s, v6.4s, v16.4s\n"
+ "fmla v20.4s, v3.4s, v18.4s\n"
+ "ldr x20, [x13, #0xb0]\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v7.4s, v17.4s\n"
+ "fmla v20.4s, v7.4s, v16.4s\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "ldr q17, [x20, x28]\n"
+ "fmla v24.4s, v7.4s, v22.4s\n"
+ "fmla v21.4s, v5.4s, v18.4s\n"
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v20.4s, v6.4s, v17.4s\n"
+ "fmla v24.4s, v8.4s, v19.4s\n"
+ "ldr q16, [x20, x28]\n"
+ "fmla v21.4s, v8.4s, v17.4s\n"
+ "fmla v20.4s, v8.4s, v16.4s\n"
+ "fmax v25.4s, v25.4s, v26.4s\n"
+ "add x22, x22, #0x10\n"
+ "fmax v24.4s, v24.4s, v26.4s\n"
+ "fmax v21.4s, v21.4s, v26.4s\n"
+ "add x28, x28, #0x10\n"
+ "fmax v20.4s, v20.4s, v26.4s\n"
+ "fmin v25.4s, v25.4s, v27.4s\n"
+ "str q25, [x12, x22]\n"
+ "fmin v24.4s, v24.4s, v27.4s\n"
+ "fmin v21.4s, v21.4s, v27.4s\n"
+ "str q24, [x11, x22]\n"
+ "fmin v20.4s, v20.4s, v27.4s\n"
+ "str q21, [x10, x22]\n"
+ "str q20, [x9, x22]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 42f\n"
+ "ldr q31, [x23, #0x0]\n"
+ "ldr q0, [x23, #0x10]\n"
+ "mov x20, x28\n"
+ "add x12, x12, x20\n"
+ "ldr q1, [x23, #0x20]\n"
+ "ldr q2, [x23, #0x30]\n"
+ "add x11, x11, x20\n"
+ "add x10, x10, x20\n"
+ "ldr q3, [x23, #0x40]\n"
+ "ldr q4, [x23, #0x50]\n"
+ "add x9, x9, x20\n"
+ "ldr q5, [x23, #0x60]\n"
+ "ldr q6, [x23, #0x70]\n"
+ "ldr q7, [x23, #0x80]\n"
+ "ldr q8, [x23, #0x90]\n"
+ "ldr x27, [x13, #0x0]\n"
+ "ldr x26, [x13, #0x8]\n"
+ "add x27, x27, x28\n"
+ "add x26, x26, x28\n"
+ "ldr x25, [x13, #0x10]\n"
+ "ldr x24, [x13, #0x18]\n"
+ "add x25, x25, x28\n"
+ "add x24, x24, x28\n"
+ "ldr x23, [x13, #0x20]\n"
+ "ldr x22, [x13, #0x28]\n"
+ "add x23, x23, x28\n"
+ "add x22, x22, x28\n"
+ "ldr x21, [x13, #0x30]\n"
+ "ldr x20, [x13, #0x38]\n"
+ "add x21, x21, x28\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.d }[0], [x27], #0x8\n"
+ "ld1 { v10.d }[0], [x26], #0x8\n"
+ "ld1 { v11.d }[0], [x25], #0x8\n"
+ "ld1 { v12.d }[0], [x24], #0x8\n"
+ "ld1 { v13.d }[0], [x23], #0x8\n"
+ "ld1 { v14.d }[0], [x22], #0x8\n"
+ "ld1 { v15.d }[0], [x21], #0x8\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.s }[2], [x27], #0x4\n"
+ "ld1 { v10.s }[2], [x26], #0x4\n"
+ "ld1 { v11.s }[2], [x25], #0x4\n"
+ "ld1 { v12.s }[2], [x24], #0x4\n"
+ "ld1 { v13.s }[2], [x23], #0x4\n"
+ "ld1 { v14.s }[2], [x22], #0x4\n"
+ "ld1 { v15.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x27], #0x4\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "ld1 { v14.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+ "mov v28.16b, v31.16b\n fmla v28.4s, v8.4s, v9.4s\n"
+ "fmla v28.4s, v0.4s, v10.4s\n"
+ "ldr x20, [x13, #0x40]\n"
+ "add x20, x20, x28\n"
+ "mov v29.16b, v31.16b\n fmla v29.4s, v6.4s, v9.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v3.4s, v14.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v15.4s\n"
+ "mov v30.16b, v31.16b\n fmla v30.4s, v2.4s, v9.4s\n"
+ "fmla v31.4s, v0.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v16.4s\n"
+ "fmla v29.4s, v0.4s, v16.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x13, #0x48]\n"
+ "fmla v29.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (1, 4): Bit 1: End
+ "ldr x20, [x13, #0x50]\n"
+ "fmla v29.4s, v5.4s, v12.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (1, 2): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (1, 2): Bit 1: End
+ "ldr x20, [x13, #0x58]\n"
+ "fmla v28.4s, v5.4s, v13.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (3, 0): Bit 1: End
+ "ldr x20, [x13, #0x60]\n"
+ "fmla v30.4s, v3.4s, v14.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (2, 0): Bit 1: End
+ "ldr x20, [x13, #0x68]\n"
+ "fmla v28.4s, v6.4s, v15.4s\n"
+ "fmla v30.4s, v0.4s, v15.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr x20, [x13, #0x70]\n"
+ "fmla v30.4s, v4.4s, v11.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (2, 1): Bit 1: End
+ "ldr x20, [x13, #0x78]\n"
+ "fmla v28.4s, v7.4s, v16.4s\n"
+ "fmla v30.4s, v1.4s, v16.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr x20, [x13, #0x80]\n"
+ "fmla v31.4s, v4.4s, v13.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (2, 3): Bit 1: End
+ "ldr x20, [x13, #0x88]\n"
+ "fmla v29.4s, v7.4s, v12.4s\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (3, 4): Bit 1: End
+ "ldr x20, [x13, #0x90]\n"
+ "fmla v31.4s, v5.4s, v14.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (4, 0): Bit 1: End
+ "ldr x20, [x13, #0x98]\n"
+ "fmla v30.4s, v6.4s, v15.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "29:" // Oddments: Load input (2, 4): Bit 1: End
+ "ldr x20, [x13, #0xa0]\n"
+ "fmla v29.4s, v8.4s, v11.4s\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (4, 1): Bit 1: End
+ "ldr x20, [x13, #0xa8]\n"
+ "fmla v30.4s, v7.4s, v13.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v16.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "33:" // Oddments: Load input (3, 2): Bit 1: End
+ "ldr x20, [x13, #0xb0]\n"
+ "fmla v30.4s, v5.4s, v16.4s\n"
+ "fmla v31.4s, v3.4s, v16.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (4, 3): Bit 1: End
+ "ldr x20, [x13, #0xb8]\n"
+ "fmla v31.4s, v7.4s, v14.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v15.s }[2], [x20], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "37:" // Oddments: Load input (4, 2): Bit 1: End
+ "ldr x20, [x13, #0xc0]\n"
+ "fmla v30.4s, v8.4s, v15.4s\n"
+ "fmla v31.4s, v6.4s, v15.4s\n"
+ "add x20, x20, x28\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "39:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v31.4s, v8.4s, v11.4s\n"
+ "fmax v28.4s, v28.4s, v26.4s\n"
+ "fmax v29.4s, v29.4s, v26.4s\n"
+ "fmax v30.4s, v30.4s, v26.4s\n"
+ "fmax v31.4s, v31.4s, v26.4s\n"
+ "fmin v28.4s, v28.4s, v27.4s\n"
+ "fmin v29.4s, v29.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v27.4s\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "st1 { v28.d }[0], [x12], #0x8\n"
+ "st1 { v29.d }[0], [x11], #0x8\n"
+ "st1 { v30.d }[0], [x10], #0x8\n"
+ "st1 { v31.d }[0], [x9], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "st1 { v28.s }[2], [x12], #0x4\n"
+ "st1 { v29.s }[2], [x11], #0x4\n"
+ "st1 { v30.s }[2], [x10], #0x4\n"
+ "st1 { v31.s }[2], [x9], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Store: Bit 1: Unset
+ "st1 { v28.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x11], #0x4\n"
+ "st1 { v30.s }[0], [x10], #0x4\n"
+ "st1 { v31.s }[0], [x9], #0x4\n"
+ "41:" // Oddments: Store: Bit 1: End
+ "42:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..de8a1e4514
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..3426fbc3f9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,991 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (dense-tile) driver for the fp32 NHWC 5x5/stride-1 depthwise kernel.
+// Iterates an n_tile_rows x n_tile_cols grid of 2x2 output tiles; for each
+// tile the inline assembly accumulates the 25 filter taps into four output
+// accumulators per 4-channel group, then clamps the results to
+// [activation_min, activation_max].  `params` points at the packed
+// bias + weights stream that the interleaving code produced.
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ // Aggregates every operand the assembly needs so it can all be addressed
+ // off a single base register via compile-time offsetof() constants; the
+ // asm also spills the current tile indices (tile_i/tile_j) back here
+ // between iterations of the outer tile loop.
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ // Layout of the assembly below: label 1 is the outer tile loop (computes
+ // per-tile input/output base pointers), label 2 the unrolled 4-channel
+ // main loop, label 3 the final 4-channel group, label 4 onwards the
+ // oddment (n_channels % 4) handling, and label 61 the tile-index update.
+ __asm__ __volatile__(
+ "mov x27, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x23, #0x2\n"
+ "mov x25, #0x2\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x27, x24\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x26, x2, x22\n" // offset += tile_j * ld_input_col
+ "ldr x3, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "lsl x2, x2, #0x2\n"
+ "mul x20, x27, x21\n" // offset = tile_i * ld_output_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x6, x2, x2\n"
+ "mul x22, x22, x23\n" // offset *= kernel_stride * output_size
+ "add x4, x4, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x7, x4, x24, LSL #2\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x26, x3, x20\n" // offset += tile_j * ld_output_col
+ "add x17, x7, x24, LSL #2\n"
+ "mov x23, #0x10\n" // cntb _, ALL, #1
+ "mul x20, x20, x25\n" // offset *= output_tile_size
+ "lsr x22, %x[n_channels], #0x2\n"
+ "add x16, x17, x24, LSL #2\n"
+ "add x15, x6, x2\n"
+ "add x14, x16, x24, LSL #2\n"
+ "add x13, x15, x2\n"
+ "add x5, x5, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x12, x14, x24, LSL #2\n"
+ "add x11, x13, x2\n"
+ "add x10, x5, x21, LSL #2\n"
+ "lsl x3, x3, #0x2\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x23\n"
+ "cbz x22, 4f\n"
+ "ldr q25, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x8, x8, #0x60\n"
+ "ld1 { v5.4s }, [x4]\n"
+ "ldr q6, [x4, x2]\n"
+ "ld1 { v7.4s }, [x7]\n"
+ "ldr q8, [x7, x2]\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q13, [x7, x6]\n"
+ "ldr q11, [x4, x15]\n"
+ "ldr q12, [x4, x13]\n"
+ "ldr q10, [x7, x11]\n"
+ "ld1 { v14.4s }, [x17]\n"
+ "bge 3f\n"
+ // Main channel loop: v30/v31/v29/v28 accumulate the four output
+ // elements; weight and input loads are interleaved with the FMLAs.
+ "2:" // Tile loop: Channel loop
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v5.4s\n"
+ "ldr q23, [x7, x15]\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v6.4s\n"
+ "add x23, x23, #0x10\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v7.4s\n"
+ "mov v28.16b, v25.16b\n fmla v28.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x8, #0x0]\n"
+ "ldr q25, [x8, #0x140]\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "add x7, x7, #0x10\n"
+ "fmla v29.4s, v1.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "ldr q1, [x8, #0x10]\n"
+ "cmp x23, x22, LSL #4\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q18, [x4, x11]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "add x4, x4, #0x10\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v23.4s\n"
+ "ldr q17, [x8, #0x20]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "add x21, x21, #0x10\n"
+ "fmla v29.4s, v3.4s, v23.4s\n"
+ "fmla v28.4s, v3.4s, v21.4s\n"
+ "ldr q16, [x8, #0x30]\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v31.4s, v4.4s, v18.4s\n"
+ "ldr q0, [x17, x15]\n"
+ "fmla v29.4s, v4.4s, v21.4s\n"
+ "fmla v28.4s, v4.4s, v10.4s\n"
+ "ldr q20, [x8, #0x40]\n"
+ "fmla v30.4s, v19.4s, v7.4s\n"
+ "ld1 { v7.4s }, [x7]\n"
+ "fmla v31.4s, v19.4s, v8.4s\n"
+ "fmla v29.4s, v19.4s, v14.4s\n"
+ "fmla v28.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "ldr q26, [x17, x11]\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v2.4s\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v30.4s, v17.4s, v13.4s\n"
+ "ldr q1, [x17, x13]\n"
+ "fmla v31.4s, v17.4s, v23.4s\n"
+ "add x17, x17, #0x10\n"
+ "fmla v29.4s, v17.4s, v2.4s\n"
+ "fmla v28.4s, v17.4s, v0.4s\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v30.4s, v16.4s, v23.4s\n"
+ "ld1 { v24.4s }, [x16]\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "fmla v29.4s, v16.4s, v0.4s\n"
+ "fmla v28.4s, v16.4s, v1.4s\n"
+ "ldr q16, [x8, #0x80]\n"
+ "fmla v30.4s, v20.4s, v21.4s\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v31.4s, v20.4s, v10.4s\n"
+ "ldr q22, [x16, x6]\n"
+ "fmla v29.4s, v20.4s, v1.4s\n"
+ "fmla v28.4s, v20.4s, v26.4s\n"
+ "ldr q21, [x8, #0x90]\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "ldr q5, [x16, x11]\n"
+ "fmla v31.4s, v19.4s, v6.4s\n"
+ "fmla v29.4s, v19.4s, v24.4s\n"
+ "fmla v28.4s, v19.4s, v23.4s\n"
+ "ldr q11, [x8, #0xa0]\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v31.4s, v18.4s, v2.4s\n"
+ "fmla v29.4s, v18.4s, v23.4s\n"
+ "fmla v28.4s, v18.4s, v22.4s\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v29.4s, v17.4s, v22.4s\n"
+ "fmla v28.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v30.4s, v16.4s, v0.4s\n"
+ "ld1 { v0.4s }, [x14]\n"
+ "fmla v31.4s, v16.4s, v1.4s\n"
+ "fmla v29.4s, v16.4s, v20.4s\n"
+ "fmla v28.4s, v16.4s, v19.4s\n"
+ "ldr q16, [x8, #0xd0]\n"
+ "fmla v30.4s, v21.4s, v1.4s\n"
+ "ldr q4, [x14, x2]\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "ldr q12, [x14, x13]\n"
+ "fmla v29.4s, v21.4s, v19.4s\n"
+ "fmla v28.4s, v21.4s, v5.4s\n"
+ "ldr q13, [x8, #0xe0]\n"
+ "fmla v30.4s, v11.4s, v24.4s\n"
+ "ldr q6, [x14, x6]\n"
+ "fmla v31.4s, v11.4s, v23.4s\n"
+ "fmla v29.4s, v11.4s, v0.4s\n"
+ "fmla v28.4s, v11.4s, v4.4s\n"
+ "ldr q24, [x8, #0xf0]\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "ldr q26, [x14, x15]\n"
+ "fmla v31.4s, v18.4s, v22.4s\n"
+ "fmla v29.4s, v18.4s, v4.4s\n"
+ "fmla v28.4s, v18.4s, v6.4s\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v30.4s, v17.4s, v22.4s\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v31.4s, v17.4s, v20.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmla v29.4s, v17.4s, v6.4s\n"
+ "fmla v28.4s, v17.4s, v26.4s\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v30.4s, v16.4s, v20.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v31.4s, v16.4s, v19.4s\n"
+ "fmla v29.4s, v16.4s, v26.4s\n"
+ "fmla v28.4s, v16.4s, v12.4s\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v30.4s, v13.4s, v19.4s\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v31.4s, v13.4s, v5.4s\n"
+ "ld1 { v14.4s }, [x17]\n"
+ "fmla v29.4s, v13.4s, v12.4s\n"
+ "fmla v28.4s, v13.4s, v22.4s\n"
+ "ldr q19, [x8, #0x130]\n"
+ "fmla v30.4s, v24.4s, v0.4s\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v31.4s, v24.4s, v4.4s\n"
+ "fmla v29.4s, v24.4s, v18.4s\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v28.4s, v24.4s, v17.4s\n"
+ "ldr q0, [x8, #0x150]\n"
+ "fmla v30.4s, v23.4s, v4.4s\n"
+ "ldr q13, [x7, x6]\n"
+ "fmla v31.4s, v23.4s, v6.4s\n"
+ "fmla v29.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v28.4s, v23.4s, v16.4s\n"
+ "ldr q1, [x8, #0x160]\n"
+ "fmla v30.4s, v21.4s, v6.4s\n"
+ "ld1 { v5.4s }, [x4]\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v29.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v28.4s, v21.4s, v18.4s\n"
+ "ldr q2, [x8, #0x170]\n"
+ "fmla v30.4s, v20.4s, v26.4s\n"
+ "ldr q6, [x4, x2]\n"
+ "fmla v31.4s, v20.4s, v12.4s\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v20.4s, v18.4s\n"
+ "ldr q11, [x4, x15]\n"
+ "fmla v28.4s, v20.4s, v17.4s\n"
+ "ldr q3, [x8, #0x180]\n"
+ "fmla v30.4s, v19.4s, v12.4s\n"
+ "ldr q8, [x7, x2]\n"
+ "fmla v31.4s, v19.4s, v22.4s\n"
+ "ldr q10, [x7, x11]\n"
+ "fmla v29.4s, v19.4s, v17.4s\n"
+ "ldr q12, [x4, x13]\n"
+ "fmla v28.4s, v19.4s, v16.4s\n"
+ "ldr q9, [x4, x6]\n"
+ "ldr q4, [x8, #0x190]\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "add x8, x8, #0x1a0\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "st1 { v30.4s }, [x5]\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "str q31, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v29.4s }, [x10]\n"
+ "str q28, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
+ "blt 2b\n"
+ // Channel tail: last full 4-channel group, with no preloading for a
+ // following iteration.
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "ldr q22, [x7, x15]\n"
+ "mov v5.16b, v25.16b\n fmla v5.4s, v0.4s, v6.4s\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x7, x13]\n"
+ "fmla v5.4s, v1.4s, v9.4s\n"
+ "add x7, x7, #0x10\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "ldr q18, [x8, #0x10]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q16, [x4, x11]\n"
+ "fmla v5.4s, v2.4s, v11.4s\n"
+ "add x4, x4, #0x10\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v22.4s\n"
+ "ldr q17, [x8, #0x20]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x17, x2]\n"
+ "fmla v5.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v21.4s\n"
+ "ldr q20, [x8, #0x30]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x17, x6]\n"
+ "fmla v5.4s, v4.4s, v16.4s\n"
+ "ldr q28, [x17, x15]\n"
+ "fmla v30.4s, v4.4s, v21.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "ldr q16, [x8, #0x40]\n"
+ "fmla v31.4s, v19.4s, v7.4s\n"
+ "fmla v5.4s, v19.4s, v8.4s\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "fmla v29.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x8, #0x50]\n"
+ "fmla v31.4s, v18.4s, v8.4s\n"
+ "ldr q1, [x17, x11]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "fmla v29.4s, v18.4s, v2.4s\n"
+ "ldr q18, [x8, #0x60]\n"
+ "fmla v31.4s, v17.4s, v13.4s\n"
+ "ldr q26, [x17, x13]\n"
+ "fmla v5.4s, v17.4s, v22.4s\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "fmla v29.4s, v17.4s, v28.4s\n"
+ "ldr q17, [x8, #0x70]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ld1 { v25.4s }, [x16]\n"
+ "fmla v5.4s, v20.4s, v21.4s\n"
+ "fmla v30.4s, v20.4s, v28.4s\n"
+ "fmla v29.4s, v20.4s, v26.4s\n"
+ "ldr q24, [x8, #0x80]\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "ldr q23, [x16, x2]\n"
+ "fmla v5.4s, v16.4s, v10.4s\n"
+ "ldr q0, [x16, x6]\n"
+ "fmla v30.4s, v16.4s, v26.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q22, [x8, #0x90]\n"
+ "fmla v31.4s, v19.4s, v14.4s\n"
+ "ldr q16, [x16, x11]\n"
+ "fmla v5.4s, v19.4s, v6.4s\n"
+ "fmla v30.4s, v19.4s, v25.4s\n"
+ "fmla v29.4s, v19.4s, v23.4s\n"
+ "ldr q21, [x8, #0xa0]\n"
+ "fmla v31.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x16, x15]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "fmla v29.4s, v18.4s, v0.4s\n"
+ "ldr q18, [x8, #0xb0]\n"
+ "fmla v31.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x16, x13]\n"
+ "fmla v5.4s, v17.4s, v28.4s\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.4s, v17.4s, v0.4s\n"
+ "fmla v29.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x8, #0xc0]\n"
+ "fmla v31.4s, v24.4s, v28.4s\n"
+ "ld1 { v7.4s }, [x14]\n"
+ "fmla v5.4s, v24.4s, v26.4s\n"
+ "fmla v30.4s, v24.4s, v20.4s\n"
+ "fmla v29.4s, v24.4s, v19.4s\n"
+ "ldr q2, [x8, #0xd0]\n"
+ "fmla v31.4s, v22.4s, v26.4s\n"
+ "ldr q28, [x14, x2]\n"
+ "fmla v5.4s, v22.4s, v1.4s\n"
+ "ldr q13, [x14, x13]\n"
+ "fmla v30.4s, v22.4s, v19.4s\n"
+ "fmla v29.4s, v22.4s, v16.4s\n"
+ "ldr q14, [x8, #0xe0]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "ldr q26, [x14, x6]\n"
+ "fmla v5.4s, v21.4s, v23.4s\n"
+ "fmla v30.4s, v21.4s, v7.4s\n"
+ "fmla v29.4s, v21.4s, v28.4s\n"
+ "ldr q25, [x8, #0xf0]\n"
+ "fmla v31.4s, v18.4s, v23.4s\n"
+ "ldr q24, [x14, x15]\n"
+ "fmla v5.4s, v18.4s, v0.4s\n"
+ "fmla v30.4s, v18.4s, v28.4s\n"
+ "fmla v29.4s, v18.4s, v26.4s\n"
+ "ldr q23, [x8, #0x100]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "ldr q22, [x14, x11]\n"
+ "fmla v5.4s, v17.4s, v20.4s\n"
+ "add x14, x14, #0x10\n"
+ "fmla v30.4s, v17.4s, v26.4s\n"
+ "fmla v29.4s, v17.4s, v24.4s\n"
+ "ldr q21, [x8, #0x110]\n"
+ "fmla v31.4s, v2.4s, v20.4s\n"
+ "ld1 { v18.4s }, [x12]\n"
+ "fmla v5.4s, v2.4s, v19.4s\n"
+ "fmla v30.4s, v2.4s, v24.4s\n"
+ "fmla v29.4s, v2.4s, v13.4s\n"
+ "ldr q20, [x8, #0x120]\n"
+ "fmla v31.4s, v14.4s, v19.4s\n"
+ "ldr q17, [x12, x2]\n"
+ "fmla v5.4s, v14.4s, v16.4s\n"
+ "fmla v30.4s, v14.4s, v13.4s\n"
+ "fmla v29.4s, v14.4s, v22.4s\n"
+ "ldr q19, [x8, #0x130]\n"
+ "add x8, x8, #0x140\n"
+ "fmla v31.4s, v25.4s, v7.4s\n"
+ "ldr q16, [x12, x6]\n"
+ "fmla v5.4s, v25.4s, v28.4s\n"
+ "fmla v30.4s, v25.4s, v18.4s\n"
+ "ldr q18, [x12, x15]\n"
+ "fmla v29.4s, v25.4s, v17.4s\n"
+ "fmla v31.4s, v23.4s, v28.4s\n"
+ "fmla v5.4s, v23.4s, v26.4s\n"
+ "fmla v30.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x12, x13]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v5.4s, v21.4s, v24.4s\n"
+ "fmla v30.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x12, x11]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.4s, v20.4s, v24.4s\n"
+ "fmla v5.4s, v20.4s, v13.4s\n"
+ "fmla v30.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "fmla v31.4s, v19.4s, v13.4s\n"
+ "fmla v5.4s, v19.4s, v22.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmla v30.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "fmax v5.4s, v5.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v5.4s, v5.4s, v15.4s\n"
+ "st1 { v31.4s }, [x5]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q5, [x5, x3]\n"
+ "add x5, x5, #0x10\n"
+ "st1 { v30.4s }, [x10]\n"
+ "str q29, [x10, x3]\n"
+ "add x10, x10, #0x10\n"
+ // Oddments: handle the trailing n_channels % 4 lanes with per-lane
+ // (tbz-guarded) loads and stores.
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 61f\n"
+ "ldr q25, [x8, #0x0]\n"
+ "ldr q0, [x8, #0x10]\n"
+ "add x9, x4, XZR\n"
+ "add x28, x4, x2\n"
+ "ldr q1, [x8, #0x20]\n"
+ "ldr q2, [x8, #0x30]\n"
+ "add x27, x7, XZR\n"
+ "add x26, x7, x2\n"
+ "ldr q3, [x8, #0x40]\n"
+ "ldr q4, [x8, #0x50]\n"
+ "add x25, x4, x6\n"
+ "add x24, x7, x6\n"
+ "add x23, x4, x15\n"
+ "add x22, x4, x13\n"
+ "add x21, x7, x11\n"
+ "add x20, x17, XZR\n"
+ "add x8, x8, #0x60\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr d5, [x9], #0x8\n"
+ "ldr d6, [x28], #0x8\n"
+ "ldr d7, [x27], #0x8\n"
+ "ldr d8, [x26], #0x8\n"
+ "ldr d9, [x25], #0x8\n"
+ "ldr d13, [x24], #0x8\n"
+ "ldr d11, [x23], #0x8\n"
+ "ldr d12, [x22], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v5.s }[2], [x9]\n"
+ "ld1 { v6.s }[2], [x28]\n"
+ "ld1 { v7.s }[2], [x27]\n"
+ "ld1 { v8.s }[2], [x26]\n"
+ "ld1 { v9.s }[2], [x25]\n"
+ "ld1 { v13.s }[2], [x24]\n"
+ "ld1 { v11.s }[2], [x23]\n"
+ "ld1 { v12.s }[2], [x22]\n"
+ "ld1 { v10.s }[2], [x21]\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+ "ldr s5, [x9, #0x0]\n"
+ "ldr s6, [x28, #0x0]\n"
+ "ldr s7, [x27, #0x0]\n"
+ "ldr s8, [x26, #0x0]\n"
+ "ldr s9, [x25, #0x0]\n"
+ "ldr s13, [x24, #0x0]\n"
+ "ldr s11, [x23, #0x0]\n"
+ "ldr s12, [x22, #0x0]\n"
+ "ldr s10, [x21, #0x0]\n"
+ "ldr s14, [x20, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+ "mov v28.16b, v25.16b\n fmla v28.4s, v0.4s, v5.4s\n"
+ "mov v29.16b, v25.16b\n fmla v29.4s, v0.4s, v6.4s\n"
+ "add x20, x7, x15\n"
+ "mov v30.16b, v25.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v31.16b, v25.16b\n fmla v31.4s, v0.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v6.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr s5, [x20, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v31.4s, v2.4s, v5.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x7, x13\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v5.4s\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr s6, [x20, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v31.4s, v3.4s, v6.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "add x20, x4, x11\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v29.4s, v4.4s, v9.4s\n"
+ "fmla v30.4s, v4.4s, v6.4s\n"
+ "add x20, x17, x2\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v7.4s\n"
+ "add x8, x8, #0x10\n"
+ "fmla v29.4s, v0.4s, v8.4s\n"
+ "fmla v30.4s, v0.4s, v14.4s\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v8.4s\n"
+ "add x20, x17, x6\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v1.4s, v11.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "add x20, x17, x15\n"
+ "fmla v29.4s, v2.4s, v5.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "fmla v28.4s, v3.4s, v5.4s\n"
+ "add x20, x17, x13\n"
+ "fmla v29.4s, v3.4s, v6.4s\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v6.4s\n"
+ "add x20, x17, x11\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr d8, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v8.s }[2], [x20]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+ "ldr s8, [x20, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.4s, v4.4s, v8.4s\n"
+ "fmla v28.4s, v0.4s, v14.4s\n"
+ "add x20, x16, XZR\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr s5, [x20, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v30.4s, v0.4s, v5.4s\n"
+ "add x20, x16, x2\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr s6, [x20, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.4s, v0.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "add x20, x16, x6\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "add x20, x16, x15\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
+ "add x20, x16, x13\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "add x20, x16, x11\n"
+ "fmla v29.4s, v4.4s, v8.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr d14, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v14.s }[2], [x20]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+ "ldr s14, [x20, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.4s, v4.4s, v14.4s\n"
+ "fmla v28.4s, v0.4s, v5.4s\n"
+ "add x20, x14, XZR\n"
+ "fmla v29.4s, v0.4s, v6.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v30.4s, v0.4s, v9.4s\n"
+ "add x20, x14, x2\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr d13, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr s13, [x20, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.4s, v0.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v6.4s\n"
+ "add x20, x14, x6\n"
+ "fmla v29.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v13.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr d5, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr s5, [x20, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v5.4s\n"
+ "fmla v28.4s, v2.4s, v10.4s\n"
+ "add x20, x14, x15\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v5.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr d6, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v6.s }[2], [x20]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr s6, [x20, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.4s, v2.4s, v6.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x14, x13\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v6.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr d8, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v8.s }[2], [x20]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr s8, [x20, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.4s, v3.4s, v8.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "add x20, x14, x11\n"
+ "fmla v29.4s, v4.4s, v14.4s\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr d10, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v10.s }[2], [x20]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+ "ldr s10, [x20, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+ "ldr q0, [x8, #0x0]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v9.4s\n"
+ "add x20, x12, XZR\n"
+ "fmla v29.4s, v0.4s, v13.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x12, x2\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 50f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 50f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "50:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+ "ldr q1, [x8, #0x0]\n"
+ "fmla v31.4s, v0.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "add x20, x12, x6\n"
+ "fmla v29.4s, v1.4s, v5.4s\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+ "ldr q2, [x8, #0x0]\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v5.4s\n"
+ "add x20, x12, x15\n"
+ "fmla v29.4s, v2.4s, v6.4s\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ldr d11, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 54f\n"
+ "ld1 { v11.s }[2], [x20]\n"
+ "b 54f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+ "ldr s11, [x20, #0x0]\n"
+ "54:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+ "ldr q3, [x8, #0x0]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v6.4s\n"
+ "add x20, x12, x13\n"
+ "fmla v29.4s, v3.4s, v8.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "add x8, x8, #0x10\n"
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr d12, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.s }[2], [x20]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+ "ldr s12, [x20, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+ "ldr q4, [x8, #0x0]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v8.4s\n"
+ "add x20, x12, x11\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ldr d9, [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 58f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 58f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+ "ldr s9, [x20, #0x0]\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+ "fmla v31.4s, v4.4s, v9.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "tbz %x[n_channels], #1, 59f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.d }[0], [x21], x3\n"
+ "st1 { v30.d }[0], [x20], x3\n"
+ "add x5, x5, #0x8\n"
+ "add x10, x10, #0x8\n"
+ "st1 { v29.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[2], [x21], x3\n"
+ "st1 { v30.s }[2], [x20], x3\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x21, x5\n"
+ "mov x20, x10\n"
+ "st1 { v28.s }[0], [x21], x3\n"
+ "st1 { v30.s }[0], [x20], x3\n"
+ "st1 { v29.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "60:" // Tile loop: Oddments: Store: Bit 1: End
+ // Advance to the next tile (row-major over the tile grid); loop back to
+ // label 1 until every tile row has been emitted.
+ "61:" // Tile loop: End
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x26, x26, #0x1\n"
+ "add x21, x27, #0x1\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x20\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x27, x27, x21, LT\n"
+ "csel x26, x26, XZR, LT\n"
+ "cmp x27, x20\n"
+ "blt 1b\n"
+ // No outputs; all state is reached through params_struct + offsetof
+ // immediates, hence the "memory" clobber.
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..32939eb6dc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1043 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "mov x17, #0x10\n" // cntb _, ALL, #1
+ "lsr x9, %x[n_channels], #0x2\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "ld1r { v27.4s }, [x20]\n"
+ "add x20, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "ldp x12, x11, [x21, #0x10]\n"
+ "mov x10, #0x0\n"
+ "sub x28, XZR, x17\n"
+ "cbz x9, 3f\n"
+ "ldr q26, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "cmp x17, x9, LSL #4\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x16, x16, #0x60\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "ldr q5, [x21, x10]\n"
+ "ldr q6, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x10]\n"
+ "ldr q8, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q9, [x21, x10]\n"
+ "ldr q13, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr q11, [x21, x10]\n"
+ "ldr q12, [x20, x10]\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x10]\n"
+ "ldr q14, [x20, x10]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v5.4s\n"
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q24, [x20, x10]\n"
+ "mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q23, [x16, #0x0]\n"
+ "ldr q26, [x16, #0x140]\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr q22, [x20, x10]\n"
+ "fmla v28.4s, v1.4s, v8.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "ldr q21, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "ldr q17, [x20, x10]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v24.4s\n"
+ "ldr q16, [x16, #0x20]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "ldr q5, [x20, x10]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v28.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v3.4s, v22.4s\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x21, [x15, #0x80]\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "ldr q19, [x22, x10]\n"
+ "fmla v31.4s, v4.4s, v17.4s\n"
+ "ldr q2, [x20, x10]\n"
+ "fmla v28.4s, v4.4s, v22.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "ldr q18, [x16, #0x40]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v30.4s, v23.4s, v7.4s\n"
+ "fmla v31.4s, v23.4s, v8.4s\n"
+ "ldr x23, [x15, #0x90]\n"
+ "ldr x26, [x15, #0x98]\n"
+ "fmla v28.4s, v23.4s, v14.4s\n"
+ "fmla v29.4s, v23.4s, v5.4s\n"
+ "ldr q1, [x16, #0x50]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "fmla v30.4s, v21.4s, v8.4s\n"
+ "ldr q25, [x20, x10]\n"
+ "fmla v31.4s, v21.4s, v13.4s\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v28.4s, v21.4s, v5.4s\n"
+ "fmla v29.4s, v21.4s, v19.4s\n"
+ "ldr q17, [x16, #0x60]\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.4s, v16.4s, v13.4s\n"
+ "ldr q8, [x21, x10]\n"
+ "fmla v31.4s, v16.4s, v24.4s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v28.4s, v16.4s, v19.4s\n"
+ "fmla v29.4s, v16.4s, v2.4s\n"
+ "ldr q16, [x16, #0x70]\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "fmla v30.4s, v20.4s, v24.4s\n"
+ "ldr q24, [x23, x10]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ldr x27, [x15, #0xc8]\n"
+ "fmla v28.4s, v20.4s, v2.4s\n"
+ "fmla v29.4s, v20.4s, v8.4s\n"
+ "ldr q23, [x16, #0x80]\n"
+ "ldr x23, [x15, #0xd0]\n"
+ "fmla v30.4s, v18.4s, v22.4s\n"
+ "ldr q22, [x26, x10]\n"
+ "fmla v31.4s, v18.4s, v10.4s\n"
+ "ldr q21, [x22, x10]\n"
+ "fmla v28.4s, v18.4s, v8.4s\n"
+ "fmla v29.4s, v18.4s, v25.4s\n"
+ "ldr q20, [x16, #0x90]\n"
+ "ldr x22, [x15, #0xd8]\n"
+ "fmla v30.4s, v1.4s, v14.4s\n"
+ "ldr q0, [x20, x10]\n"
+ "fmla v31.4s, v1.4s, v5.4s\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v28.4s, v1.4s, v24.4s\n"
+ "fmla v29.4s, v1.4s, v22.4s\n"
+ "ldr q6, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v30.4s, v17.4s, v5.4s\n"
+ "ldr q1, [x25, x10]\n"
+ "fmla v31.4s, v17.4s, v19.4s\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v28.4s, v17.4s, v22.4s\n"
+ "fmla v29.4s, v17.4s, v21.4s\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v30.4s, v16.4s, v19.4s\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v31.4s, v16.4s, v2.4s\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v28.4s, v16.4s, v21.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v30.4s, v23.4s, v2.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.4s, v23.4s, v8.4s\n"
+ "ldr x21, [x15, #0x100]\n"
+ "fmla v28.4s, v23.4s, v1.4s\n"
+ "fmla v29.4s, v23.4s, v19.4s\n"
+ "ldr q13, [x16, #0xd0]\n"
+ "fmla v30.4s, v20.4s, v8.4s\n"
+ "ldr q2, [x27, x10]\n"
+ "fmla v31.4s, v20.4s, v25.4s\n"
+ "ldr q10, [x20, x10]\n"
+ "fmla v28.4s, v20.4s, v19.4s\n"
+ "fmla v29.4s, v20.4s, v0.4s\n"
+ "ldr q9, [x16, #0xe0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v30.4s, v6.4s, v24.4s\n"
+ "ldr q5, [x23, x10]\n"
+ "fmla v31.4s, v6.4s, v22.4s\n"
+ "ldr x23, [x15, #0x110]\n"
+ "fmla v28.4s, v6.4s, v16.4s\n"
+ "fmla v29.4s, v6.4s, v2.4s\n"
+ "ldr q24, [x16, #0xf0]\n"
+ "fmla v30.4s, v18.4s, v22.4s\n"
+ "ldr q25, [x22, x10]\n"
+ "fmla v31.4s, v18.4s, v21.4s\n"
+ "ldr x22, [x15, #0x118]\n"
+ "fmla v28.4s, v18.4s, v2.4s\n"
+ "fmla v29.4s, v18.4s, v5.4s\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v30.4s, v17.4s, v21.4s\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v31.4s, v17.4s, v1.4s\n"
+ "fmla v28.4s, v17.4s, v5.4s\n"
+ "fmla v29.4s, v17.4s, v25.4s\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v30.4s, v13.4s, v1.4s\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v31.4s, v13.4s, v19.4s\n"
+ "fmla v28.4s, v13.4s, v25.4s\n"
+ "fmla v29.4s, v13.4s, v10.4s\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v30.4s, v9.4s, v19.4s\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v31.4s, v9.4s, v0.4s\n"
+ "fmla v28.4s, v9.4s, v10.4s\n"
+ "fmla v29.4s, v9.4s, v22.4s\n"
+ "ldr q19, [x16, #0x130]\n"
+ "fmla v30.4s, v24.4s, v16.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v31.4s, v24.4s, v2.4s\n"
+ "fmla v28.4s, v24.4s, v18.4s\n"
+ "ldr q18, [x20, x10]\n"
+ "fmla v29.4s, v24.4s, v17.4s\n"
+ "ldr q0, [x16, #0x150]\n"
+ "fmla v30.4s, v23.4s, v2.4s\n"
+ "fmla v31.4s, v23.4s, v5.4s\n"
+ "ldp x21, x20, [x15, #0x0]\n"
+ "fmla v28.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x23, x10]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "ldr q1, [x16, #0x160]\n"
+ "fmla v30.4s, v21.4s, v5.4s\n"
+ "ldr q5, [x21, x17]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "fmla v28.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x22, x10]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
+ "ldr q2, [x16, #0x170]\n"
+ "fmla v30.4s, v20.4s, v25.4s\n"
+ "ldr q6, [x20, x17]\n"
+ "fmla v31.4s, v20.4s, v10.4s\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr q7, [x21, x17]\n"
+ "fmla v28.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "ldr q3, [x16, #0x180]\n"
+ "fmla v30.4s, v19.4s, v10.4s\n"
+ "ldr q8, [x20, x17]\n"
+ "fmla v31.4s, v19.4s, v22.4s\n"
+ "ldp x21, x20, [x15, #0x20]\n"
+ "ldr q13, [x20, x17]\n"
+ "fmla v28.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "ldr q9, [x21, x17]\n"
+ "ldr q4, [x16, #0x190]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "ldr q11, [x21, x17]\n"
+ "ldr q12, [x20, x17]\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "ldp x21, x20, [x15, #0x40]\n"
+ "ldr q10, [x21, x17]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "ldr q14, [x20, x17]\n"
+ "add x17, x17, #0x10\n"
+ "cmp x17, x9, LSL #4\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "add x10, x10, #0x10\n"
+ "str q30, [x14, x28]\n"
+ "add x16, x16, #0x1a0\n"
+ "str q31, [x13, x28]\n"
+ "str q28, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v5.4s\n"
+ "mov v5.16b, v26.16b\n fmla v5.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "ldr q22, [x20, x10]\n"
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v8.4s\n"
+ "ldr q19, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.4s, v1.4s, v6.4s\n"
+ "ldr q21, [x20, x10]\n"
+ "fmla v5.4s, v1.4s, v9.4s\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "ldr q18, [x16, #0x10]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "ldr q16, [x21, x10]\n"
+ "fmla v5.4s, v2.4s, v11.4s\n"
+ "ldr x23, [x15, #0x70]\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v22.4s\n"
+ "ldr q17, [x16, #0x20]\n"
+ "ldr x21, [x15, #0x78]\n"
+ "fmla v31.4s, v3.4s, v11.4s\n"
+ "ldr q6, [x20, x10]\n"
+ "fmla v5.4s, v3.4s, v12.4s\n"
+ "ldr x22, [x15, #0x80]\n"
+ "fmla v30.4s, v3.4s, v22.4s\n"
+ "fmla v29.4s, v3.4s, v21.4s\n"
+ "ldr q20, [x16, #0x30]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.4s, v4.4s, v12.4s\n"
+ "ldr q2, [x23, x10]\n"
+ "fmla v5.4s, v4.4s, v16.4s\n"
+ "ldr q28, [x21, x10]\n"
+ "fmla v30.4s, v4.4s, v21.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "ldr q16, [x16, #0x40]\n"
+ "ldr x21, [x15, #0x90]\n"
+ "fmla v31.4s, v19.4s, v7.4s\n"
+ "fmla v5.4s, v19.4s, v8.4s\n"
+ "ldr x27, [x15, #0x98]\n"
+ "ldr x26, [x15, #0xa0]\n"
+ "fmla v30.4s, v19.4s, v14.4s\n"
+ "fmla v29.4s, v19.4s, v6.4s\n"
+ "ldr q19, [x16, #0x50]\n"
+ "ldr x25, [x15, #0xa8]\n"
+ "fmla v31.4s, v18.4s, v8.4s\n"
+ "ldr q1, [x20, x10]\n"
+ "fmla v5.4s, v18.4s, v13.4s\n"
+ "ldr x24, [x15, #0xb0]\n"
+ "fmla v30.4s, v18.4s, v6.4s\n"
+ "fmla v29.4s, v18.4s, v2.4s\n"
+ "ldr q18, [x16, #0x60]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.4s, v17.4s, v13.4s\n"
+ "ldr q26, [x22, x10]\n"
+ "fmla v5.4s, v17.4s, v22.4s\n"
+ "ldr x23, [x15, #0xc0]\n"
+ "fmla v30.4s, v17.4s, v2.4s\n"
+ "fmla v29.4s, v17.4s, v28.4s\n"
+ "ldr q17, [x16, #0x70]\n"
+ "ldr x22, [x15, #0xc8]\n"
+ "fmla v31.4s, v20.4s, v22.4s\n"
+ "ldr q25, [x21, x10]\n"
+ "fmla v5.4s, v20.4s, v21.4s\n"
+ "ldr x21, [x15, #0xd0]\n"
+ "fmla v30.4s, v20.4s, v28.4s\n"
+ "fmla v29.4s, v20.4s, v26.4s\n"
+ "ldr q24, [x16, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.4s, v16.4s, v21.4s\n"
+ "ldr q23, [x27, x10]\n"
+ "fmla v5.4s, v16.4s, v10.4s\n"
+ "ldr q0, [x26, x10]\n"
+ "fmla v30.4s, v16.4s, v26.4s\n"
+ "fmla v29.4s, v16.4s, v1.4s\n"
+ "ldr q22, [x16, #0x90]\n"
+ "ldr x27, [x15, #0xd8]\n"
+ "fmla v31.4s, v19.4s, v14.4s\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v5.4s, v19.4s, v6.4s\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v30.4s, v19.4s, v25.4s\n"
+ "fmla v29.4s, v19.4s, v23.4s\n"
+ "ldr q21, [x16, #0xa0]\n"
+ "ldr x26, [x15, #0xf8]\n"
+ "fmla v31.4s, v18.4s, v6.4s\n"
+ "ldr q20, [x25, x10]\n"
+ "fmla v5.4s, v18.4s, v2.4s\n"
+ "ldr x25, [x15, #0xe8]\n"
+ "fmla v30.4s, v18.4s, v23.4s\n"
+ "fmla v29.4s, v18.4s, v0.4s\n"
+ "ldr q18, [x16, #0xb0]\n"
+ "fmla v31.4s, v17.4s, v2.4s\n"
+ "ldr q19, [x24, x10]\n"
+ "fmla v5.4s, v17.4s, v28.4s\n"
+ "ldr x24, [x15, #0xf0]\n"
+ "fmla v30.4s, v17.4s, v0.4s\n"
+ "fmla v29.4s, v17.4s, v20.4s\n"
+ "ldr q17, [x16, #0xc0]\n"
+ "fmla v31.4s, v24.4s, v28.4s\n"
+ "ldr q7, [x23, x10]\n"
+ "fmla v5.4s, v24.4s, v26.4s\n"
+ "ldr x23, [x15, #0x100]\n"
+ "fmla v30.4s, v24.4s, v20.4s\n"
+ "fmla v29.4s, v24.4s, v19.4s\n"
+ "ldr q3, [x16, #0xd0]\n"
+ "fmla v31.4s, v22.4s, v26.4s\n"
+ "ldr q28, [x22, x10]\n"
+ "fmla v5.4s, v22.4s, v1.4s\n"
+ "ldr q13, [x20, x10]\n"
+ "fmla v30.4s, v22.4s, v19.4s\n"
+ "fmla v29.4s, v22.4s, v16.4s\n"
+ "ldr q11, [x16, #0xe0]\n"
+ "ldr x22, [x15, #0x108]\n"
+ "fmla v31.4s, v21.4s, v25.4s\n"
+ "ldr q26, [x21, x10]\n"
+ "fmla v5.4s, v21.4s, v23.4s\n"
+ "ldr x21, [x15, #0x110]\n"
+ "fmla v30.4s, v21.4s, v7.4s\n"
+ "fmla v29.4s, v21.4s, v28.4s\n"
+ "ldr q25, [x16, #0xf0]\n"
+ "fmla v31.4s, v18.4s, v23.4s\n"
+ "ldr q24, [x27, x10]\n"
+ "fmla v5.4s, v18.4s, v0.4s\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v30.4s, v18.4s, v28.4s\n"
+ "fmla v29.4s, v18.4s, v26.4s\n"
+ "ldr q23, [x16, #0x100]\n"
+ "fmla v31.4s, v17.4s, v0.4s\n"
+ "ldr q22, [x25, x10]\n"
+ "fmla v5.4s, v17.4s, v20.4s\n"
+ "fmla v30.4s, v17.4s, v26.4s\n"
+ "fmla v29.4s, v17.4s, v24.4s\n"
+ "ldr q21, [x16, #0x110]\n"
+ "fmla v31.4s, v3.4s, v20.4s\n"
+ "ldr q18, [x24, x10]\n"
+ "fmla v5.4s, v3.4s, v19.4s\n"
+ "fmla v30.4s, v3.4s, v24.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "ldr q20, [x16, #0x120]\n"
+ "fmla v31.4s, v11.4s, v19.4s\n"
+ "ldr q17, [x26, x10]\n"
+ "fmla v5.4s, v11.4s, v16.4s\n"
+ "fmla v30.4s, v11.4s, v13.4s\n"
+ "fmla v29.4s, v11.4s, v22.4s\n"
+ "ldr q19, [x16, #0x130]\n"
+ "add x16, x16, #0x140\n"
+ "fmla v31.4s, v25.4s, v7.4s\n"
+ "ldr q16, [x23, x10]\n"
+ "fmla v5.4s, v25.4s, v28.4s\n"
+ "fmla v30.4s, v25.4s, v18.4s\n"
+ "ldr q18, [x22, x10]\n"
+ "fmla v29.4s, v25.4s, v17.4s\n"
+ "fmla v31.4s, v23.4s, v28.4s\n"
+ "fmla v5.4s, v23.4s, v26.4s\n"
+ "fmla v30.4s, v23.4s, v17.4s\n"
+ "ldr q17, [x21, x10]\n"
+ "fmla v29.4s, v23.4s, v16.4s\n"
+ "fmla v31.4s, v21.4s, v26.4s\n"
+ "fmla v5.4s, v21.4s, v24.4s\n"
+ "fmla v30.4s, v21.4s, v16.4s\n"
+ "ldr q16, [x20, x10]\n"
+ "fmla v29.4s, v21.4s, v18.4s\n"
+ "add x10, x10, #0x10\n"
+ "fmla v31.4s, v20.4s, v24.4s\n"
+ "fmla v5.4s, v20.4s, v13.4s\n"
+ "fmla v30.4s, v20.4s, v18.4s\n"
+ "fmla v29.4s, v20.4s, v17.4s\n"
+ "fmla v31.4s, v19.4s, v13.4s\n"
+ "fmla v5.4s, v19.4s, v22.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmla v30.4s, v19.4s, v17.4s\n"
+ "fmla v29.4s, v19.4s, v16.4s\n"
+ "fmax v5.4s, v5.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "fmin v5.4s, v5.4s, v15.4s\n"
+ "str q31, [x14, x28]\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "str q5, [x13, x28]\n"
+ "str q30, [x12, x28]\n"
+ "str q29, [x11, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 60f\n"
+ "ldr q26, [x16, #0x0]\n"
+ "ldr q0, [x16, #0x10]\n"
+ "mov x20, x10\n"
+ "add x14, x14, x20\n"
+ "ldr q1, [x16, #0x20]\n"
+ "ldr q2, [x16, #0x30]\n"
+ "add x13, x13, x20\n"
+ "add x12, x12, x20\n"
+ "ldr q3, [x16, #0x40]\n"
+ "ldr q4, [x16, #0x50]\n"
+ "add x11, x11, x20\n"
+ "ldr x9, [x15, #0x0]\n"
+ "ldr x28, [x15, #0x8]\n"
+ "add x9, x9, x10\n"
+ "add x28, x28, x10\n"
+ "ldr x27, [x15, #0x10]\n"
+ "ldr x26, [x15, #0x18]\n"
+ "add x27, x27, x10\n"
+ "add x26, x26, x10\n"
+ "ldr x25, [x15, #0x20]\n"
+ "ldr x24, [x15, #0x28]\n"
+ "add x25, x25, x10\n"
+ "add x24, x24, x10\n"
+ "ldr x23, [x15, #0x30]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "add x23, x23, x10\n"
+ "add x22, x22, x10\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "add x21, x21, x10\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x60\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v5.d }[0], [x9], #0x8\n"
+ "ld1 { v6.d }[0], [x28], #0x8\n"
+ "ld1 { v7.d }[0], [x27], #0x8\n"
+ "ld1 { v8.d }[0], [x26], #0x8\n"
+ "ld1 { v9.d }[0], [x25], #0x8\n"
+ "ld1 { v13.d }[0], [x24], #0x8\n"
+ "ld1 { v11.d }[0], [x23], #0x8\n"
+ "ld1 { v12.d }[0], [x22], #0x8\n"
+ "ld1 { v10.d }[0], [x21], #0x8\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v5.s }[2], [x9], #0x4\n"
+ "ld1 { v6.s }[2], [x28], #0x4\n"
+ "ld1 { v7.s }[2], [x27], #0x4\n"
+ "ld1 { v8.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x25], #0x4\n"
+ "ld1 { v13.s }[2], [x24], #0x4\n"
+ "ld1 { v11.s }[2], [x23], #0x4\n"
+ "ld1 { v12.s }[2], [x22], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+ "ld1 { v5.s }[0], [x9], #0x4\n"
+ "ld1 { v6.s }[0], [x28], #0x4\n"
+ "ld1 { v7.s }[0], [x27], #0x4\n"
+ "ld1 { v8.s }[0], [x26], #0x4\n"
+ "ld1 { v9.s }[0], [x25], #0x4\n"
+ "ld1 { v13.s }[0], [x24], #0x4\n"
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "ld1 { v12.s }[0], [x22], #0x4\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+ "mov v28.16b, v26.16b\n fmla v28.4s, v0.4s, v5.4s\n"
+ "mov v29.16b, v26.16b\n fmla v29.4s, v0.4s, v6.4s\n"
+ "ldr x20, [x15, #0x50]\n"
+ "add x20, x20, x10\n"
+ "mov v30.16b, v26.16b\n fmla v30.4s, v0.4s, v7.4s\n"
+ "mov v31.16b, v26.16b\n fmla v31.4s, v0.4s, v8.4s\n"
+ "fmla v28.4s, v1.4s, v6.4s\n"
+ "fmla v29.4s, v1.4s, v9.4s\n"
+ "fmla v30.4s, v1.4s, v8.4s\n"
+ "fmla v31.4s, v1.4s, v13.4s\n"
+ "fmla v28.4s, v2.4s, v9.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v13.4s\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "7:" // Oddments: Load input (1, 3): Bit 1: End
+ "ldr x20, [x15, #0x58]\n"
+ "fmla v31.4s, v2.4s, v5.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v5.4s\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "9:" // Oddments: Load input (1, 4): Bit 1: End
+ "ldr x20, [x15, #0x60]\n"
+ "fmla v31.4s, v3.4s, v6.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 5): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "11:" // Oddments: Load input (0, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
+ "fmla v29.4s, v4.4s, v9.4s\n"
+ "fmla v30.4s, v4.4s, v6.4s\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v7.4s\n"
+ "add x20, x20, x10\n"
+ "fmla v29.4s, v0.4s, v8.4s\n"
+ "fmla v30.4s, v0.4s, v14.4s\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "13:" // Oddments: Load input (2, 1): Bit 1: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "fmla v31.4s, v0.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v8.4s\n"
+ "fmla v29.4s, v1.4s, v13.4s\n"
+ "fmla v30.4s, v1.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 2): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "15:" // Oddments: Load input (2, 2): Bit 1: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla v31.4s, v1.4s, v12.4s\n"
+ "fmla v28.4s, v2.4s, v13.4s\n"
+ "fmla v29.4s, v2.4s, v5.4s\n"
+ "fmla v30.4s, v2.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "17:" // Oddments: Load input (2, 3): Bit 1: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x80]\n"
+ "fmla v31.4s, v2.4s, v9.4s\n"
+ "fmla v28.4s, v3.4s, v5.4s\n"
+ "fmla v29.4s, v3.4s, v6.4s\n"
+ "fmla v30.4s, v3.4s, v9.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load input (2, 4): Bit 1: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla v31.4s, v3.4s, v13.4s\n"
+ "fmla v28.4s, v4.4s, v6.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v13.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (2, 5): Bit 1: Unset
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "21:" // Oddments: Load input (2, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x90]\n"
+ "fmla v31.4s, v4.4s, v8.4s\n"
+ "fmla v28.4s, v0.4s, v14.4s\n"
+ "fmla v29.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "23:" // Oddments: Load input (3, 0): Bit 1: End
+ "ldr x20, [x15, #0x98]\n"
+ "fmla v30.4s, v0.4s, v5.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "25:" // Oddments: Load input (3, 1): Bit 1: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "fmla v31.4s, v0.4s, v6.4s\n"
+ "fmla v28.4s, v1.4s, v11.4s\n"
+ "fmla v29.4s, v1.4s, v12.4s\n"
+ "fmla v30.4s, v1.4s, v6.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "27:" // Oddments: Load input (3, 2): Bit 1: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla v31.4s, v1.4s, v10.4s\n"
+ "fmla v28.4s, v2.4s, v12.4s\n"
+ "fmla v29.4s, v2.4s, v9.4s\n"
+ "fmla v30.4s, v2.4s, v10.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "29:" // Oddments: Load input (3, 3): Bit 1: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v9.4s\n"
+ "fmla v29.4s, v3.4s, v13.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "31:" // Oddments: Load input (3, 4): Bit 1: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v13.4s\n"
+ "fmla v29.4s, v4.4s, v8.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v14.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v14.s }[2], [x20], #0x4\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (3, 5): Bit 1: Unset
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "33:" // Oddments: Load input (3, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla v31.4s, v4.4s, v14.4s\n"
+ "fmla v28.4s, v0.4s, v5.4s\n"
+ "fmla v29.4s, v0.4s, v6.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "35:" // Oddments: Load input (4, 0): Bit 1: End
+ "ldr x20, [x15, #0xc8]\n"
+ "fmla v30.4s, v0.4s, v9.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v13.s }[2], [x20], #0x4\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.s }[0], [x20], #0x4\n"
+ "37:" // Oddments: Load input (4, 1): Bit 1: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xd0]\n"
+ "fmla v31.4s, v0.4s, v13.4s\n"
+ "fmla v28.4s, v1.4s, v6.4s\n"
+ "fmla v29.4s, v1.4s, v10.4s\n"
+ "fmla v30.4s, v1.4s, v13.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v5.s }[2], [x20], #0x4\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "39:" // Oddments: Load input (4, 2): Bit 1: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xd8]\n"
+ "fmla v31.4s, v1.4s, v5.4s\n"
+ "fmla v28.4s, v2.4s, v10.4s\n"
+ "fmla v29.4s, v2.4s, v11.4s\n"
+ "fmla v30.4s, v2.4s, v5.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v6.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v6.s }[2], [x20], #0x4\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "41:" // Oddments: Load input (4, 3): Bit 1: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xe0]\n"
+ "fmla v31.4s, v2.4s, v6.4s\n"
+ "fmla v28.4s, v3.4s, v11.4s\n"
+ "fmla v29.4s, v3.4s, v12.4s\n"
+ "fmla v30.4s, v3.4s, v6.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v8.s }[2], [x20], #0x4\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "43:" // Oddments: Load input (4, 4): Bit 1: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xe8]\n"
+ "fmla v31.4s, v3.4s, v8.4s\n"
+ "fmla v28.4s, v4.4s, v12.4s\n"
+ "fmla v29.4s, v4.4s, v14.4s\n"
+ "fmla v30.4s, v4.4s, v8.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (4, 5): Bit 1: Unset
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "45:" // Oddments: Load input (4, 5): Bit 1: End
+ "ldr q0, [x16, #0x0]\n"
+ "ldr x20, [x15, #0xf0]\n"
+ "fmla v31.4s, v4.4s, v10.4s\n"
+ "fmla v28.4s, v0.4s, v9.4s\n"
+ "fmla v29.4s, v0.4s, v13.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (5, 0): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "47:" // Oddments: Load input (5, 0): Bit 1: End
+ "ldr x20, [x15, #0xf8]\n"
+ "fmla v30.4s, v0.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 49f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 49f\n"
+ "48:" // Oddments: Load input (5, 1): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "49:" // Oddments: Load input (5, 1): Bit 1: End
+ "ldr q1, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x100]\n"
+ "fmla v31.4s, v0.4s, v12.4s\n"
+ "fmla v28.4s, v1.4s, v13.4s\n"
+ "fmla v29.4s, v1.4s, v5.4s\n"
+ "fmla v30.4s, v1.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (5, 2): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "51:" // Oddments: Load input (5, 2): Bit 1: End
+ "ldr q2, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x108]\n"
+ "fmla v31.4s, v1.4s, v9.4s\n"
+ "fmla v28.4s, v2.4s, v5.4s\n"
+ "fmla v29.4s, v2.4s, v6.4s\n"
+ "fmla v30.4s, v2.4s, v9.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 53f\n"
+ "ld1 { v11.s }[2], [x20], #0x4\n"
+ "b 53f\n"
+ "52:" // Oddments: Load input (5, 3): Bit 1: Unset
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "53:" // Oddments: Load input (5, 3): Bit 1: End
+ "ldr q3, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x110]\n"
+ "fmla v31.4s, v2.4s, v11.4s\n"
+ "fmla v28.4s, v3.4s, v6.4s\n"
+ "fmla v29.4s, v3.4s, v8.4s\n"
+ "fmla v30.4s, v3.4s, v11.4s\n"
+ "add x20, x20, x10\n"
+ "add x16, x16, #0x10\n"
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.s }[2], [x20], #0x4\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (5, 4): Bit 1: Unset
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "55:" // Oddments: Load input (5, 4): Bit 1: End
+ "ldr q4, [x16, #0x0]\n"
+ "ldr x20, [x15, #0x118]\n"
+ "fmla v31.4s, v3.4s, v12.4s\n"
+ "fmla v28.4s, v4.4s, v8.4s\n"
+ "fmla v29.4s, v4.4s, v10.4s\n"
+ "fmla v30.4s, v4.4s, v12.4s\n"
+ "add x20, x20, x10\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 57f\n"
+ "ld1 { v9.s }[2], [x20], #0x4\n"
+ "b 57f\n"
+ "56:" // Oddments: Load input (5, 5): Bit 1: Unset
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "57:" // Oddments: Load input (5, 5): Bit 1: End
+ "fmla v31.4s, v4.4s, v9.4s\n"
+ "fmax v28.4s, v28.4s, v27.4s\n"
+ "fmax v29.4s, v29.4s, v27.4s\n"
+ "fmax v30.4s, v30.4s, v27.4s\n"
+ "fmax v31.4s, v31.4s, v27.4s\n"
+ "fmin v28.4s, v28.4s, v15.4s\n"
+ "fmin v29.4s, v29.4s, v15.4s\n"
+ "fmin v30.4s, v30.4s, v15.4s\n"
+ "fmin v31.4s, v31.4s, v15.4s\n"
+ "tbz %x[n_channels], #1, 58f\n"
+ "st1 { v28.d }[0], [x14], #0x8\n"
+ "st1 { v29.d }[0], [x13], #0x8\n"
+ "st1 { v30.d }[0], [x12], #0x8\n"
+ "st1 { v31.d }[0], [x11], #0x8\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "st1 { v28.s }[2], [x14], #0x4\n"
+ "st1 { v29.s }[2], [x13], #0x4\n"
+ "st1 { v30.s }[2], [x12], #0x4\n"
+ "st1 { v31.s }[2], [x11], #0x4\n"
+ "b 59f\n"
+ "58:" // Oddments: Store: Bit 1: Unset
+ "st1 { v28.s }[0], [x14], #0x4\n"
+ "st1 { v29.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v31.s }[0], [x11], #0x4\n"
+ "59:" // Oddments: Store: Bit 1: End
+ "60:" // End
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..8a8060770c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+
+// Strategy wrapper exposing the generic FP32 NHWC MLA depthwise kernel
+// (nine output points per invocation) to the depthfirst driver.  The "9"
+// passed to the parent constructor is the number of output points; VLType::None
+// indicates no scalable-vector (SVE) length dependence.
+class a64_fp32_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<float, float, float, float>
+{
+  // Pointer to the assembly implementation (defined in the matching generic.cpp).
+  KernelType kernel = a64_fp32_nhwc_generic_output9_mla_depthfirst_impl;
+
+  public:
+  a64_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<float, float, float, float>(9, arm_gemm::VLType::None) {}
+
+  KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..a2f577784f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Generic FP32 NHWC depthwise kernel producing nine output points per call.
+// For each of the n_points kernel positions the assembly loads one value from
+// each of nine input pointers (so inptrs holds 9 * n_points entries, consumed
+// sequentially), multiplies them by that point's per-channel weights from
+// `params` (16 bytes, i.e. four floats, consumed per point) and accumulates
+// into nine running sums.  The sums are clamped to
+// [activation_min, activation_max] and stored through the nine `outptrs`.
+// Channels are processed four at a time (128-bit vectors); the "Oddments"
+// section handles the remaining 1-3 channels with element-wise loads/stores
+// selected by the low bits of n_channels.
+void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(
+  const float *const *const inptrs,  // 9 * n_points input element pointers
+  float *const *const outptrs,  // 9 output element pointers
+  const void *params,  // packed per-point, per-channel weights
+  const void *bias,  // optional per-channel bias; may be null (see cbz %x[bias])
+  const unsigned int n_points,  // number of kernel points to accumulate
+  const unsigned int n_channels,  // number of channels to process
+  const float activation_min,
+  const float activation_max
+)
+{
+  // Pack the clamp bounds contiguously so the assembly can splat each one
+  // with a single ld1r (min at offset 0, max at offset 4).
+  const float minmax_vals[2] = { activation_min, activation_max };
+
+  __asm__ __volatile__(
+    "ld1r { v2.4s }, [%x[minmax_vals]]\n"
+    "lsr x9, %x[n_channels], #0x2\n"
+    "add x20, %x[minmax_vals], #0x4\n"
+    "ld1r { v1.4s }, [x20]\n"
+    "mov x11, #0x0\n"
+    "cbz x9, 5f\n"
+    "1:" // Channel loop
+    "movi v23.16b, #0x0\n"
+    "cbz %x[bias], 2f\n"
+    "ldr q23, [%x[bias], x11]\n"
+    "2:" // Channel loop: Load bias: Done
+    "ldr q0, [%x[params], #0x0]\n"
+    "mov x26, %x[inptrs]\n"
+    "ldp x21, x20, [x26], #0x10\n"
+    "subs x25, %x[n_points], #0x1\n"
+    "ldr q14, [x21, x11]\n"
+    "ldr q15, [x20, x11]\n"
+    "mov v24.16b, v23.16b\n"
+    "mov v25.16b, v23.16b\n"
+    "ldp x21, x20, [x26], #0x10\n"
+    "ldr q16, [x21, x11]\n"
+    "mov v26.16b, v23.16b\n"
+    "mov v27.16b, v23.16b\n"
+    "ldr q17, [x20, x11]\n"
+    "ldp x21, x20, [x26], #0x10\n"
+    "mov v28.16b, v23.16b\n"
+    "mov v29.16b, v23.16b\n"
+    "ldr q18, [x21, x11]\n"
+    "ldr q19, [x20, x11]\n"
+    "mov v30.16b, v23.16b\n"
+    "mov v31.16b, v23.16b\n"
+    "ldp x21, x20, [x26], #0x10\n"
+    "ldr q20, [x21, x11]\n"
+    "add %x[params], %x[params], #0x10\n"
+    "ldr q21, [x20, x11]\n"
+    "ldr x20, [x26], #0x8\n"
+    "ldr q22, [x20, x11]\n"
+    "ble 4f\n"
+    "3:" // Channel loop: Planar loop
+    "ldp x20, x24, [x26], #0x10\n"
+    "ldp x23, x22, [x26], #0x10\n"
+    "subs x25, x25, #0x1\n"
+    "fmla v23.4s, v14.4s, v0.4s\n"
+    "ldr q14, [x20, x11]\n"
+    "ldp x21, x20, [x26], #0x10\n"
+    "fmla v24.4s, v15.4s, v0.4s\n"
+    "fmla v25.4s, v16.4s, v0.4s\n"
+    "ldr q15, [x24, x11]\n"
+    "ldr q16, [x23, x11]\n"
+    "fmla v26.4s, v17.4s, v0.4s\n"
+    "fmla v27.4s, v18.4s, v0.4s\n"
+    "ldr q17, [x22, x11]\n"
+    "ldr q18, [x21, x11]\n"
+    "fmla v28.4s, v19.4s, v0.4s\n"
+    "fmla v29.4s, v20.4s, v0.4s\n"
+    "ldr q19, [x20, x11]\n"
+    "ldp x21, x20, [x26], #0x10\n"
+    "fmla v30.4s, v21.4s, v0.4s\n"
+    "fmla v31.4s, v22.4s, v0.4s\n"
+    "ldr q0, [%x[params], #0x0]\n"
+    "ldr q20, [x21, x11]\n"
+    "add %x[params], %x[params], #0x10\n"
+    "ldr q21, [x20, x11]\n"
+    "ldr x20, [x26], #0x8\n"
+    "ldr q22, [x20, x11]\n"
+    "bgt 3b\n"
+    "4:" // Channel loop: Planar tail
+    "fmla v23.4s, v14.4s, v0.4s\n"
+    "fmla v24.4s, v15.4s, v0.4s\n"
+    "fmax v23.4s, v23.4s, v2.4s\n"
+    "ldp x28, x27, [%x[outptrs], #0x0]\n"
+    "fmla v25.4s, v16.4s, v0.4s\n"
+    "fmla v26.4s, v17.4s, v0.4s\n"
+    "fmax v24.4s, v24.4s, v2.4s\n"
+    "ldp x26, x25, [%x[outptrs], #0x10]\n"
+    "fmla v27.4s, v18.4s, v0.4s\n"
+    "fmla v28.4s, v19.4s, v0.4s\n"
+    "fmax v25.4s, v25.4s, v2.4s\n"
+    "ldp x24, x23, [%x[outptrs], #0x20]\n"
+    "fmla v29.4s, v20.4s, v0.4s\n"
+    "fmla v30.4s, v21.4s, v0.4s\n"
+    "fmax v26.4s, v26.4s, v2.4s\n"
+    "ldp x22, x21, [%x[outptrs], #0x30]\n"
+    "fmla v31.4s, v22.4s, v0.4s\n"
+    "fmax v27.4s, v27.4s, v2.4s\n"
+    "ldr x20, [%x[outptrs], #0x40]\n"
+    "fmax v28.4s, v28.4s, v2.4s\n"
+    "fmax v29.4s, v29.4s, v2.4s\n"
+    "fmax v30.4s, v30.4s, v2.4s\n"
+    "fmax v31.4s, v31.4s, v2.4s\n"
+    "fmin v23.4s, v23.4s, v1.4s\n"
+    "fmin v24.4s, v24.4s, v1.4s\n"
+    "str q23, [x28, x11]\n"
+    "fmin v25.4s, v25.4s, v1.4s\n"
+    "fmin v26.4s, v26.4s, v1.4s\n"
+    "str q24, [x27, x11]\n"
+    "fmin v27.4s, v27.4s, v1.4s\n"
+    "fmin v28.4s, v28.4s, v1.4s\n"
+    "str q25, [x26, x11]\n"
+    "fmin v29.4s, v29.4s, v1.4s\n"
+    "fmin v30.4s, v30.4s, v1.4s\n"
+    "str q26, [x25, x11]\n"
+    "fmin v31.4s, v31.4s, v1.4s\n"
+    "str q27, [x24, x11]\n"
+    "str q28, [x23, x11]\n"
+    "str q29, [x22, x11]\n"
+    "str q30, [x21, x11]\n"
+    "str q31, [x20, x11]\n"
+    "add x11, x11, #0x10\n"
+    "cmp x11, x9, LSL #4\n"
+    "blt 1b\n"
+    "5:" // Oddments
+    "tst %x[n_channels], #0x3\n"
+    "beq 17f\n"
+    "movi v23.16b, #0x0\n"
+    "cbz %x[bias], 8f\n"
+    "add x20, %x[bias], x11\n"
+    "tbz %x[n_channels], #1, 6f\n"
+    "ld1 { v23.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 7f\n"
+    "ld1 { v23.s }[2], [x20], #0x4\n"
+    "b 7f\n"
+    "6:" // Oddments: Load bias: Bit 1: Unset
+    "ld1 { v23.s }[0], [x20], #0x4\n"
+    "7:" // Oddments: Load bias: Bit 1: End
+    "8:" // Oddments: Load bias: Done
+    "ldr q0, [%x[params], #0x0]\n"
+    "mov x10, %x[inptrs]\n"
+    "ldp x9, x28, [x10], #0x10\n"
+    "mov v24.16b, v23.16b\n"
+    "ldp x27, x26, [x10], #0x10\n"
+    "ldp x25, x24, [x10], #0x10\n"
+    "mov v25.16b, v23.16b\n"
+    "mov v26.16b, v23.16b\n"
+    "ldp x23, x22, [x10], #0x10\n"
+    "ldr x21, [x10], #0x8\n"
+    "mov v27.16b, v23.16b\n"
+    "mov v28.16b, v23.16b\n"
+    "mov v29.16b, v23.16b\n"
+    "mov v30.16b, v23.16b\n"
+    "add x9, x9, x11\n"
+    "add x28, x28, x11\n"
+    "mov v31.16b, v23.16b\n"
+    "add x27, x27, x11\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "add %x[params], %x[params], #0x10\n"
+    "tbz %x[n_channels], #1, 9f\n"
+    "ldr d14, [x9], #0x8\n"
+    "ldr d15, [x28], #0x8\n"
+    "ldr d16, [x27], #0x8\n"
+    "ldr d17, [x26], #0x8\n"
+    "ldr d18, [x25], #0x8\n"
+    "ldr d19, [x24], #0x8\n"
+    "ldr d20, [x23], #0x8\n"
+    "ldr d21, [x22], #0x8\n"
+    "ldr d22, [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 10f\n"
+    "ld1 { v14.s }[2], [x9], #0x4\n"
+    "ld1 { v15.s }[2], [x28], #0x4\n"
+    "ld1 { v16.s }[2], [x27], #0x4\n"
+    "ld1 { v17.s }[2], [x26], #0x4\n"
+    "ld1 { v18.s }[2], [x25], #0x4\n"
+    "ld1 { v19.s }[2], [x24], #0x4\n"
+    "ld1 { v20.s }[2], [x23], #0x4\n"
+    "ld1 { v21.s }[2], [x22], #0x4\n"
+    "ld1 { v22.s }[2], [x21], #0x4\n"
+    "b 10f\n"
+    "9:" // Oddments: Load: Bit 1: Unset
+    "ldr s14, [x9], #0x4\n"
+    "ldr s15, [x28], #0x4\n"
+    "ldr s16, [x27], #0x4\n"
+    "ldr s17, [x26], #0x4\n"
+    "ldr s18, [x25], #0x4\n"
+    "ldr s19, [x24], #0x4\n"
+    "ldr s20, [x23], #0x4\n"
+    "ldr s21, [x22], #0x4\n"
+    "ldr s22, [x21], #0x4\n"
+    "10:" // Oddments: Load: Bit 1: End
+    "subs x20, %x[n_points], #0x1\n"
+    "ble 14f\n"
+    "11:" // Oddments: Planar loop
+    "ldp x9, x28, [x10], #0x10\n"
+    "ldp x27, x26, [x10], #0x10\n"
+    "fmla v23.4s, v14.4s, v0.4s\n"
+    "fmla v24.4s, v15.4s, v0.4s\n"
+    "ldp x25, x24, [x10], #0x10\n"
+    "ldp x23, x22, [x10], #0x10\n"
+    "fmla v25.4s, v16.4s, v0.4s\n"
+    "fmla v26.4s, v17.4s, v0.4s\n"
+    "ldr x21, [x10], #0x8\n"
+    "fmla v27.4s, v18.4s, v0.4s\n"
+    "fmla v28.4s, v19.4s, v0.4s\n"
+    "add x9, x9, x11\n"
+    "fmla v29.4s, v20.4s, v0.4s\n"
+    "fmla v30.4s, v21.4s, v0.4s\n"
+    "add x28, x28, x11\n"
+    "add x27, x27, x11\n"
+    "fmla v31.4s, v22.4s, v0.4s\n"
+    "ldr q0, [%x[params], #0x0]\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "add %x[params], %x[params], #0x10\n"
+    "tbz %x[n_channels], #1, 12f\n"
+    "ldr d14, [x9], #0x8\n"
+    "ldr d15, [x28], #0x8\n"
+    "ldr d16, [x27], #0x8\n"
+    "ldr d17, [x26], #0x8\n"
+    "ldr d18, [x25], #0x8\n"
+    "ldr d19, [x24], #0x8\n"
+    "ldr d20, [x23], #0x8\n"
+    "ldr d21, [x22], #0x8\n"
+    "ldr d22, [x21], #0x8\n"
+    "tbz %x[n_channels], #0, 13f\n"
+    "ld1 { v14.s }[2], [x9], #0x4\n"
+    "ld1 { v15.s }[2], [x28], #0x4\n"
+    "ld1 { v16.s }[2], [x27], #0x4\n"
+    "ld1 { v17.s }[2], [x26], #0x4\n"
+    "ld1 { v18.s }[2], [x25], #0x4\n"
+    "ld1 { v19.s }[2], [x24], #0x4\n"
+    "ld1 { v20.s }[2], [x23], #0x4\n"
+    "ld1 { v21.s }[2], [x22], #0x4\n"
+    "ld1 { v22.s }[2], [x21], #0x4\n"
+    "b 13f\n"
+    "12:" // Oddments: Planar loop: Load: Bit 1: Unset
+    "ldr s14, [x9], #0x4\n"
+    "ldr s15, [x28], #0x4\n"
+    "ldr s16, [x27], #0x4\n"
+    "ldr s17, [x26], #0x4\n"
+    "ldr s18, [x25], #0x4\n"
+    "ldr s19, [x24], #0x4\n"
+    "ldr s20, [x23], #0x4\n"
+    "ldr s21, [x22], #0x4\n"
+    "ldr s22, [x21], #0x4\n"
+    "13:" // Oddments: Planar loop: Load: Bit 1: End
+    "subs x20, x20, #0x1\n"
+    "bgt 11b\n"
+    "14:" // Oddments: Planar tail
+    "fmla v23.4s, v14.4s, v0.4s\n"
+    "fmla v24.4s, v15.4s, v0.4s\n"
+    "fmax v23.4s, v23.4s, v2.4s\n"
+    "ldp x28, x27, [%x[outptrs], #0x0]\n"
+    "fmla v25.4s, v16.4s, v0.4s\n"
+    "fmla v26.4s, v17.4s, v0.4s\n"
+    "fmax v24.4s, v24.4s, v2.4s\n"
+    "ldp x26, x25, [%x[outptrs], #0x10]\n"
+    "fmla v27.4s, v18.4s, v0.4s\n"
+    "fmla v28.4s, v19.4s, v0.4s\n"
+    "fmax v25.4s, v25.4s, v2.4s\n"
+    "ldp x24, x23, [%x[outptrs], #0x20]\n"
+    "fmla v29.4s, v20.4s, v0.4s\n"
+    "fmla v30.4s, v21.4s, v0.4s\n"
+    "fmax v26.4s, v26.4s, v2.4s\n"
+    "ldp x22, x21, [%x[outptrs], #0x30]\n"
+    "fmla v31.4s, v22.4s, v0.4s\n"
+    "fmax v27.4s, v27.4s, v2.4s\n"
+    "ldr x20, [%x[outptrs], #0x40]\n"
+    "add x28, x28, x11\n"
+    "fmax v28.4s, v28.4s, v2.4s\n"
+    "fmax v29.4s, v29.4s, v2.4s\n"
+    "add x27, x27, x11\n"
+    "add x26, x26, x11\n"
+    "fmax v30.4s, v30.4s, v2.4s\n"
+    "fmax v31.4s, v31.4s, v2.4s\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "fmin v23.4s, v23.4s, v1.4s\n"
+    "fmin v24.4s, v24.4s, v1.4s\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "fmin v25.4s, v25.4s, v1.4s\n"
+    "fmin v26.4s, v26.4s, v1.4s\n"
+    "add x21, x21, x11\n"
+    "add x20, x20, x11\n"
+    "fmin v27.4s, v27.4s, v1.4s\n"
+    "fmin v28.4s, v28.4s, v1.4s\n"
+    "fmin v29.4s, v29.4s, v1.4s\n"
+    "fmin v30.4s, v30.4s, v1.4s\n"
+    "fmin v31.4s, v31.4s, v1.4s\n"
+    "tbz %x[n_channels], #1, 15f\n"
+    "st1 { v23.d }[0], [x28], #0x8\n"
+    "st1 { v24.d }[0], [x27], #0x8\n"
+    "st1 { v25.d }[0], [x26], #0x8\n"
+    "st1 { v26.d }[0], [x25], #0x8\n"
+    "st1 { v27.d }[0], [x24], #0x8\n"
+    "st1 { v28.d }[0], [x23], #0x8\n"
+    "st1 { v29.d }[0], [x22], #0x8\n"
+    "st1 { v30.d }[0], [x21], #0x8\n"
+    "st1 { v31.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 16f\n"
+    "st1 { v23.s }[2], [x28], #0x4\n"
+    "st1 { v24.s }[2], [x27], #0x4\n"
+    "st1 { v25.s }[2], [x26], #0x4\n"
+    "st1 { v26.s }[2], [x25], #0x4\n"
+    "st1 { v27.s }[2], [x24], #0x4\n"
+    "st1 { v28.s }[2], [x23], #0x4\n"
+    "st1 { v29.s }[2], [x22], #0x4\n"
+    "st1 { v30.s }[2], [x21], #0x4\n"
+    "st1 { v31.s }[2], [x20], #0x4\n"
+    "b 16f\n"
+    "15:" // Oddments: Store: Bit 1: Unset
+    "st1 { v23.s }[0], [x28], #0x4\n"
+    "st1 { v24.s }[0], [x27], #0x4\n"
+    "st1 { v25.s }[0], [x26], #0x4\n"
+    "st1 { v26.s }[0], [x25], #0x4\n"
+    "st1 { v27.s }[0], [x24], #0x4\n"
+    "st1 { v28.s }[0], [x23], #0x4\n"
+    "st1 { v29.s }[0], [x22], #0x4\n"
+    "st1 { v30.s }[0], [x21], #0x4\n"
+    "st1 { v31.s }[0], [x20], #0x4\n"
+    "16:" // Oddments: Store: Bit 1: End
+    "17:" // End
+    : [params] "+&r" (params)
+    : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+    : "cc", "memory", "v0", "v1", "v2", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6c07fa645c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+// Strategy descriptor for the FP32 packed-to-NHWC depthwise kernel with a
+// channel multiplier: a 3x3 kernel window applied at stride 2.  The parent
+// constructor receives the output tile size (3x3, per the kernel's name)
+// followed by the kernel window and stride dimensions.
+struct a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
+{
+  using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
+  // Kernel window size.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  // Step between successive input sampling positions.
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *)
+  : Parent(3, 3, kernel_rows, kernel_cols, stride_rows, stride_cols)
+  {
+  }
+
+  // Fixed-width NEON implementation: no scalable-vector length dependence.
+  arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+  // Pointer to the assembly implementation (defined in the matching generic.cpp).
+  Parent::KernelType kernel = a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..9cafd23fb8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -0,0 +1,533 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v27.4s }, [%x[clamps]]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "lsr x22, %x[channel_multiplier], #0x2\n"
+ "add x20, %x[clamps], #0x4\n"
+ "ldr q0, [x21, #0x0]\n"
+ "ldr q1, [x21, #0x10]\n"
+ "mov x21, #0x0\n"
+ "mov x14, #0x0\n"
+ "ld1r { v26.4s }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q7, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "ldr q10, [x20, #0x0]\n"
+ "ldr q11, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "ldr q12, [x20, #0x0]\n"
+ "ldr q13, [x20, #0x10]\n"
+ "ldp x13, x12, [%x[outptrs], #0x0]\n"
+ "ldp x11, x10, [%x[outptrs], #0x10]\n"
+ "ldp x9, x28, [%x[outptrs], #0x20]\n"
+ "ldp x27, x26, [%x[outptrs], #0x30]\n"
+ "ldr x25, [%x[outptrs], #0x40]\n"
+ "cbz x22, 3f\n"
+ "ldr q14, [%x[params], #0x0]\n"
+ "ldr q31, [%x[params], #0x10]\n"
+ "subs x22, x22, #0x1\n"
+ "mov v15.16b, v14.16b\n"
+ "ldr q30, [%x[params], #0x20]\n"
+ "ldr q29, [%x[params], #0x30]\n"
+ "mov v16.16b, v14.16b\n"
+ "mov v17.16b, v14.16b\n"
+ "mov v18.16b, v14.16b\n"
+ "mov v19.16b, v14.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "mov v20.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "beq 2f\n"
+ "1:" // Output channel complete vector loop
+ "fmla v14.4s, v31.4s, v0.s[0]\n"
+ "fmla v15.4s, v31.4s, v0.s[2]\n"
+ "subs x22, x22, #0x1\n"
+ "add x21, x21, #0x4\n"
+ "fmla v16.4s, v31.4s, v1.s[0]\n"
+ "fmla v17.4s, v31.4s, v4.s[0]\n"
+ "fmla v18.4s, v31.4s, v4.s[2]\n"
+ "fmla v19.4s, v31.4s, v5.s[0]\n"
+ "fmla v20.4s, v31.4s, v8.s[0]\n"
+ "fmla v21.4s, v31.4s, v8.s[2]\n"
+ "fmla v22.4s, v31.4s, v9.s[0]\n"
+ "ldr q25, [%x[params], #0x0]\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fmla v15.4s, v30.4s, v0.s[3]\n"
+ "fmla v16.4s, v30.4s, v1.s[1]\n"
+ "fmla v17.4s, v30.4s, v4.s[1]\n"
+ "fmla v18.4s, v30.4s, v4.s[3]\n"
+ "fmla v19.4s, v30.4s, v5.s[1]\n"
+ "fmla v20.4s, v30.4s, v8.s[1]\n"
+ "fmla v21.4s, v30.4s, v8.s[3]\n"
+ "fmla v22.4s, v30.4s, v9.s[1]\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "fmla v14.4s, v29.4s, v0.s[2]\n"
+ "fmla v15.4s, v29.4s, v1.s[0]\n"
+ "fmla v16.4s, v29.4s, v1.s[2]\n"
+ "fmla v17.4s, v29.4s, v4.s[2]\n"
+ "fmla v18.4s, v29.4s, v5.s[0]\n"
+ "fmla v19.4s, v29.4s, v5.s[2]\n"
+ "fmla v20.4s, v29.4s, v8.s[2]\n"
+ "fmla v21.4s, v29.4s, v9.s[0]\n"
+ "fmla v22.4s, v29.4s, v9.s[2]\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x40]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x50]\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "ldr q31, [%x[params], #0x70]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "ldr q30, [%x[params], #0x80]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "str q14, [x13, x14]\n"
+ "ldr q14, [%x[params], #0x60]\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "ldr q29, [%x[params], #0x90]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "add %x[params], %x[params], #0xa0\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
+ "str q15, [x12, x14]\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
+ "str q16, [x11, x14]\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
+ "str q17, [x10, x14]\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "str q18, [x9, x14]\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "str q19, [x28, x14]\n"
+ "mov v15.16b, v14.16b\n"
+ "str q20, [x27, x14]\n"
+ "mov v16.16b, v14.16b\n"
+ "mov v17.16b, v14.16b\n"
+ "str q21, [x26, x14]\n"
+ "mov v18.16b, v14.16b\n"
+ "mov v19.16b, v14.16b\n"
+ "str q22, [x25, x14]\n"
+ "mov v20.16b, v14.16b\n"
+ "mov v21.16b, v14.16b\n"
+ "add x14, x14, #0x10\n"
+ "mov v22.16b, v14.16b\n"
+ "bgt 1b\n"
+ "2:" // Output channel complete vector tail
+ "fmla v14.4s, v31.4s, v0.s[0]\n"
+ "fmla v15.4s, v31.4s, v0.s[2]\n"
+ "fmla v16.4s, v31.4s, v1.s[0]\n"
+ "fmla v17.4s, v31.4s, v4.s[0]\n"
+ "fmla v18.4s, v31.4s, v4.s[2]\n"
+ "fmla v19.4s, v31.4s, v5.s[0]\n"
+ "fmla v20.4s, v31.4s, v8.s[0]\n"
+ "fmla v21.4s, v31.4s, v8.s[2]\n"
+ "fmla v22.4s, v31.4s, v9.s[0]\n"
+ "ldr q25, [%x[params], #0x0]\n"
+ "fmla v14.4s, v30.4s, v0.s[1]\n"
+ "fmla v15.4s, v30.4s, v0.s[3]\n"
+ "fmla v16.4s, v30.4s, v1.s[1]\n"
+ "fmla v17.4s, v30.4s, v4.s[1]\n"
+ "fmla v18.4s, v30.4s, v4.s[3]\n"
+ "fmla v19.4s, v30.4s, v5.s[1]\n"
+ "fmla v20.4s, v30.4s, v8.s[1]\n"
+ "fmla v21.4s, v30.4s, v8.s[3]\n"
+ "fmla v22.4s, v30.4s, v9.s[1]\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "fmla v14.4s, v29.4s, v0.s[2]\n"
+ "fmla v15.4s, v29.4s, v1.s[0]\n"
+ "fmla v16.4s, v29.4s, v1.s[2]\n"
+ "fmla v17.4s, v29.4s, v4.s[2]\n"
+ "fmla v18.4s, v29.4s, v5.s[0]\n"
+ "fmla v19.4s, v29.4s, v5.s[2]\n"
+ "fmla v20.4s, v29.4s, v8.s[2]\n"
+ "fmla v21.4s, v29.4s, v9.s[0]\n"
+ "fmla v22.4s, v29.4s, v9.s[2]\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x30]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x40]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x60\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
+ "str q14, [x13, x14]\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
+ "str q15, [x12, x14]\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
+ "str q16, [x11, x14]\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "str q17, [x10, x14]\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "str q18, [x9, x14]\n"
+ "str q19, [x28, x14]\n"
+ "str q20, [x27, x14]\n"
+ "str q21, [x26, x14]\n"
+ "str q22, [x25, x14]\n"
+ "add x14, x14, #0x10\n"
+ "3:" // Output channel oddments
+ "tst %x[channel_multiplier], #0x3\n"
+ "beq 6f\n"
+ "ldr q14, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "mov v15.16b, v14.16b\n"
+ "mov v16.16b, v14.16b\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "ldr q23, [%x[params], #0x30]\n"
+ "mov v17.16b, v14.16b\n"
+ "mov v18.16b, v14.16b\n"
+ "mov v19.16b, v14.16b\n"
+ "mov v20.16b, v14.16b\n"
+ "fmla v15.4s, v25.4s, v0.s[2]\n"
+ "mov v21.16b, v14.16b\n"
+ "mov v22.16b, v14.16b\n"
+ "fmla v14.4s, v25.4s, v0.s[0]\n"
+ "fmla v16.4s, v25.4s, v1.s[0]\n"
+ "fmla v17.4s, v25.4s, v4.s[0]\n"
+ "fmla v18.4s, v25.4s, v4.s[2]\n"
+ "fmla v19.4s, v25.4s, v5.s[0]\n"
+ "fmla v20.4s, v25.4s, v8.s[0]\n"
+ "fmla v21.4s, v25.4s, v8.s[2]\n"
+ "fmla v22.4s, v25.4s, v9.s[0]\n"
+ "ldr q25, [%x[params], #0x40]\n"
+ "fmla v14.4s, v24.4s, v0.s[1]\n"
+ "fmla v15.4s, v24.4s, v0.s[3]\n"
+ "fmla v16.4s, v24.4s, v1.s[1]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[3]\n"
+ "fmla v19.4s, v24.4s, v5.s[1]\n"
+ "fmla v20.4s, v24.4s, v8.s[1]\n"
+ "fmla v21.4s, v24.4s, v8.s[3]\n"
+ "fmla v22.4s, v24.4s, v9.s[1]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v14.4s, v23.4s, v0.s[2]\n"
+ "fmla v15.4s, v23.4s, v1.s[0]\n"
+ "fmla v16.4s, v23.4s, v1.s[2]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v5.s[0]\n"
+ "fmla v19.4s, v23.4s, v5.s[2]\n"
+ "fmla v20.4s, v23.4s, v8.s[2]\n"
+ "fmla v21.4s, v23.4s, v9.s[0]\n"
+ "fmla v22.4s, v23.4s, v9.s[2]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v14.4s, v25.4s, v2.s[0]\n"
+ "fmla v15.4s, v25.4s, v2.s[2]\n"
+ "fmla v16.4s, v25.4s, v3.s[0]\n"
+ "fmla v17.4s, v25.4s, v6.s[0]\n"
+ "fmla v18.4s, v25.4s, v6.s[2]\n"
+ "fmla v19.4s, v25.4s, v7.s[0]\n"
+ "fmla v20.4s, v25.4s, v10.s[0]\n"
+ "fmla v21.4s, v25.4s, v10.s[2]\n"
+ "fmla v22.4s, v25.4s, v11.s[0]\n"
+ "ldr q25, [%x[params], #0x70]\n"
+ "fmla v14.4s, v24.4s, v2.s[1]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v3.s[1]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[3]\n"
+ "fmla v19.4s, v24.4s, v7.s[1]\n"
+ "fmla v20.4s, v24.4s, v10.s[1]\n"
+ "fmla v21.4s, v24.4s, v10.s[3]\n"
+ "fmla v22.4s, v24.4s, v11.s[1]\n"
+ "ldr q24, [%x[params], #0x80]\n"
+ "fmla v14.4s, v23.4s, v2.s[2]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v3.s[2]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v7.s[0]\n"
+ "fmla v19.4s, v23.4s, v7.s[2]\n"
+ "fmla v20.4s, v23.4s, v10.s[2]\n"
+ "fmla v21.4s, v23.4s, v11.s[0]\n"
+ "fmla v22.4s, v23.4s, v11.s[2]\n"
+ "ldr q23, [%x[params], #0x90]\n"
+ "add %x[params], %x[params], #0xa0\n"
+ "fmla v14.4s, v25.4s, v4.s[0]\n"
+ "fmla v15.4s, v25.4s, v4.s[2]\n"
+ "fmla v16.4s, v25.4s, v5.s[0]\n"
+ "fmla v17.4s, v25.4s, v8.s[0]\n"
+ "fmla v18.4s, v25.4s, v8.s[2]\n"
+ "fmla v19.4s, v25.4s, v9.s[0]\n"
+ "fmla v20.4s, v25.4s, v12.s[0]\n"
+ "fmla v21.4s, v25.4s, v12.s[2]\n"
+ "fmla v22.4s, v25.4s, v13.s[0]\n"
+ "fmla v14.4s, v24.4s, v4.s[1]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v5.s[1]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[3]\n"
+ "fmla v19.4s, v24.4s, v9.s[1]\n"
+ "fmla v20.4s, v24.4s, v12.s[1]\n"
+ "fmla v21.4s, v24.4s, v12.s[3]\n"
+ "fmla v22.4s, v24.4s, v13.s[1]\n"
+ "fmla v14.4s, v23.4s, v4.s[2]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmin v14.4s, v14.4s, v26.4s\n"
+ "fmla v16.4s, v23.4s, v5.s[2]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmin v15.4s, v15.4s, v26.4s\n"
+ "fmla v18.4s, v23.4s, v9.s[0]\n"
+ "fmla v19.4s, v23.4s, v9.s[2]\n"
+ "fmin v16.4s, v16.4s, v26.4s\n"
+ "fmla v20.4s, v23.4s, v12.s[2]\n"
+ "fmla v21.4s, v23.4s, v13.s[0]\n"
+ "fmin v17.4s, v17.4s, v26.4s\n"
+ "fmla v22.4s, v23.4s, v13.s[2]\n"
+ "fmin v18.4s, v18.4s, v26.4s\n"
+ "fmin v19.4s, v19.4s, v26.4s\n"
+ "fmin v20.4s, v20.4s, v26.4s\n"
+ "fmin v21.4s, v21.4s, v26.4s\n"
+ "fmin v22.4s, v22.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v27.4s\n"
+ "fmax v15.4s, v15.4s, v27.4s\n"
+ "fmax v16.4s, v16.4s, v27.4s\n"
+ "fmax v17.4s, v17.4s, v27.4s\n"
+ "fmax v18.4s, v18.4s, v27.4s\n"
+ "fmax v19.4s, v19.4s, v27.4s\n"
+ "fmax v20.4s, v20.4s, v27.4s\n"
+ "fmax v21.4s, v21.4s, v27.4s\n"
+ "fmax v22.4s, v22.4s, v27.4s\n"
+ "tbz %x[channel_multiplier], #1, 4f\n"
+ "add x20, x13, x14\n"
+ "add x22, x12, x14\n"
+ "st1 { v14.d }[0], [x20]\n"
+ "add x21, x11, x14\n"
+ "add x20, x10, x14\n"
+ "st1 { v15.d }[0], [x22]\n"
+ "add x24, x9, x14\n"
+ "add x23, x28, x14\n"
+ "st1 { v16.d }[0], [x21]\n"
+ "add x22, x27, x14\n"
+ "add x21, x26, x14\n"
+ "st1 { v17.d }[0], [x20]\n"
+ "add x20, x25, x14\n"
+ "st1 { v18.d }[0], [x24]\n"
+ "add x14, x14, #0x8\n"
+ "st1 { v19.d }[0], [x23]\n"
+ "st1 { v20.d }[0], [x22]\n"
+ "st1 { v21.d }[0], [x21]\n"
+ "st1 { v22.d }[0], [x20]\n"
+ "tbz %x[channel_multiplier], #0, 5f\n"
+ "add x20, x13, x14\n"
+ "add x22, x12, x14\n"
+ "st1 { v14.s }[2], [x20]\n"
+ "add x21, x11, x14\n"
+ "add x20, x10, x14\n"
+ "st1 { v15.s }[2], [x22]\n"
+ "add x24, x9, x14\n"
+ "add x23, x28, x14\n"
+ "st1 { v16.s }[2], [x21]\n"
+ "add x22, x27, x14\n"
+ "add x21, x26, x14\n"
+ "st1 { v17.s }[2], [x20]\n"
+ "add x20, x25, x14\n"
+ "st1 { v18.s }[2], [x24]\n"
+ "st1 { v19.s }[2], [x23]\n"
+ "st1 { v20.s }[2], [x22]\n"
+ "st1 { v21.s }[2], [x21]\n"
+ "st1 { v22.s }[2], [x20]\n"
+ "b 5f\n"
+ "4:" // Output channel oddments: Store: Bit 1: Unset
+ "add x20, x13, x14\n"
+ "add x22, x12, x14\n"
+ "st1 { v14.s }[0], [x20]\n"
+ "add x21, x11, x14\n"
+ "add x20, x10, x14\n"
+ "st1 { v15.s }[0], [x22]\n"
+ "add x24, x9, x14\n"
+ "add x23, x28, x14\n"
+ "st1 { v16.s }[0], [x21]\n"
+ "add x22, x27, x14\n"
+ "add x21, x26, x14\n"
+ "st1 { v17.s }[0], [x20]\n"
+ "add x20, x25, x14\n"
+ "st1 { v18.s }[0], [x24]\n"
+ "st1 { v19.s }[0], [x23]\n"
+ "st1 { v20.s }[0], [x22]\n"
+ "st1 { v21.s }[0], [x21]\n"
+ "st1 { v22.s }[0], [x20]\n"
+ "5:" // Output channel oddments: Store: Bit 1: End
+ "6:" // End
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..9f514c78e7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+// Strategy descriptor for the AArch64 FP32 depthwise kernel with a channel
+// multiplier: 5x5 kernel, stride 1, packed input to NHWC output, producing a
+// 2x4 output tile per invocation (see the Parent constructor arguments below).
+struct a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
+{
+  using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
+  // Convolution window dimensions (5x5).
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  // Unit stride in both spatial dimensions.
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // CPUInfo is unused here; this descriptor is fixed for all AArch64 targets.
+  // Parent(2, 4, ...) declares the 2-row by 4-column output tile.
+  a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *)
+  : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+  {
+  }
+
+  // Fixed-width NEON implementation; no scalable-vector (SVE) variant.
+  arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+  // Hand-written assembly implementation declared above in this header.
+  Parent::KernelType kernel = a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c9bb1f41da
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v26.4s }, [%x[clamps]]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "lsr x22, %x[channel_multiplier], #0x2\n"
+ "add x20, %x[clamps], #0x4\n"
+ "ldr q0, [x21, #0x0]\n"
+ "ldr q1, [x21, #0x10]\n"
+ "mov x21, #0x0\n"
+ "mov x13, #0x0\n"
+ "ld1r { v25.4s }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q7, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "ldr q10, [x20, #0x0]\n"
+ "ldr q11, [x20, #0x10]\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
+ "ldp x28, x27, [%x[outptrs], #0x20]\n"
+ "ldp x26, x25, [%x[outptrs], #0x30]\n"
+ "cbz x22, 3f\n"
+ "ldr q12, [%x[params], #0x0]\n"
+ "ldr q31, [%x[params], #0x10]\n"
+ "subs x22, x22, #0x1\n"
+ "mov v13.16b, v12.16b\n"
+ "ldr q30, [%x[params], #0x20]\n"
+ "ldr q29, [%x[params], #0x30]\n"
+ "mov v14.16b, v12.16b\n"
+ "mov v15.16b, v12.16b\n"
+ "ldr q28, [%x[params], #0x40]\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "mov v16.16b, v12.16b\n"
+ "mov v17.16b, v12.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v12.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ "beq 2f\n"
+ "1:" // Output channel complete vector loop
+ "fmla v12.4s, v31.4s, v0.s[0]\n"
+ "fmla v13.4s, v31.4s, v0.s[1]\n"
+ "subs x22, x22, #0x1\n"
+ "add x21, x21, #0x4\n"
+ "fmla v14.4s, v31.4s, v0.s[2]\n"
+ "fmla v15.4s, v31.4s, v0.s[3]\n"
+ "fmla v16.4s, v31.4s, v2.s[0]\n"
+ "fmla v17.4s, v31.4s, v2.s[1]\n"
+ "fmla v18.4s, v31.4s, v2.s[2]\n"
+ "fmla v19.4s, v31.4s, v2.s[3]\n"
+ "ldr q24, [%x[params], #0x0]\n"
+ "fmla v12.4s, v30.4s, v0.s[1]\n"
+ "fmla v13.4s, v30.4s, v0.s[2]\n"
+ "fmla v14.4s, v30.4s, v0.s[3]\n"
+ "fmla v15.4s, v30.4s, v1.s[0]\n"
+ "fmla v16.4s, v30.4s, v2.s[1]\n"
+ "fmla v17.4s, v30.4s, v2.s[2]\n"
+ "fmla v18.4s, v30.4s, v2.s[3]\n"
+ "fmla v19.4s, v30.4s, v3.s[0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v13.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v15.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v17.4s, v29.4s, v2.s[3]\n"
+ "fmla v18.4s, v29.4s, v3.s[0]\n"
+ "fmla v19.4s, v29.4s, v3.s[1]\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "fmla v12.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v14.4s, v28.4s, v1.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v16.4s, v28.4s, v2.s[3]\n"
+ "fmla v17.4s, v28.4s, v3.s[0]\n"
+ "fmla v18.4s, v28.4s, v3.s[1]\n"
+ "fmla v19.4s, v28.4s, v3.s[2]\n"
+ "ldr q21, [%x[params], #0x30]\n"
+ "fmla v12.4s, v27.4s, v1.s[0]\n"
+ "fmla v13.4s, v27.4s, v1.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[2]\n"
+ "fmla v15.4s, v27.4s, v1.s[3]\n"
+ "fmla v16.4s, v27.4s, v3.s[0]\n"
+ "fmla v17.4s, v27.4s, v3.s[1]\n"
+ "fmla v18.4s, v27.4s, v3.s[2]\n"
+ "fmla v19.4s, v27.4s, v3.s[3]\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0x90]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x110]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x120]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x130]\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "ldr q31, [%x[params], #0x150]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "ldr q30, [%x[params], #0x160]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "ldr q29, [%x[params], #0x170]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "ldr q28, [%x[params], #0x180]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "str q12, [x12, x13]\n"
+ "ldr q12, [%x[params], #0x140]\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "ldr q27, [%x[params], #0x190]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "add %x[params], %x[params], #0x1a0\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "str q13, [x11, x13]\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "str q14, [x10, x13]\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "str q15, [x9, x13]\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "str q16, [x28, x13]\n"
+ "str q17, [x27, x13]\n"
+ "mov v13.16b, v12.16b\n"
+ "mov v14.16b, v12.16b\n"
+ "str q18, [x26, x13]\n"
+ "mov v15.16b, v12.16b\n"
+ "mov v16.16b, v12.16b\n"
+ "str q19, [x25, x13]\n"
+ "mov v17.16b, v12.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "add x13, x13, #0x10\n"
+ "mov v19.16b, v12.16b\n"
+ "bgt 1b\n"
+ "2:" // Output channel complete vector tail
+ "fmla v12.4s, v31.4s, v0.s[0]\n"
+ "fmla v13.4s, v31.4s, v0.s[1]\n"
+ "fmla v14.4s, v31.4s, v0.s[2]\n"
+ "fmla v15.4s, v31.4s, v0.s[3]\n"
+ "fmla v16.4s, v31.4s, v2.s[0]\n"
+ "fmla v17.4s, v31.4s, v2.s[1]\n"
+ "fmla v18.4s, v31.4s, v2.s[2]\n"
+ "fmla v19.4s, v31.4s, v2.s[3]\n"
+ "ldr q24, [%x[params], #0x0]\n"
+ "fmla v12.4s, v30.4s, v0.s[1]\n"
+ "fmla v13.4s, v30.4s, v0.s[2]\n"
+ "fmla v14.4s, v30.4s, v0.s[3]\n"
+ "fmla v15.4s, v30.4s, v1.s[0]\n"
+ "fmla v16.4s, v30.4s, v2.s[1]\n"
+ "fmla v17.4s, v30.4s, v2.s[2]\n"
+ "fmla v18.4s, v30.4s, v2.s[3]\n"
+ "fmla v19.4s, v30.4s, v3.s[0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "fmla v12.4s, v29.4s, v0.s[2]\n"
+ "fmla v13.4s, v29.4s, v0.s[3]\n"
+ "fmla v14.4s, v29.4s, v1.s[0]\n"
+ "fmla v15.4s, v29.4s, v1.s[1]\n"
+ "fmla v16.4s, v29.4s, v2.s[2]\n"
+ "fmla v17.4s, v29.4s, v2.s[3]\n"
+ "fmla v18.4s, v29.4s, v3.s[0]\n"
+ "fmla v19.4s, v29.4s, v3.s[1]\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "fmla v12.4s, v28.4s, v0.s[3]\n"
+ "fmla v13.4s, v28.4s, v1.s[0]\n"
+ "fmla v14.4s, v28.4s, v1.s[1]\n"
+ "fmla v15.4s, v28.4s, v1.s[2]\n"
+ "fmla v16.4s, v28.4s, v2.s[3]\n"
+ "fmla v17.4s, v28.4s, v3.s[0]\n"
+ "fmla v18.4s, v28.4s, v3.s[1]\n"
+ "fmla v19.4s, v28.4s, v3.s[2]\n"
+ "ldr q21, [%x[params], #0x30]\n"
+ "fmla v12.4s, v27.4s, v1.s[0]\n"
+ "fmla v13.4s, v27.4s, v1.s[1]\n"
+ "fmla v14.4s, v27.4s, v1.s[2]\n"
+ "fmla v15.4s, v27.4s, v1.s[3]\n"
+ "fmla v16.4s, v27.4s, v3.s[0]\n"
+ "fmla v17.4s, v27.4s, v3.s[1]\n"
+ "fmla v18.4s, v27.4s, v3.s[2]\n"
+ "fmla v19.4s, v27.4s, v3.s[3]\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0x50]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0x60]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0x80]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0x90]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x100]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x110]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x120]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x130]\n"
+ "add %x[params], %x[params], #0x140\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "str q12, [x12, x13]\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "str q13, [x11, x13]\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "str q14, [x10, x13]\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "str q15, [x9, x13]\n"
+ "str q16, [x28, x13]\n"
+ "str q17, [x27, x13]\n"
+ "str q18, [x26, x13]\n"
+ "str q19, [x25, x13]\n"
+ "add x13, x13, #0x10\n"
+ "3:" // Output channel oddments
+ "tst %x[channel_multiplier], #0x3\n"
+ "beq 6f\n"
+ "ldr q12, [%x[params], #0x0]\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "mov v13.16b, v12.16b\n"
+ "mov v14.16b, v12.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v15.16b, v12.16b\n"
+ "mov v16.16b, v12.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v17.16b, v12.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v19.16b, v12.16b\n"
+ "fmla v12.4s, v24.4s, v0.s[0]\n"
+ "fmla v13.4s, v24.4s, v0.s[1]\n"
+ "fmla v14.4s, v24.4s, v0.s[2]\n"
+ "fmla v15.4s, v24.4s, v0.s[3]\n"
+ "fmla v16.4s, v24.4s, v2.s[0]\n"
+ "fmla v17.4s, v24.4s, v2.s[1]\n"
+ "fmla v18.4s, v24.4s, v2.s[2]\n"
+ "fmla v19.4s, v24.4s, v2.s[3]\n"
+ "ldr q24, [%x[params], #0x60]\n"
+ "fmla v12.4s, v23.4s, v0.s[1]\n"
+ "fmla v13.4s, v23.4s, v0.s[2]\n"
+ "fmla v14.4s, v23.4s, v0.s[3]\n"
+ "fmla v15.4s, v23.4s, v1.s[0]\n"
+ "fmla v16.4s, v23.4s, v2.s[1]\n"
+ "fmla v17.4s, v23.4s, v2.s[2]\n"
+ "fmla v18.4s, v23.4s, v2.s[3]\n"
+ "fmla v19.4s, v23.4s, v3.s[0]\n"
+ "ldr q23, [%x[params], #0x70]\n"
+ "fmla v12.4s, v22.4s, v0.s[2]\n"
+ "fmla v13.4s, v22.4s, v0.s[3]\n"
+ "fmla v14.4s, v22.4s, v1.s[0]\n"
+ "fmla v15.4s, v22.4s, v1.s[1]\n"
+ "fmla v16.4s, v22.4s, v2.s[2]\n"
+ "fmla v17.4s, v22.4s, v2.s[3]\n"
+ "fmla v18.4s, v22.4s, v3.s[0]\n"
+ "fmla v19.4s, v22.4s, v3.s[1]\n"
+ "ldr q22, [%x[params], #0x80]\n"
+ "fmla v12.4s, v21.4s, v0.s[3]\n"
+ "fmla v13.4s, v21.4s, v1.s[0]\n"
+ "fmla v14.4s, v21.4s, v1.s[1]\n"
+ "fmla v15.4s, v21.4s, v1.s[2]\n"
+ "fmla v16.4s, v21.4s, v2.s[3]\n"
+ "fmla v17.4s, v21.4s, v3.s[0]\n"
+ "fmla v18.4s, v21.4s, v3.s[1]\n"
+ "fmla v19.4s, v21.4s, v3.s[2]\n"
+ "ldr q21, [%x[params], #0x90]\n"
+ "fmla v12.4s, v20.4s, v1.s[0]\n"
+ "fmla v13.4s, v20.4s, v1.s[1]\n"
+ "fmla v14.4s, v20.4s, v1.s[2]\n"
+ "fmla v15.4s, v20.4s, v1.s[3]\n"
+ "fmla v16.4s, v20.4s, v3.s[0]\n"
+ "fmla v17.4s, v20.4s, v3.s[1]\n"
+ "fmla v18.4s, v20.4s, v3.s[2]\n"
+ "fmla v19.4s, v20.4s, v3.s[3]\n"
+ "ldr q20, [%x[params], #0xa0]\n"
+ "fmla v12.4s, v24.4s, v2.s[0]\n"
+ "fmla v13.4s, v24.4s, v2.s[1]\n"
+ "fmla v14.4s, v24.4s, v2.s[2]\n"
+ "fmla v15.4s, v24.4s, v2.s[3]\n"
+ "fmla v16.4s, v24.4s, v4.s[0]\n"
+ "fmla v17.4s, v24.4s, v4.s[1]\n"
+ "fmla v18.4s, v24.4s, v4.s[2]\n"
+ "fmla v19.4s, v24.4s, v4.s[3]\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "fmla v12.4s, v23.4s, v2.s[1]\n"
+ "fmla v13.4s, v23.4s, v2.s[2]\n"
+ "fmla v14.4s, v23.4s, v2.s[3]\n"
+ "fmla v15.4s, v23.4s, v3.s[0]\n"
+ "fmla v16.4s, v23.4s, v4.s[1]\n"
+ "fmla v17.4s, v23.4s, v4.s[2]\n"
+ "fmla v18.4s, v23.4s, v4.s[3]\n"
+ "fmla v19.4s, v23.4s, v5.s[0]\n"
+ "ldr q23, [%x[params], #0xc0]\n"
+ "fmla v12.4s, v22.4s, v2.s[2]\n"
+ "fmla v13.4s, v22.4s, v2.s[3]\n"
+ "fmla v14.4s, v22.4s, v3.s[0]\n"
+ "fmla v15.4s, v22.4s, v3.s[1]\n"
+ "fmla v16.4s, v22.4s, v4.s[2]\n"
+ "fmla v17.4s, v22.4s, v4.s[3]\n"
+ "fmla v18.4s, v22.4s, v5.s[0]\n"
+ "fmla v19.4s, v22.4s, v5.s[1]\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "fmla v12.4s, v21.4s, v2.s[3]\n"
+ "fmla v13.4s, v21.4s, v3.s[0]\n"
+ "fmla v14.4s, v21.4s, v3.s[1]\n"
+ "fmla v15.4s, v21.4s, v3.s[2]\n"
+ "fmla v16.4s, v21.4s, v4.s[3]\n"
+ "fmla v17.4s, v21.4s, v5.s[0]\n"
+ "fmla v18.4s, v21.4s, v5.s[1]\n"
+ "fmla v19.4s, v21.4s, v5.s[2]\n"
+ "ldr q21, [%x[params], #0xe0]\n"
+ "fmla v12.4s, v20.4s, v3.s[0]\n"
+ "fmla v13.4s, v20.4s, v3.s[1]\n"
+ "fmla v14.4s, v20.4s, v3.s[2]\n"
+ "fmla v15.4s, v20.4s, v3.s[3]\n"
+ "fmla v16.4s, v20.4s, v5.s[0]\n"
+ "fmla v17.4s, v20.4s, v5.s[1]\n"
+ "fmla v18.4s, v20.4s, v5.s[2]\n"
+ "fmla v19.4s, v20.4s, v5.s[3]\n"
+ "ldr q20, [%x[params], #0xf0]\n"
+ "fmla v12.4s, v24.4s, v4.s[0]\n"
+ "fmla v13.4s, v24.4s, v4.s[1]\n"
+ "fmla v14.4s, v24.4s, v4.s[2]\n"
+ "fmla v15.4s, v24.4s, v4.s[3]\n"
+ "fmla v16.4s, v24.4s, v6.s[0]\n"
+ "fmla v17.4s, v24.4s, v6.s[1]\n"
+ "fmla v18.4s, v24.4s, v6.s[2]\n"
+ "fmla v19.4s, v24.4s, v6.s[3]\n"
+ "ldr q24, [%x[params], #0x100]\n"
+ "fmla v12.4s, v23.4s, v4.s[1]\n"
+ "fmla v13.4s, v23.4s, v4.s[2]\n"
+ "fmla v14.4s, v23.4s, v4.s[3]\n"
+ "fmla v15.4s, v23.4s, v5.s[0]\n"
+ "fmla v16.4s, v23.4s, v6.s[1]\n"
+ "fmla v17.4s, v23.4s, v6.s[2]\n"
+ "fmla v18.4s, v23.4s, v6.s[3]\n"
+ "fmla v19.4s, v23.4s, v7.s[0]\n"
+ "ldr q23, [%x[params], #0x110]\n"
+ "fmla v12.4s, v22.4s, v4.s[2]\n"
+ "fmla v13.4s, v22.4s, v4.s[3]\n"
+ "fmla v14.4s, v22.4s, v5.s[0]\n"
+ "fmla v15.4s, v22.4s, v5.s[1]\n"
+ "fmla v16.4s, v22.4s, v6.s[2]\n"
+ "fmla v17.4s, v22.4s, v6.s[3]\n"
+ "fmla v18.4s, v22.4s, v7.s[0]\n"
+ "fmla v19.4s, v22.4s, v7.s[1]\n"
+ "ldr q22, [%x[params], #0x120]\n"
+ "fmla v12.4s, v21.4s, v4.s[3]\n"
+ "fmla v13.4s, v21.4s, v5.s[0]\n"
+ "fmla v14.4s, v21.4s, v5.s[1]\n"
+ "fmla v15.4s, v21.4s, v5.s[2]\n"
+ "fmla v16.4s, v21.4s, v6.s[3]\n"
+ "fmla v17.4s, v21.4s, v7.s[0]\n"
+ "fmla v18.4s, v21.4s, v7.s[1]\n"
+ "fmla v19.4s, v21.4s, v7.s[2]\n"
+ "ldr q21, [%x[params], #0x130]\n"
+ "fmla v12.4s, v20.4s, v5.s[0]\n"
+ "fmla v13.4s, v20.4s, v5.s[1]\n"
+ "fmla v14.4s, v20.4s, v5.s[2]\n"
+ "fmla v15.4s, v20.4s, v5.s[3]\n"
+ "fmla v16.4s, v20.4s, v7.s[0]\n"
+ "fmla v17.4s, v20.4s, v7.s[1]\n"
+ "fmla v18.4s, v20.4s, v7.s[2]\n"
+ "fmla v19.4s, v20.4s, v7.s[3]\n"
+ "ldr q20, [%x[params], #0x140]\n"
+ "fmla v12.4s, v24.4s, v6.s[0]\n"
+ "fmla v13.4s, v24.4s, v6.s[1]\n"
+ "fmla v14.4s, v24.4s, v6.s[2]\n"
+ "fmla v15.4s, v24.4s, v6.s[3]\n"
+ "fmla v16.4s, v24.4s, v8.s[0]\n"
+ "fmla v17.4s, v24.4s, v8.s[1]\n"
+ "fmla v18.4s, v24.4s, v8.s[2]\n"
+ "fmla v19.4s, v24.4s, v8.s[3]\n"
+ "ldr q24, [%x[params], #0x150]\n"
+ "fmla v12.4s, v23.4s, v6.s[1]\n"
+ "fmla v13.4s, v23.4s, v6.s[2]\n"
+ "fmla v14.4s, v23.4s, v6.s[3]\n"
+ "fmla v15.4s, v23.4s, v7.s[0]\n"
+ "fmla v16.4s, v23.4s, v8.s[1]\n"
+ "fmla v17.4s, v23.4s, v8.s[2]\n"
+ "fmla v18.4s, v23.4s, v8.s[3]\n"
+ "fmla v19.4s, v23.4s, v9.s[0]\n"
+ "ldr q23, [%x[params], #0x160]\n"
+ "fmla v12.4s, v22.4s, v6.s[2]\n"
+ "fmla v13.4s, v22.4s, v6.s[3]\n"
+ "fmla v14.4s, v22.4s, v7.s[0]\n"
+ "fmla v15.4s, v22.4s, v7.s[1]\n"
+ "fmla v16.4s, v22.4s, v8.s[2]\n"
+ "fmla v17.4s, v22.4s, v8.s[3]\n"
+ "fmla v18.4s, v22.4s, v9.s[0]\n"
+ "fmla v19.4s, v22.4s, v9.s[1]\n"
+ "ldr q22, [%x[params], #0x170]\n"
+ "fmla v12.4s, v21.4s, v6.s[3]\n"
+ "fmla v13.4s, v21.4s, v7.s[0]\n"
+ "fmla v14.4s, v21.4s, v7.s[1]\n"
+ "fmla v15.4s, v21.4s, v7.s[2]\n"
+ "fmla v16.4s, v21.4s, v8.s[3]\n"
+ "fmla v17.4s, v21.4s, v9.s[0]\n"
+ "fmla v18.4s, v21.4s, v9.s[1]\n"
+ "fmla v19.4s, v21.4s, v9.s[2]\n"
+ "ldr q21, [%x[params], #0x180]\n"
+ "fmla v12.4s, v20.4s, v7.s[0]\n"
+ "fmla v13.4s, v20.4s, v7.s[1]\n"
+ "fmla v14.4s, v20.4s, v7.s[2]\n"
+ "fmla v15.4s, v20.4s, v7.s[3]\n"
+ "fmla v16.4s, v20.4s, v9.s[0]\n"
+ "fmla v17.4s, v20.4s, v9.s[1]\n"
+ "fmla v18.4s, v20.4s, v9.s[2]\n"
+ "fmla v19.4s, v20.4s, v9.s[3]\n"
+ "ldr q20, [%x[params], #0x190]\n"
+ "add %x[params], %x[params], #0x1a0\n"
+ "fmla v12.4s, v24.4s, v8.s[0]\n"
+ "fmla v13.4s, v24.4s, v8.s[1]\n"
+ "fmla v14.4s, v24.4s, v8.s[2]\n"
+ "fmla v15.4s, v24.4s, v8.s[3]\n"
+ "fmla v16.4s, v24.4s, v10.s[0]\n"
+ "fmla v17.4s, v24.4s, v10.s[1]\n"
+ "fmla v18.4s, v24.4s, v10.s[2]\n"
+ "fmla v19.4s, v24.4s, v10.s[3]\n"
+ "fmla v12.4s, v23.4s, v8.s[1]\n"
+ "fmla v13.4s, v23.4s, v8.s[2]\n"
+ "fmla v14.4s, v23.4s, v8.s[3]\n"
+ "fmla v15.4s, v23.4s, v9.s[0]\n"
+ "fmla v16.4s, v23.4s, v10.s[1]\n"
+ "fmla v17.4s, v23.4s, v10.s[2]\n"
+ "fmla v18.4s, v23.4s, v10.s[3]\n"
+ "fmla v19.4s, v23.4s, v11.s[0]\n"
+ "fmla v12.4s, v22.4s, v8.s[2]\n"
+ "fmla v13.4s, v22.4s, v8.s[3]\n"
+ "fmla v14.4s, v22.4s, v9.s[0]\n"
+ "fmla v15.4s, v22.4s, v9.s[1]\n"
+ "fmla v16.4s, v22.4s, v10.s[2]\n"
+ "fmla v17.4s, v22.4s, v10.s[3]\n"
+ "fmla v18.4s, v22.4s, v11.s[0]\n"
+ "fmla v19.4s, v22.4s, v11.s[1]\n"
+ "fmla v12.4s, v21.4s, v8.s[3]\n"
+ "fmla v13.4s, v21.4s, v9.s[0]\n"
+ "fmla v14.4s, v21.4s, v9.s[1]\n"
+ "fmla v15.4s, v21.4s, v9.s[2]\n"
+ "fmla v16.4s, v21.4s, v10.s[3]\n"
+ "fmla v17.4s, v21.4s, v11.s[0]\n"
+ "fmla v18.4s, v21.4s, v11.s[1]\n"
+ "fmla v19.4s, v21.4s, v11.s[2]\n"
+ "fmla v12.4s, v20.4s, v9.s[0]\n"
+ "fmla v13.4s, v20.4s, v9.s[1]\n"
+ "fmin v12.4s, v12.4s, v25.4s\n"
+ "fmla v14.4s, v20.4s, v9.s[2]\n"
+ "fmla v15.4s, v20.4s, v9.s[3]\n"
+ "fmin v13.4s, v13.4s, v25.4s\n"
+ "fmla v16.4s, v20.4s, v11.s[0]\n"
+ "fmla v17.4s, v20.4s, v11.s[1]\n"
+ "fmin v14.4s, v14.4s, v25.4s\n"
+ "fmla v18.4s, v20.4s, v11.s[2]\n"
+ "fmla v19.4s, v20.4s, v11.s[3]\n"
+ "fmin v15.4s, v15.4s, v25.4s\n"
+ "fmin v16.4s, v16.4s, v25.4s\n"
+ "fmin v17.4s, v17.4s, v25.4s\n"
+ "fmin v18.4s, v18.4s, v25.4s\n"
+ "fmin v19.4s, v19.4s, v25.4s\n"
+ "fmax v12.4s, v12.4s, v26.4s\n"
+ "fmax v13.4s, v13.4s, v26.4s\n"
+ "fmax v14.4s, v14.4s, v26.4s\n"
+ "fmax v15.4s, v15.4s, v26.4s\n"
+ "fmax v16.4s, v16.4s, v26.4s\n"
+ "fmax v17.4s, v17.4s, v26.4s\n"
+ "fmax v18.4s, v18.4s, v26.4s\n"
+ "fmax v19.4s, v19.4s, v26.4s\n"
+ "tbz %x[channel_multiplier], #1, 4f\n"
+ "add x20, x12, x13\n"
+ "add x21, x11, x13\n"
+ "st1 { v12.d }[0], [x20]\n"
+ "add x20, x10, x13\n"
+ "add x24, x9, x13\n"
+ "st1 { v13.d }[0], [x21]\n"
+ "add x23, x28, x13\n"
+ "add x22, x27, x13\n"
+ "st1 { v14.d }[0], [x20]\n"
+ "add x21, x26, x13\n"
+ "add x20, x25, x13\n"
+ "st1 { v15.d }[0], [x24]\n"
+ "st1 { v16.d }[0], [x23]\n"
+ "add x13, x13, #0x8\n"
+ "st1 { v17.d }[0], [x22]\n"
+ "st1 { v18.d }[0], [x21]\n"
+ "st1 { v19.d }[0], [x20]\n"
+ "tbz %x[channel_multiplier], #0, 5f\n"
+ "add x20, x12, x13\n"
+ "add x21, x11, x13\n"
+ "st1 { v12.s }[2], [x20]\n"
+ "add x20, x10, x13\n"
+ "add x24, x9, x13\n"
+ "st1 { v13.s }[2], [x21]\n"
+ "add x23, x28, x13\n"
+ "add x22, x27, x13\n"
+ "st1 { v14.s }[2], [x20]\n"
+ "add x21, x26, x13\n"
+ "add x20, x25, x13\n"
+ "st1 { v15.s }[2], [x24]\n"
+ "st1 { v16.s }[2], [x23]\n"
+ "st1 { v17.s }[2], [x22]\n"
+ "st1 { v18.s }[2], [x21]\n"
+ "st1 { v19.s }[2], [x20]\n"
+ "b 5f\n"
+ "4:" // Output channel oddments: Store: Bit 1: Unset
+ "add x20, x12, x13\n"
+ "add x21, x11, x13\n"
+ "st1 { v12.s }[0], [x20]\n"
+ "add x20, x10, x13\n"
+ "add x24, x9, x13\n"
+ "st1 { v13.s }[0], [x21]\n"
+ "add x23, x28, x13\n"
+ "add x22, x27, x13\n"
+ "st1 { v14.s }[0], [x20]\n"
+ "add x21, x26, x13\n"
+ "add x20, x25, x13\n"
+ "st1 { v15.s }[0], [x24]\n"
+ "st1 { v16.s }[0], [x23]\n"
+ "st1 { v17.s }[0], [x22]\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "st1 { v19.s }[0], [x20]\n"
+ "5:" // Output channel oddments: Store: Bit 1: End
+ "6:" // End
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..3bece73973
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+// Strategy wrapper that binds the hand-written AArch64 FP32 depthwise kernel
+// (the _impl function declared above) to the generic depthfirst-multiplier
+// driver interface.
+struct a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>
+{
+  using Parent = GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>;
+  // (2, 8) presumably describes the kernel's 2x8 output tile, matching the
+  // "output2x8" in its name -- confirm against the Parent constructor.
+  // VLType::None: fixed-width NEON code, no scalable (SVE) vector length.
+  // The CPUInfo argument is accepted for interface uniformity and unused here.
+  a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+    : Parent(2, 8, arm_gemm::VLType::None)
+  {
+  }
+  // Function pointer to the assembly implementation; exposed via get_kernel().
+  Parent::KernelType kernel = a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..cc18dd4bb4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,850 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const float *weights,
+ const float *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v12.4s }, [%x[minmax_vals]]\n"
+ "lsr x11, %x[n_output_channels], #0x2\n"
+ "add x20, %x[minmax_vals], #0x4\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "mov x10, #0x0\n"
+ "cbz x11, 8f\n"
+ "1:" // Output channel loop
+ "movi v31.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x10, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "ldr q10, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x23, 6f\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q8, [x21, #0x0]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "beq 4f\n"
+ "3:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "ldr q8, [x21, #0x0]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr q9, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 3b\n"
+ "4:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 5f\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "lsl x28, x10, #0x2\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "b 7f\n"
+ "5:" // Output channel loop: Odd tail
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "ldp x20, x9, [x22], #0x10\n"
+ "lsl x28, x10, #0x2\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q4, [x20, #0x0]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q2, [x9, #0x0]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q1, [%x[weights], #0x0]\n"
+ "ldr q0, [x9, #0x10]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmla v16.4s, v1.4s, v4.s[0]\n"
+ "fmla v17.4s, v1.4s, v4.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmla v18.4s, v1.4s, v4.s[2]\n"
+ "fmla v19.4s, v1.4s, v4.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmla v20.4s, v1.4s, v3.s[0]\n"
+ "fmla v21.4s, v1.4s, v3.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmla v22.4s, v1.4s, v3.s[2]\n"
+ "fmla v23.4s, v1.4s, v3.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmla v24.4s, v1.4s, v2.s[0]\n"
+ "fmla v25.4s, v1.4s, v2.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmla v26.4s, v1.4s, v2.s[2]\n"
+ "fmla v27.4s, v1.4s, v2.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmla v28.4s, v1.4s, v0.s[0]\n"
+ "fmla v29.4s, v1.4s, v0.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmla v30.4s, v1.4s, v0.s[2]\n"
+ "fmla v31.4s, v1.4s, v0.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "lsl x28, x10, #0x2\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "str q16, [x27, x28]\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "str q17, [x26, x28]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "str q18, [x25, x28]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "str q19, [x24, x28]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "str q20, [x23, x28]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "str q21, [x22, x28]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "str q22, [x21, x28]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "str q23, [x20, x28]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "str q24, [x27, x28]\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "str q25, [x26, x28]\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "str q26, [x25, x28]\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "str q27, [x24, x28]\n"
+ "str q28, [x23, x28]\n"
+ "str q29, [x22, x28]\n"
+ "str q30, [x21, x28]\n"
+ "str q31, [x20, x28]\n"
+ "7:" // Output channel loop: Done
+ "add x10, x10, #0x4\n"
+ "cmp x10, x11, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 19f\n"
+ "8:" // Output channel oddments
+ "movi v31.16b, #0x0\n"
+ "cbz %x[bias], 11f\n"
+ "add x20, %x[bias], x10, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 9f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 10f\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 10f\n"
+ "9:" // Output channel oddments: Load bias: Bit 1: Unset
+ "ld1 { v31.s }[0], [x20]\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: End
+ "11:" // Output channel oddments: Load bias: Done
+ "ldr q10, [%x[weights], #0x0]\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr q3, [x21, #0x0]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "ldr q1, [x20, #0x0]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz x23, 15f\n"
+ "ldr q9, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "ldr q8, [x21, #0x0]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "beq 13f\n"
+ "12:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "subs x23, x23, #0x1\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q3, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q2, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q1, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q0, [x20, #0x10]\n"
+ "ldr q10, [%x[weights], #0x0]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "ldr q8, [x21, #0x0]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "ldr q7, [x21, #0x10]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "ldr q6, [x20, #0x0]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "ldr q5, [x20, #0x10]\n"
+ "ldr q9, [%x[weights], #0x10]\n"
+ "add %x[weights], %x[weights], #0x20\n"
+ "bgt 12b\n"
+ "13:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 14f\n"
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "b 16f\n"
+ "14:" // Output channel oddments: Odd tail
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "ldr q4, [x21, #0x0]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "ldr q3, [x21, #0x10]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "ldr q2, [x20, #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "ldr q1, [x20, #0x10]\n"
+ "ldr q0, [%x[weights], #0x0]\n"
+ "fmla v16.4s, v9.4s, v8.s[0]\n"
+ "fmla v17.4s, v9.4s, v8.s[1]\n"
+ "add %x[weights], %x[weights], #0x10\n"
+ "fmla v18.4s, v9.4s, v8.s[2]\n"
+ "fmla v19.4s, v9.4s, v8.s[3]\n"
+ "fmla v20.4s, v9.4s, v7.s[0]\n"
+ "fmla v21.4s, v9.4s, v7.s[1]\n"
+ "fmla v22.4s, v9.4s, v7.s[2]\n"
+ "fmla v23.4s, v9.4s, v7.s[3]\n"
+ "fmla v24.4s, v9.4s, v6.s[0]\n"
+ "fmla v25.4s, v9.4s, v6.s[1]\n"
+ "fmla v26.4s, v9.4s, v6.s[2]\n"
+ "fmla v27.4s, v9.4s, v6.s[3]\n"
+ "fmla v28.4s, v9.4s, v5.s[0]\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "fmla v30.4s, v9.4s, v5.s[2]\n"
+ "fmla v31.4s, v9.4s, v5.s[3]\n"
+ "fmla v16.4s, v0.4s, v4.s[0]\n"
+ "fmla v17.4s, v0.4s, v4.s[1]\n"
+ "fmla v18.4s, v0.4s, v4.s[2]\n"
+ "fmla v19.4s, v0.4s, v4.s[3]\n"
+ "fmla v20.4s, v0.4s, v3.s[0]\n"
+ "fmla v21.4s, v0.4s, v3.s[1]\n"
+ "fmla v22.4s, v0.4s, v3.s[2]\n"
+ "fmla v23.4s, v0.4s, v3.s[3]\n"
+ "fmla v24.4s, v0.4s, v2.s[0]\n"
+ "fmla v25.4s, v0.4s, v2.s[1]\n"
+ "fmla v26.4s, v0.4s, v2.s[2]\n"
+ "fmla v27.4s, v0.4s, v2.s[3]\n"
+ "fmla v28.4s, v0.4s, v1.s[0]\n"
+ "fmla v29.4s, v0.4s, v1.s[1]\n"
+ "fmla v30.4s, v0.4s, v1.s[2]\n"
+ "fmla v31.4s, v0.4s, v1.s[3]\n"
+ "b 16f\n"
+ "15:" // Output channel oddments: Single kernel point
+ "fmla v16.4s, v10.4s, v3.s[0]\n"
+ "fmla v17.4s, v10.4s, v3.s[1]\n"
+ "fmla v18.4s, v10.4s, v3.s[2]\n"
+ "fmla v19.4s, v10.4s, v3.s[3]\n"
+ "fmla v20.4s, v10.4s, v2.s[0]\n"
+ "fmla v21.4s, v10.4s, v2.s[1]\n"
+ "fmla v22.4s, v10.4s, v2.s[2]\n"
+ "fmla v23.4s, v10.4s, v2.s[3]\n"
+ "fmla v24.4s, v10.4s, v1.s[0]\n"
+ "fmla v25.4s, v10.4s, v1.s[1]\n"
+ "fmla v26.4s, v10.4s, v1.s[2]\n"
+ "fmla v27.4s, v10.4s, v1.s[3]\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "fmla v29.4s, v10.4s, v0.s[1]\n"
+ "fmla v30.4s, v10.4s, v0.s[2]\n"
+ "fmla v31.4s, v10.4s, v0.s[3]\n"
+ "16:" // Output channel oddments: Done
+ "fmin v16.4s, v16.4s, v11.4s\n"
+ "fmin v17.4s, v17.4s, v11.4s\n"
+ "fmin v18.4s, v18.4s, v11.4s\n"
+ "fmin v19.4s, v19.4s, v11.4s\n"
+ "fmin v20.4s, v20.4s, v11.4s\n"
+ "fmin v21.4s, v21.4s, v11.4s\n"
+ "fmin v22.4s, v22.4s, v11.4s\n"
+ "fmin v23.4s, v23.4s, v11.4s\n"
+ "fmin v24.4s, v24.4s, v11.4s\n"
+ "fmin v25.4s, v25.4s, v11.4s\n"
+ "fmin v26.4s, v26.4s, v11.4s\n"
+ "fmin v27.4s, v27.4s, v11.4s\n"
+ "fmin v28.4s, v28.4s, v11.4s\n"
+ "fmin v29.4s, v29.4s, v11.4s\n"
+ "fmin v30.4s, v30.4s, v11.4s\n"
+ "fmin v31.4s, v31.4s, v11.4s\n"
+ "fmax v16.4s, v16.4s, v12.4s\n"
+ "fmax v17.4s, v17.4s, v12.4s\n"
+ "fmax v18.4s, v18.4s, v12.4s\n"
+ "fmax v19.4s, v19.4s, v12.4s\n"
+ "fmax v20.4s, v20.4s, v12.4s\n"
+ "fmax v21.4s, v21.4s, v12.4s\n"
+ "fmax v22.4s, v22.4s, v12.4s\n"
+ "fmax v23.4s, v23.4s, v12.4s\n"
+ "fmax v24.4s, v24.4s, v12.4s\n"
+ "fmax v25.4s, v25.4s, v12.4s\n"
+ "fmax v26.4s, v26.4s, v12.4s\n"
+ "fmax v27.4s, v27.4s, v12.4s\n"
+ "fmax v28.4s, v28.4s, v12.4s\n"
+ "fmax v29.4s, v29.4s, v12.4s\n"
+ "fmax v30.4s, v30.4s, v12.4s\n"
+ "fmax v31.4s, v31.4s, v12.4s\n"
+ "tbz %x[n_output_channels], #1, 17f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.d }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v17.d }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.d }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.d }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.d }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.d }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.d }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.d }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "add x10, x10, #0x2\n"
+ "st1 { v24.d }[0], [x27]\n"
+ "st1 { v25.d }[0], [x26]\n"
+ "st1 { v26.d }[0], [x25]\n"
+ "st1 { v27.d }[0], [x24]\n"
+ "st1 { v28.d }[0], [x23]\n"
+ "st1 { v29.d }[0], [x22]\n"
+ "st1 { v30.d }[0], [x21]\n"
+ "st1 { v31.d }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 18f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v17.s }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.s }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.s }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.s }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.s }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.s }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.s }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v24.s }[2], [x27]\n"
+ "st1 { v25.s }[2], [x26]\n"
+ "st1 { v26.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x24]\n"
+ "st1 { v28.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x22]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "st1 { v31.s }[2], [x20]\n"
+ "b 18f\n"
+ "17:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "add x26, x26, x10, LSL #2\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "add x24, x24, x10, LSL #2\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "add x22, x22, x10, LSL #2\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v16.s }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x10, LSL #2\n"
+ "st1 { v17.s }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x10, LSL #2\n"
+ "st1 { v18.s }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x10, LSL #2\n"
+ "st1 { v19.s }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x10, LSL #2\n"
+ "st1 { v20.s }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x10, LSL #2\n"
+ "st1 { v21.s }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x10, LSL #2\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x10, LSL #2\n"
+ "st1 { v23.s }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x10, LSL #2\n"
+ "st1 { v24.s }[0], [x27]\n"
+ "st1 { v25.s }[0], [x26]\n"
+ "st1 { v26.s }[0], [x25]\n"
+ "st1 { v27.s }[0], [x24]\n"
+ "st1 { v28.s }[0], [x23]\n"
+ "st1 { v29.s }[0], [x22]\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "st1 { v31.s }[0], [x20]\n"
+ "18:" // Output channel oddments: Done: Store: Bit 1: End
+ "19:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..e51031ccdb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int, const int8_t *const *const, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *const);
+
+// Strategy wrapper binding the hand-written AArch64 kernel
+// a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl (declared above) into the
+// depthfirst depthwise framework for signed 8-bit quantized (s8q) NHWC data.
+class a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+  public:
+  // 3x3 kernel applied with unit stride, as encoded in the class name.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Parent(2, 2, 3, 3, 1, 1): 2x2 output tile, 3x3 kernel, 1x1 stride.
+  // The CPUInfo argument is unused; kernel selection happens elsewhere.
+  a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+  // VLType::None — this kernel uses fixed-width (non-scalable) vectors.
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  Parent::KernelType kernel = a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  // Size (in bytes) of the buffer needed to hold the repacked weights/biases,
+  // as computed by the matching interleave routine.
+  size_t get_storage_size(const DepthwiseArgs &args) const override
+  {
+    return interleave_a64_s8q_3x3_dot::get_packed_size(args);
+  }
+
+  // Repack weights and biases into the layout the assembly kernel expects.
+  // @param args          Depthwise parameters; total channels is
+  //                      input_channels * channel_multiplier.
+  // @param buffer        Destination buffer of at least get_storage_size(args) bytes.
+  // @param biases        Optional int32 bias array (passed through to the packer).
+  // @param qp            Requantization parameters for the s8q kernel.
+  // @param weights       Source weights, int8, with the given column/row strides.
+  // @param ld_weight_col Stride (bytes) between weight columns.
+  // @param ld_weight_row Stride (bytes) between weight rows.
+  void pack_parameters(
+    const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+    const void *weights, size_t ld_weight_col, size_t ld_weight_row
+  ) const override
+  {
+    interleave_a64_s8q_3x3_dot::pack_parameters(
+      args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+      reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+    );
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..916c8a4afe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1658 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x1\n"
+ "orr x20, x20, #0x100\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "orr x20, x20, #0x10000\n"
+ "lsr x11, %x[n_channels], #0x4\n"
+ "dup v12.4s, w20\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
+ "cbz x11, 3f\n"
+ "ldr q15, [x15, x28]\n"
+ "ldr q28, [x14, x28]\n"
+ "subs x11, x11, #0x1\n"
+ "ldr q30, [x13, x28]\n"
+ "ldr q8, [x12, x28]\n"
+ "zip2 v19.16b, v15.16b, v30.16b\n"
+ "zip1 v15.16b, v15.16b, v30.16b\n"
+ "ldr q26, [x10, x28]\n"
+ "ldr q0, [x9, x28]\n"
+ "zip1 v7.16b, v28.16b, v8.16b\n"
+ "zip2 v8.16b, v28.16b, v8.16b\n"
+ "ldr q29, [x26, x28]\n"
+ "ldr q10, [x21, x28]\n"
+ "zip2 v25.16b, v15.16b, v7.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip1 v7.16b, v19.16b, v8.16b\n"
+ "zip2 v8.16b, v19.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x30]\n"
+ "zip2 v21.16b, v26.16b, v29.16b\n"
+ "zip1 v26.16b, v26.16b, v29.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "zip1 v27.16b, v0.16b, v10.16b\n"
+ "zip2 v10.16b, v0.16b, v10.16b\n"
+ "ldr q17, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v23.16b, v26.16b, v27.16b\n"
+ "zip1 v26.16b, v26.16b, v27.16b\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "zip2 v28.16b, v22.16b, v9.16b\n"
+ "zip1 v22.16b, v22.16b, v9.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "zip1 v24.16b, v17.16b, v5.16b\n"
+ "zip2 v5.16b, v17.16b, v5.16b\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v3.16b, v21.16b, v10.16b\n"
+ "zip2 v10.16b, v21.16b, v10.16b\n"
+ "ldr q4, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "zip2 v17.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v4.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v19.16b, v22.16b, v24.16b\n"
+ "zip1 v22.16b, v22.16b, v24.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip2 v24.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v2.16b, v17.16b, v9.16b\n"
+ "zip2 v9.16b, v17.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4e9a9595 // sdot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e8f943f // sdot v31.4s, v1.16b, v15.16b\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4e969595 // sdot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9a943d // sdot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x4e9b9591 // sdot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9595 // sdot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9592 // sdot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96969f // sdot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f943e // sdot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e9a943c // sdot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x4e969592 // sdot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b969d // sdot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q26, [%x[params], #0x10]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x4e9b9595 // sdot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9592 // sdot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e96969e // sdot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v15.16b, v31.16b, v26.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v15.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v21.16b, v29.16b, v26.16b\n"
+ "and v17.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e979596 // sdot v22.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x4e939596 // sdot v22.4s, v12.16b, v19.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v6.16b, v22.16b\n .inst 0x4e989586 // sdot v6.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v30.16b, v26.16b\n"
+ ".inst 0x4e999596 // sdot v22.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v29.16b, v26.16b\n"
+ "mov v21.16b, v26.16b\n"
+ ".inst 0x4e9995fa // sdot v26.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e9795fd // sdot v29.4s, v15.16b, v23.16b\n"
+ ".inst 0x4e97965a // sdot v26.4s, v18.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x4e9995fe // sdot v30.4s, v15.16b, v25.16b\n"
+ ".inst 0x4e9795f5 // sdot v21.4s, v15.16b, v23.16b\n"
+ ".inst 0x4e97959c // sdot v28.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e93965d // sdot v29.4s, v18.16b, v19.16b\n"
+ ".inst 0x4e93977a // sdot v26.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97965e // sdot v30.4s, v18.16b, v23.16b\n"
+ "ldr q4, [x9, x28]\n"
+ ".inst 0x4e939655 // sdot v21.4s, v18.16b, v19.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e93959c // sdot v28.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e98977d // sdot v29.4s, v27.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e93977e // sdot v30.4s, v27.16b, v19.16b\n"
+ ".inst 0x4e989775 // sdot v21.4s, v27.16b, v24.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "mov v17.16b, v28.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e99959c // sdot v28.4s, v12.16b, v25.16b\n"
+ "ldr q31, [x14, x28]\n"
+ "mls v30.4s, v28.4s, v16.4s\n"
+ "mls v29.4s, v6.4s, v16.4s\n"
+ "mls v21.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v30.16b, v20.16b\n"
+ "and v6.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v21.16b, v20.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0xa0]\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x90]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v21.4s, v21.4s, v20.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e839596 // sdot v22.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809596 // sdot v22.4s, v12.16b, v0.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x80]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "mov v18.16b, v22.16b\n .inst 0x4e829592 // sdot v18.4s, v12.16b, v2.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ ".inst 0x4e879596 // sdot v22.4s, v12.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v6.16b, v26.16b\n"
+ "str s21, [x22, x27]\n"
+ "mov v25.16b, v26.16b\n"
+ "mov v20.16b, v26.16b\n"
+ ".inst 0x4e8795fa // sdot v26.4s, v15.16b, v7.16b\n"
+ ".inst 0x4e8395f9 // sdot v25.4s, v15.16b, v3.16b\n"
+ ".inst 0x4e83979a // sdot v26.4s, v28.16b, v3.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4e8795e6 // sdot v6.4s, v15.16b, v7.16b\n"
+ ".inst 0x4e8395f4 // sdot v20.4s, v15.16b, v3.16b\n"
+ ".inst 0x4e839597 // sdot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809799 // sdot v25.4s, v28.16b, v0.16b\n"
+ ".inst 0x4e80971a // sdot v26.4s, v24.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e839786 // sdot v6.4s, v28.16b, v3.16b\n"
+ "ldr q19, [x26, x28]\n"
+ ".inst 0x4e809794 // sdot v20.4s, v28.16b, v0.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e809597 // sdot v23.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e829719 // sdot v25.4s, v24.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e809706 // sdot v6.4s, v24.16b, v0.16b\n"
+ ".inst 0x4e829714 // sdot v20.4s, v24.16b, v2.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+ "mov v17.16b, v23.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879597 // sdot v23.4s, v12.16b, v7.16b\n"
+ "ldr q21, [x13, x28]\n"
+ "mls v6.4s, v23.4s, v16.4s\n"
+ "mls v25.4s, v18.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v6.4s, v6.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q15, [%x[params], #0x120]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v6.16b, v1.16b\n"
+ "and v22.16b, v25.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "ldr q30, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v22.4s\n"
+ "ldr q27, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v6.4s, v6.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v6.4s, v6.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v6.4s, v6.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v6.4s, v6.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v0.4s, #0x0\n"
+ ".inst 0x4e8a9580 // sdot v0.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e859580 // sdot v0.4s, v12.16b, v5.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q28, [%x[params], #0xe0]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v0.16b\n .inst 0x4e899596 // sdot v22.4s, v12.16b, v9.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s6, [x24, x27]\n"
+ ".inst 0x4e889580 // sdot v0.4s, v12.16b, v8.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s25, [x23, x27]\n"
+ "mov v29.16b, v28.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v25.16b, v28.16b\n"
+ "mov v7.16b, v28.16b\n"
+ ".inst 0x4e88971c // sdot v28.4s, v24.16b, v8.16b\n"
+ ".inst 0x4e8a9719 // sdot v25.4s, v24.16b, v10.16b\n"
+ ".inst 0x4e8a97dc // sdot v28.4s, v30.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e88971d // sdot v29.4s, v24.16b, v8.16b\n"
+ ".inst 0x4e8a9707 // sdot v7.4s, v24.16b, v10.16b\n"
+ ".inst 0x4e8a9591 // sdot v17.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8597d9 // sdot v25.4s, v30.16b, v5.16b\n"
+ ".inst 0x4e85977c // sdot v28.4s, v27.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a97dd // sdot v29.4s, v30.16b, v10.16b\n"
+ "ldr q10, [x21, x28]\n"
+ ".inst 0x4e8597c7 // sdot v7.4s, v30.16b, v5.16b\n"
+ "mls v28.4s, v0.4s, v16.4s\n"
+ ".inst 0x4e859591 // sdot v17.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e899779 // sdot v25.4s, v27.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e85977d // sdot v29.4s, v27.16b, v5.16b\n"
+ ".inst 0x4e899767 // sdot v7.4s, v27.16b, v9.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v15.4s\n"
+ "mov v18.16b, v17.16b\n .inst 0x4e899592 // sdot v18.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889591 // sdot v17.4s, v12.16b, v8.16b\n"
+ "ldr q8, [x12, x28]\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v25.4s, v22.4s, v16.4s\n"
+ "mls v7.4s, v18.4s, v16.4s\n"
+ "and v17.16b, v28.16b, v23.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v15.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "ldr q15, [x15, x28]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "ldr q3, [x20, x28]\n"
+ "and v24.16b, v29.16b, v23.16b\n"
+ "and v20.16b, v25.16b, v23.16b\n"
+ "and v17.16b, v7.16b, v23.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q2, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "sqadd v29.4s, v29.4s, v24.4s\n"
+ "ldr q6, [%x[params], #0x160]\n"
+ "sqadd v25.4s, v25.4s, v20.4s\n"
+ "ldr q20, [%x[params], #0x170]\n"
+ "sqadd v7.4s, v7.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x150]\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v23.4s\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "srshl v7.4s, v7.4s, v23.4s\n"
+ "ldr q26, [x10, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v7.4s, v7.4s, v14.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v7.4s, v7.4s, v13.4s\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "smin v7.4s, v7.4s, v11.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s28, [x25, x27]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "zip2 v17.16b, v15.16b, v21.16b\n"
+ "zip1 v15.16b, v15.16b, v21.16b\n"
+ "zip1 v18.16b, v31.16b, v8.16b\n"
+ "zip2 v8.16b, v31.16b, v8.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s29, [x24, x27]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str s25, [x23, x27]\n"
+ "zip2 v25.16b, v15.16b, v18.16b\n"
+ "str s7, [x22, x27]\n"
+ "zip1 v15.16b, v15.16b, v18.16b\n"
+ "zip1 v7.16b, v17.16b, v8.16b\n"
+ "add x27, x27, #0x4\n"
+ "zip2 v8.16b, v17.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "zip2 v29.16b, v26.16b, v19.16b\n"
+ "add %x[params], %x[params], #0x180\n"
+ "zip1 v26.16b, v26.16b, v19.16b\n"
+ "zip1 v28.16b, v4.16b, v10.16b\n"
+ "zip2 v10.16b, v4.16b, v10.16b\n"
+ "zip2 v24.16b, v22.16b, v2.16b\n"
+ "zip1 v22.16b, v22.16b, v2.16b\n"
+ "zip1 v21.16b, v3.16b, v5.16b\n"
+ "zip2 v5.16b, v3.16b, v5.16b\n"
+ "zip2 v18.16b, v27.16b, v23.16b\n"
+ "zip1 v27.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v30.16b, v9.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "zip2 v23.16b, v26.16b, v28.16b\n"
+ "zip1 v26.16b, v26.16b, v28.16b\n"
+ "zip1 v3.16b, v29.16b, v10.16b\n"
+ "zip2 v10.16b, v29.16b, v10.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v0.16b, v24.16b, v5.16b\n"
+ "zip2 v5.16b, v24.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v17.16b\n"
+ "zip1 v27.16b, v27.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "bgt 1b\n"
+ "2:" // Detached iteration
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4e9a9595 // sdot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e8f943f // sdot v31.4s, v1.16b, v15.16b\n"
+ "tst %x[n_channels], #0xf\n"
+ ".inst 0x4e969595 // sdot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9a943d // sdot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x4e9b9591 // sdot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9595 // sdot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9592 // sdot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96969f // sdot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f943e // sdot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e9a943c // sdot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x4e969592 // sdot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b969d // sdot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x4e9b9595 // sdot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9592 // sdot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e96969e // sdot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x4e9b969c // sdot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v27.16b, v31.16b, v4.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v27.4s\n"
+ "and v20.16b, v30.16b, v4.16b\n"
+ "and v18.16b, v29.16b, v4.16b\n"
+ "and v17.16b, v28.16b, v4.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "ldr q27, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x4e979581 // sdot v1.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x4e939581 // sdot v1.4s, v12.16b, v19.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v22.16b, v1.16b\n .inst 0x4e989596 // sdot v22.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v29.16b, v31.16b\n"
+ ".inst 0x4e999581 // sdot v1.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x4e9994df // sdot v31.4s, v6.16b, v25.16b\n"
+ ".inst 0x4e9794d5 // sdot v21.4s, v6.16b, v23.16b\n"
+ ".inst 0x4e97977f // sdot v31.4s, v27.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e9994dd // sdot v29.4s, v6.16b, v25.16b\n"
+ ".inst 0x4e9794d4 // sdot v20.4s, v6.16b, v23.16b\n"
+ ".inst 0x4e979592 // sdot v18.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e939775 // sdot v21.4s, v27.16b, v19.16b\n"
+ ".inst 0x4e93975f // sdot v31.4s, v26.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97977d // sdot v29.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e939774 // sdot v20.4s, v27.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x4e939592 // sdot v18.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e989755 // sdot v21.4s, v26.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e93975d // sdot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x4e989754 // sdot v20.4s, v26.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v15.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e999592 // sdot v18.4s, v12.16b, v25.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v4.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v15.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v15.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v29.16b, v4.16b\n"
+ "and v18.16b, v21.16b, v4.16b\n"
+ "and v17.16b, v20.16b, v4.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v19.4s\n"
+ "ldr q26, [%x[params], #0xa0]\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "ldr q25, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x90]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4e839597 // sdot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809597 // sdot v23.4s, v12.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x80]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v23.16b\n .inst 0x4e829596 // sdot v22.4s, v12.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x24, x27]\n"
+ ".inst 0x4e879597 // sdot v23.4s, v12.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s21, [x23, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v4.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x4e87971f // sdot v31.4s, v24.16b, v7.16b\n"
+ ".inst 0x4e839704 // sdot v4.4s, v24.16b, v3.16b\n"
+ ".inst 0x4e83975f // sdot v31.4s, v26.16b, v3.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e879715 // sdot v21.4s, v24.16b, v7.16b\n"
+ ".inst 0x4e839714 // sdot v20.4s, v24.16b, v3.16b\n"
+ ".inst 0x4e839592 // sdot v18.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e809744 // sdot v4.4s, v26.16b, v0.16b\n"
+ ".inst 0x4e80973f // sdot v31.4s, v25.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e839755 // sdot v21.4s, v26.16b, v3.16b\n"
+ ".inst 0x4e809754 // sdot v20.4s, v26.16b, v0.16b\n"
+ "mls v31.4s, v23.4s, v16.4s\n"
+ ".inst 0x4e809592 // sdot v18.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e829724 // sdot v4.4s, v25.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e809735 // sdot v21.4s, v25.16b, v0.16b\n"
+ ".inst 0x4e829734 // sdot v20.4s, v25.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v27.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879592 // sdot v18.4s, v12.16b, v7.16b\n"
+ "mls v21.4s, v18.4s, v16.4s\n"
+ "mls v4.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v21.16b, v1.16b\n"
+ "and v18.16b, v4.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "ldr q29, [%x[params], #0x100]\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q27, [%x[params], #0xf0]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v4.4s, v4.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q26, [%x[params], #0x130]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v4.4s, v4.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v4.4s, v4.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v4.4s, v4.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x4e8a9599 // sdot v25.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e859599 // sdot v25.4s, v12.16b, v5.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q24, [%x[params], #0xe0]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v23.16b, v25.16b\n .inst 0x4e899597 // sdot v23.4s, v12.16b, v9.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s21, [x24, x27]\n"
+ ".inst 0x4e889599 // sdot v25.4s, v12.16b, v8.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s4, [x23, x27]\n"
+ "mov v22.16b, v24.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v21.16b, v24.16b\n"
+ "mov v20.16b, v24.16b\n"
+ ".inst 0x4e889778 // sdot v24.4s, v27.16b, v8.16b\n"
+ ".inst 0x4e8a9775 // sdot v21.4s, v27.16b, v10.16b\n"
+ ".inst 0x4e8a97b8 // sdot v24.4s, v29.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4e889776 // sdot v22.4s, v27.16b, v8.16b\n"
+ ".inst 0x4e8a9774 // sdot v20.4s, v27.16b, v10.16b\n"
+ ".inst 0x4e8a9592 // sdot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8597b5 // sdot v21.4s, v29.16b, v5.16b\n"
+ ".inst 0x4e859798 // sdot v24.4s, v28.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a97b6 // sdot v22.4s, v29.16b, v10.16b\n"
+ ".inst 0x4e8597b4 // sdot v20.4s, v29.16b, v5.16b\n"
+ "mls v24.4s, v25.4s, v16.4s\n"
+ ".inst 0x4e859592 // sdot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e899795 // sdot v21.4s, v28.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e859796 // sdot v22.4s, v28.16b, v5.16b\n"
+ ".inst 0x4e899794 // sdot v20.4s, v28.16b, v9.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x4e899591 // sdot v17.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889592 // sdot v18.4s, v12.16b, v8.16b\n"
+ "mls v22.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v23.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v26.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "and v19.16b, v22.16b, v26.16b\n"
+ "and v18.16b, v21.16b, v26.16b\n"
+ "and v17.16b, v20.16b, v26.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "srshl v24.4s, v24.4s, v26.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "srshl v21.4s, v21.4s, v26.4s\n"
+ "srshl v20.4s, v20.4s, v26.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x27]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x24, x27]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s21, [x23, x27]\n"
+ "str s20, [x22, x27]\n"
+ "add x27, x27, #0x4\n"
+ "beq 35f\n"
+ "3:" // Oddments
+ "and x20, %x[n_channels], #0xf\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x21, x21, x28\n"
+ "tbz %x[n_channels], #3, 7f\n"
+ "ldr d15, [x15], #0x8\n"
+ "ldr d25, [x14], #0x8\n"
+ "ldr d7, [x13], #0x8\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d26, [x10], #0x8\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d3, [x26], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v15.s }[2], [x15], #0x4\n"
+ "ld1 { v25.s }[2], [x14], #0x4\n"
+ "ld1 { v7.s }[2], [x13], #0x4\n"
+ "ld1 { v8.s }[2], [x12], #0x4\n"
+ "ld1 { v26.s }[2], [x10], #0x4\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v3.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v15.h }[6], [x15], #0x2\n"
+ "ld1 { v25.h }[6], [x14], #0x2\n"
+ "ld1 { v7.h }[6], [x13], #0x2\n"
+ "ld1 { v8.h }[6], [x12], #0x2\n"
+ "ld1 { v26.h }[6], [x10], #0x2\n"
+ "ld1 { v23.h }[6], [x9], #0x2\n"
+ "ld1 { v3.h }[6], [x26], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[14], [x15], #0x1\n"
+ "ld1 { v25.b }[14], [x14], #0x1\n"
+ "ld1 { v7.b }[14], [x13], #0x1\n"
+ "ld1 { v8.b }[14], [x12], #0x1\n"
+ "ld1 { v26.b }[14], [x10], #0x1\n"
+ "ld1 { v23.b }[14], [x9], #0x1\n"
+ "ld1 { v3.b }[14], [x26], #0x1\n"
+ "ld1 { v10.b }[14], [x21], #0x1\n"
+ "b 11f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[12], [x15], #0x1\n"
+ "ld1 { v25.b }[12], [x14], #0x1\n"
+ "ld1 { v7.b }[12], [x13], #0x1\n"
+ "ld1 { v8.b }[12], [x12], #0x1\n"
+ "ld1 { v26.b }[12], [x10], #0x1\n"
+ "ld1 { v23.b }[12], [x9], #0x1\n"
+ "ld1 { v3.b }[12], [x26], #0x1\n"
+ "ld1 { v10.b }[12], [x21], #0x1\n"
+ "b 11f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v15.h }[4], [x15], #0x2\n"
+ "ld1 { v25.h }[4], [x14], #0x2\n"
+ "ld1 { v7.h }[4], [x13], #0x2\n"
+ "ld1 { v8.h }[4], [x12], #0x2\n"
+ "ld1 { v26.h }[4], [x10], #0x2\n"
+ "ld1 { v23.h }[4], [x9], #0x2\n"
+ "ld1 { v3.h }[4], [x26], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[10], [x15], #0x1\n"
+ "ld1 { v25.b }[10], [x14], #0x1\n"
+ "ld1 { v7.b }[10], [x13], #0x1\n"
+ "ld1 { v8.b }[10], [x12], #0x1\n"
+ "ld1 { v26.b }[10], [x10], #0x1\n"
+ "ld1 { v23.b }[10], [x9], #0x1\n"
+ "ld1 { v3.b }[10], [x26], #0x1\n"
+ "ld1 { v10.b }[10], [x21], #0x1\n"
+ "b 11f\n"
+ "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[8], [x15], #0x1\n"
+ "ld1 { v25.b }[8], [x14], #0x1\n"
+ "ld1 { v7.b }[8], [x13], #0x1\n"
+ "ld1 { v8.b }[8], [x12], #0x1\n"
+ "ld1 { v26.b }[8], [x10], #0x1\n"
+ "ld1 { v23.b }[8], [x9], #0x1\n"
+ "ld1 { v3.b }[8], [x26], #0x1\n"
+ "ld1 { v10.b }[8], [x21], #0x1\n"
+ "b 11f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 9f\n"
+ "ldr s15, [x15], #0x4\n"
+ "ldr s25, [x14], #0x4\n"
+ "ldr s7, [x13], #0x4\n"
+ "ldr s8, [x12], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr s23, [x9], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v15.h }[2], [x15], #0x2\n"
+ "ld1 { v25.h }[2], [x14], #0x2\n"
+ "ld1 { v7.h }[2], [x13], #0x2\n"
+ "ld1 { v8.h }[2], [x12], #0x2\n"
+ "ld1 { v26.h }[2], [x10], #0x2\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v3.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[6], [x15], #0x1\n"
+ "ld1 { v25.b }[6], [x14], #0x1\n"
+ "ld1 { v7.b }[6], [x13], #0x1\n"
+ "ld1 { v8.b }[6], [x12], #0x1\n"
+ "ld1 { v26.b }[6], [x10], #0x1\n"
+ "ld1 { v23.b }[6], [x9], #0x1\n"
+ "ld1 { v3.b }[6], [x26], #0x1\n"
+ "ld1 { v10.b }[6], [x21], #0x1\n"
+ "b 11f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[4], [x15], #0x1\n"
+ "ld1 { v25.b }[4], [x14], #0x1\n"
+ "ld1 { v7.b }[4], [x13], #0x1\n"
+ "ld1 { v8.b }[4], [x12], #0x1\n"
+ "ld1 { v26.b }[4], [x10], #0x1\n"
+ "ld1 { v23.b }[4], [x9], #0x1\n"
+ "ld1 { v3.b }[4], [x26], #0x1\n"
+ "ld1 { v10.b }[4], [x21], #0x1\n"
+ "b 11f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h15, [x15], #0x2\n"
+ "ldr h25, [x14], #0x2\n"
+ "ldr h7, [x13], #0x2\n"
+ "ldr h8, [x12], #0x2\n"
+ "ldr h26, [x10], #0x2\n"
+ "ldr h23, [x9], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h10, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[2], [x15], #0x1\n"
+ "ld1 { v25.b }[2], [x14], #0x1\n"
+ "ld1 { v7.b }[2], [x13], #0x1\n"
+ "ld1 { v8.b }[2], [x12], #0x1\n"
+ "ld1 { v26.b }[2], [x10], #0x1\n"
+ "ld1 { v23.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x26], #0x1\n"
+ "ld1 { v10.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b15, [x15], #0x1\n"
+ "ldr b25, [x14], #0x1\n"
+ "ldr b7, [x13], #0x1\n"
+ "ldr b8, [x12], #0x1\n"
+ "ldr b26, [x10], #0x1\n"
+ "ldr b23, [x9], #0x1\n"
+ "ldr b3, [x26], #0x1\n"
+ "ldr b10, [x21], #0x1\n"
+ "11:" // Oddments: Load (A): Bit 3: End
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x21, x21, x28\n"
+ "tbz %x[n_channels], #3, 15f\n"
+ "ldr d22, [x15], #0x8\n"
+ "ldr d19, [x14], #0x8\n"
+ "ldr d0, [x13], #0x8\n"
+ "ldr d5, [x12], #0x8\n"
+ "ldr d27, [x10], #0x8\n"
+ "ldr d24, [x9], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d9, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v22.s }[2], [x15], #0x4\n"
+ "ld1 { v19.s }[2], [x14], #0x4\n"
+ "ld1 { v0.s }[2], [x13], #0x4\n"
+ "ld1 { v5.s }[2], [x12], #0x4\n"
+ "ld1 { v27.s }[2], [x10], #0x4\n"
+ "ld1 { v24.s }[2], [x9], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v22.h }[6], [x15], #0x2\n"
+ "ld1 { v19.h }[6], [x14], #0x2\n"
+ "ld1 { v0.h }[6], [x13], #0x2\n"
+ "ld1 { v5.h }[6], [x12], #0x2\n"
+ "ld1 { v27.h }[6], [x10], #0x2\n"
+ "ld1 { v24.h }[6], [x9], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[14], [x15], #0x1\n"
+ "ld1 { v19.b }[14], [x14], #0x1\n"
+ "ld1 { v0.b }[14], [x13], #0x1\n"
+ "ld1 { v5.b }[14], [x12], #0x1\n"
+ "ld1 { v27.b }[14], [x10], #0x1\n"
+ "ld1 { v24.b }[14], [x9], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v9.b }[14], [x21], #0x1\n"
+ "b 19f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[12], [x15], #0x1\n"
+ "ld1 { v19.b }[12], [x14], #0x1\n"
+ "ld1 { v0.b }[12], [x13], #0x1\n"
+ "ld1 { v5.b }[12], [x12], #0x1\n"
+ "ld1 { v27.b }[12], [x10], #0x1\n"
+ "ld1 { v24.b }[12], [x9], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v9.b }[12], [x21], #0x1\n"
+ "b 19f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v22.h }[4], [x15], #0x2\n"
+ "ld1 { v19.h }[4], [x14], #0x2\n"
+ "ld1 { v0.h }[4], [x13], #0x2\n"
+ "ld1 { v5.h }[4], [x12], #0x2\n"
+ "ld1 { v27.h }[4], [x10], #0x2\n"
+ "ld1 { v24.h }[4], [x9], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[10], [x15], #0x1\n"
+ "ld1 { v19.b }[10], [x14], #0x1\n"
+ "ld1 { v0.b }[10], [x13], #0x1\n"
+ "ld1 { v5.b }[10], [x12], #0x1\n"
+ "ld1 { v27.b }[10], [x10], #0x1\n"
+ "ld1 { v24.b }[10], [x9], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v9.b }[10], [x21], #0x1\n"
+ "b 19f\n"
+ "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[8], [x15], #0x1\n"
+ "ld1 { v19.b }[8], [x14], #0x1\n"
+ "ld1 { v0.b }[8], [x13], #0x1\n"
+ "ld1 { v5.b }[8], [x12], #0x1\n"
+ "ld1 { v27.b }[8], [x10], #0x1\n"
+ "ld1 { v24.b }[8], [x9], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v9.b }[8], [x21], #0x1\n"
+ "b 19f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr s22, [x15], #0x4\n"
+ "ldr s19, [x14], #0x4\n"
+ "ldr s0, [x13], #0x4\n"
+ "ldr s5, [x12], #0x4\n"
+ "ldr s27, [x10], #0x4\n"
+ "ldr s24, [x9], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s9, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v22.h }[2], [x15], #0x2\n"
+ "ld1 { v19.h }[2], [x14], #0x2\n"
+ "ld1 { v0.h }[2], [x13], #0x2\n"
+ "ld1 { v5.h }[2], [x12], #0x2\n"
+ "ld1 { v27.h }[2], [x10], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[6], [x15], #0x1\n"
+ "ld1 { v19.b }[6], [x14], #0x1\n"
+ "ld1 { v0.b }[6], [x13], #0x1\n"
+ "ld1 { v5.b }[6], [x12], #0x1\n"
+ "ld1 { v27.b }[6], [x10], #0x1\n"
+ "ld1 { v24.b }[6], [x9], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v9.b }[6], [x21], #0x1\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[4], [x15], #0x1\n"
+ "ld1 { v19.b }[4], [x14], #0x1\n"
+ "ld1 { v0.b }[4], [x13], #0x1\n"
+ "ld1 { v5.b }[4], [x12], #0x1\n"
+ "ld1 { v27.b }[4], [x10], #0x1\n"
+ "ld1 { v24.b }[4], [x9], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v9.b }[4], [x21], #0x1\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr h22, [x15], #0x2\n"
+ "ldr h19, [x14], #0x2\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h5, [x12], #0x2\n"
+ "ldr h27, [x10], #0x2\n"
+ "ldr h24, [x9], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h9, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[2], [x15], #0x1\n"
+ "ld1 { v19.b }[2], [x14], #0x1\n"
+ "ld1 { v0.b }[2], [x13], #0x1\n"
+ "ld1 { v5.b }[2], [x12], #0x1\n"
+ "ld1 { v27.b }[2], [x10], #0x1\n"
+ "ld1 { v24.b }[2], [x9], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v9.b }[2], [x21], #0x1\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b22, [x15], #0x1\n"
+ "ldr b19, [x14], #0x1\n"
+ "ldr b0, [x13], #0x1\n"
+ "ldr b5, [x12], #0x1\n"
+ "ldr b27, [x10], #0x1\n"
+ "ldr b24, [x9], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b9, [x21], #0x1\n"
+ "19:" // Oddments: Load (B): Bit 3: End
+ "ldr q20, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip2 v1.16b, v26.16b, v3.16b\n"
+ "zip1 v26.16b, v26.16b, v3.16b\n"
+ "ldr q4, [%x[params], #0x30]\n"
+ "zip1 v18.16b, v23.16b, v10.16b\n"
+ "zip2 v30.16b, v15.16b, v7.16b\n"
+ "cmp x20, #0x4\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "zip1 v29.16b, v25.16b, v8.16b\n"
+ "zip2 v8.16b, v25.16b, v8.16b\n"
+ "zip2 v10.16b, v23.16b, v10.16b\n"
+ "zip2 v23.16b, v26.16b, v18.16b\n"
+ "zip1 v26.16b, v26.16b, v18.16b\n"
+ "zip2 v28.16b, v22.16b, v0.16b\n"
+ "zip1 v22.16b, v22.16b, v0.16b\n"
+ "zip1 v21.16b, v19.16b, v5.16b\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4e9a9591 // sdot v17.4s, v12.16b, v26.16b\n"
+ "zip2 v25.16b, v15.16b, v29.16b\n"
+ "zip1 v15.16b, v15.16b, v29.16b\n"
+ "zip1 v7.16b, v30.16b, v8.16b\n"
+ "zip2 v8.16b, v30.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v5.16b, v19.16b, v5.16b\n"
+ "zip2 v30.16b, v27.16b, v2.16b\n"
+ "zip1 v27.16b, v27.16b, v2.16b\n"
+ "zip1 v18.16b, v24.16b, v9.16b\n"
+ "zip2 v9.16b, v24.16b, v9.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v3.16b, v1.16b, v10.16b\n"
+ ".inst 0x4e969591 // sdot v17.4s, v12.16b, v22.16b\n"
+ "zip2 v10.16b, v1.16b, v10.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v18.16b\n"
+ "zip1 v27.16b, v27.16b, v18.16b\n"
+ "zip1 v2.16b, v30.16b, v9.16b\n"
+ "mov v18.16b, v17.16b\n .inst 0x4e9b9592 // sdot v18.4s, v12.16b, v27.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ ".inst 0x4e8f9591 // sdot v17.4s, v12.16b, v15.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e8f969f // sdot v31.4s, v20.16b, v15.16b\n"
+ ".inst 0x4e9a969d // sdot v29.4s, v20.16b, v26.16b\n"
+ ".inst 0x4e9a94df // sdot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "movi v1.4s, #0x0\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x4e9a9581 // sdot v1.4s, v12.16b, v26.16b\n"
+ ".inst 0x4e9694dd // sdot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x4e96949f // sdot v31.4s, v4.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x4e8f969e // sdot v30.4s, v20.16b, v15.16b\n"
+ ".inst 0x4e9a969c // sdot v28.4s, v20.16b, v26.16b\n"
+ "mls v31.4s, v17.4s, v16.4s\n"
+ ".inst 0x4e969581 // sdot v1.4s, v12.16b, v22.16b\n"
+ ".inst 0x4e9b949d // sdot v29.4s, v4.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e9a94de // sdot v30.4s, v6.16b, v26.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ ".inst 0x4e9694dc // sdot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mov v20.16b, v1.16b\n .inst 0x4e9b9594 // sdot v20.4s, v12.16b, v27.16b\n"
+ ".inst 0x4e8f9581 // sdot v1.4s, v12.16b, v15.16b\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ ".inst 0x4e96949e // sdot v30.4s, v4.16b, v22.16b\n"
+ ".inst 0x4e9b949c // sdot v28.4s, v4.16b, v27.16b\n"
+ "mls v30.4s, v1.4s, v16.4s\n"
+ "add %x[params], %x[params], #0x60\n"
+ "mls v28.4s, v20.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v18.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v18.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v26.16b, v28.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v26.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 20f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 23f\n"
+ "20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 21f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 22f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+ "23:" // Oddments: Unroll 0: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q27, [%x[params], #0x10]\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x4e979581 // sdot v1.4s, v12.16b, v23.16b\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q4, [%x[params], #0x40]\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e99977f // sdot v31.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e939581 // sdot v1.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e97977d // sdot v29.4s, v27.16b, v23.16b\n"
+ "movi v20.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e97975f // sdot v31.4s, v26.16b, v23.16b\n"
+ "mov v18.16b, v1.16b\n .inst 0x4e989592 // sdot v18.4s, v12.16b, v24.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e999581 // sdot v1.4s, v12.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x4e99977e // sdot v30.4s, v27.16b, v25.16b\n"
+ ".inst 0x4e97977c // sdot v28.4s, v27.16b, v23.16b\n"
+ ".inst 0x4e979594 // sdot v20.4s, v12.16b, v23.16b\n"
+ ".inst 0x4e93975d // sdot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x4e9396df // sdot v31.4s, v22.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x4e97975e // sdot v30.4s, v26.16b, v23.16b\n"
+ ".inst 0x4e93975c // sdot v28.4s, v26.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x4e939594 // sdot v20.4s, v12.16b, v19.16b\n"
+ ".inst 0x4e9896dd // sdot v29.4s, v22.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x4e9396de // sdot v30.4s, v22.16b, v19.16b\n"
+ ".inst 0x4e9896dc // sdot v28.4s, v22.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "mov v17.16b, v20.16b\n .inst 0x4e989591 // sdot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x4e999594 // sdot v20.4s, v12.16b, v25.16b\n"
+ "mls v30.4s, v20.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v28.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 24f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 27f\n"
+ "24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 25f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 26f\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+ "27:" // Oddments: Unroll 1: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x4e839598 // sdot v24.4s, v12.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e87973f // sdot v31.4s, v25.16b, v7.16b\n"
+ ".inst 0x4e809598 // sdot v24.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e83973d // sdot v29.4s, v25.16b, v3.16b\n"
+ "movi v19.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n"
+ "mov v18.16b, v24.16b\n .inst 0x4e829592 // sdot v18.4s, v12.16b, v2.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e879598 // sdot v24.4s, v12.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e87973e // sdot v30.4s, v25.16b, v7.16b\n"
+ ".inst 0x4e83973c // sdot v28.4s, v25.16b, v3.16b\n"
+ ".inst 0x4e839593 // sdot v19.4s, v12.16b, v3.16b\n"
+ ".inst 0x4e8096fd // sdot v29.4s, v23.16b, v0.16b\n"
+ ".inst 0x4e8096df // sdot v31.4s, v22.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e8396fe // sdot v30.4s, v23.16b, v3.16b\n"
+ ".inst 0x4e8096fc // sdot v28.4s, v23.16b, v0.16b\n"
+ "mls v31.4s, v24.4s, v16.4s\n"
+ ".inst 0x4e809593 // sdot v19.4s, v12.16b, v0.16b\n"
+ ".inst 0x4e8296dd // sdot v29.4s, v22.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x4e8096de // sdot v30.4s, v22.16b, v0.16b\n"
+ ".inst 0x4e8296dc // sdot v28.4s, v22.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "mov v17.16b, v19.16b\n .inst 0x4e829591 // sdot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x4e879593 // sdot v19.4s, v12.16b, v7.16b\n"
+ "mls v30.4s, v19.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v28.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 28f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 31f\n"
+ "28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 30f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 30f\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+ "31:" // Oddments: Unroll 2: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x4e8a9596 // sdot v22.4s, v12.16b, v10.16b\n"
+ "ldr q21, [%x[params], #0x20]\n"
+ "ldr q19, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x4e8896ff // sdot v31.4s, v23.16b, v8.16b\n"
+ ".inst 0x4e859596 // sdot v22.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e8a96fd // sdot v29.4s, v23.16b, v10.16b\n"
+ "movi v18.4s, #0x0\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e8a96bf // sdot v31.4s, v21.16b, v10.16b\n"
+ "mov v17.16b, v22.16b\n .inst 0x4e899591 // sdot v17.4s, v12.16b, v9.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x4e889596 // sdot v22.4s, v12.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x4e8896fe // sdot v30.4s, v23.16b, v8.16b\n"
+ ".inst 0x4e8a96fc // sdot v28.4s, v23.16b, v10.16b\n"
+ ".inst 0x4e8a9592 // sdot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x4e8596bd // sdot v29.4s, v21.16b, v5.16b\n"
+ ".inst 0x4e85967f // sdot v31.4s, v19.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8a96be // sdot v30.4s, v21.16b, v10.16b\n"
+ ".inst 0x4e8596bc // sdot v28.4s, v21.16b, v5.16b\n"
+ "mls v31.4s, v22.4s, v16.4s\n"
+ ".inst 0x4e859592 // sdot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x4e89967d // sdot v29.4s, v19.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x4e85967e // sdot v30.4s, v19.16b, v5.16b\n"
+ ".inst 0x4e89967c // sdot v28.4s, v19.16b, v9.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v20.4s\n"
+ "mov v7.16b, v18.16b\n .inst 0x4e899587 // sdot v7.4s, v12.16b, v9.16b\n"
+ ".inst 0x4e889592 // sdot v18.4s, v12.16b, v8.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v28.4s, v7.4s, v16.4s\n"
+ "and v16.16b, v31.16b, v26.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v20.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v17.16b, v29.16b, v26.16b\n"
+ "and v16.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 34f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 34f\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+ "35:" // End
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..874b18c145
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const int8_t *const *const,
+ const int8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ int8_t *const *const
+);
+
+class a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t> // s8 input/weights/output, s32 accumulators
+{ // Strategy wrapper: describes the 3x3 stride-1 s8q NHWC MLA kernel (2x2 output tile) to the depthfirst driver.
+  using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>; // shorthand for the base strategy type
+
+  public:
+  constexpr static unsigned int kernel_rows = 3; // 3x3 depthwise filter window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1; // unit stride in both spatial dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} // Parent(out_rows=2, out_cols=2, kern_rows=3, kern_cols=3, stride_rows=1, stride_cols=1); CPUInfo is unused here
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } // fixed-length vectors (plain A64 NEON, not SVE)
+
+  Parent::KernelType kernel = a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; // the assembly kernel declared above
+  Parent::KernelType get_kernel(void) const override { return kernel; } // hand the driver the kernel entry point
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; } // kernel accumulates two vector-lengths of channels per pass -- presumably matches the impl's 8-channel main loop; confirm against generic.cpp
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..4626007afa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1166 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v14.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "ssubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "ssubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "ssubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "ssubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 64f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ssubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ssubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "ssubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "ssubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "ssubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "ssubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "ssubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ssubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ssubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ssubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
+ "st1 { v9.s }[0], [x11], #0x4\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "st1 { v9.h }[2], [x11], #0x2\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[6], [x11], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[4], [x11], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "st1 { v9.h }[0], [x11], #0x2\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[2], [x11], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[0], [x11], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..893260362a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Forward declaration of the generated AArch64 kernel: signed 8-bit quantized,
+// NHWC, 3x3 kernel, stride 2, 2x2 output tile, multiply-accumulate depthfirst.
+// Parameter meanings follow the definition in the matching generic.cpp:
+void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+  const unsigned int,               // n_channels: number of channels to process
+  const int8_t *const *const,       // inptrs: array of input row/col pointers
+  const int8_t *const,              // weights: packed 3x3 kernel weights
+  const int32_t *const,             // bias: per-channel bias (may be consumed in quadwords)
+  const arm_gemm::Requantize32 &,   // qp: requantization parameters (offsets, clamp bounds)
+  const int32_t *const,             // requant_muls: per-channel requantize multipliers
+  const int32_t *const,             // requant_shifts: per-channel requantize shifts
+  int8_t *const *const              // outptrs: output tile pointers (2x2 = 4 rows)
+);
+
+
+// Strategy wrapper binding the generated s8q 3x3/s2 MLA kernel into the
+// depthwise-depthfirst dispatch framework (selected via depthwise_s8q.cpp).
+class a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 convolution window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;  // stride 2 in both dimensions
+  constexpr static unsigned int stride_cols = 2;
+
+  // Parent args: (output rows, output cols, kernel rows, kernel cols, stride rows, stride cols).
+  // CPUInfo is unused: this kernel has no CPU-feature-specific variants.
+  a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+  // Plain NEON kernel; no scalable-vector (SVE) variant.
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  Parent::KernelType kernel = a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  // Asm loop processes 8 channels per iteration (two 4-lane accumulator vectors).
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d98ab71cb8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "ssubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x12, x12, #0x20\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d25, [x27, x17]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "ssubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "tst x7, #0x7\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 88f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ssubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ssubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ssubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ssubl v2.8h, v2.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ssubl v12.8h, v12.8b, v6.8b\n"
+ "ssubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ssubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 66f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x7, #1, 70f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x7, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x7, #1, 78f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 82f\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 86f\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..ccab35ce57
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Forward declaration of the hand-written AArch64 assembly kernel; the
+// definition lives in the matching generic.cpp for this strategy.
+void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int,                // n_channels
+  const int8_t *const *const,        // inptrs: array of input pointers
+  const int8_t *const,               // weights
+  const int32_t *const,              // bias
+  const arm_gemm::Requantize32 &,    // quantisation parameters
+  const int32_t *const,              // requant_muls
+  const int32_t *const,              // requant_shifts
+  int8_t *const *const               // outptrs: array of output pointers
+);
+
+
+
+// Strategy class binding the s8q (signed 8-bit quantised) NHWC depthwise
+// kernel — 5x5 filter, unit stride, 2x2 output tile, multiply-accumulate
+// (MLA) based — to the generic depthfirst driver machinery.
+class a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+  public:
+  // Filter dimensions handled by this kernel.
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  // Convolution stride in each spatial dimension.
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Parent arguments (2, 2, 5, 5, 1, 1) match the 2x2 output tile, 5x5
+  // kernel and unit stride encoded in the class name.
+  a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+  // Fixed-width NEON implementation: no scalable-vector (SVE) length here.
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  // The actual compute routine is the assembly function declared above.
+  Parent::KernelType kernel = a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..b1648bae14
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2187 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x2, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
+ "cbz x2, 3f\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "subs x2, x2, #0x1\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
+ "beq 124f\n"
+ "add x6, x6, #0xc8\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
+ "tbz x1, #1, 4f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x1, #1, 6f\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x1, #1, 10f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "ssubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x1, #1, 14f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x1, #1, 18f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ssubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x1, #1, 22f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d14, [x6, #0x28]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x1, #1, 26f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d21, [x6, #0x30]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x1, #1, 30f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d9, [x6, #0x38]\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x1, #1, 34f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d31, [x6, #0x40]\n"
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x1, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d16, [x6, #0x48]\n"
+ "ssubl v8.8h, v8.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x1, #1, 42f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d21, [x6, #0x50]\n"
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x1, #1, 46f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "ssubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x1, #1, 50f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d2, [x6, #0x58]\n"
+ "ssubl v28.8h, v28.8b, v18.8b\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x1, #1, 54f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d25, [x6, #0x60]\n"
+ "ssubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x1, #1, 58f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d1, [x6, #0x68]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x1, #1, 62f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[0], [x20]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d16, [x6, #0x70]\n"
+ "ssubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x1, #1, 66f\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d17, [x6, #0x78]\n"
+ "ssubl v14.8h, v14.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
+ "tbz x1, #2, 69f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 68f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x1, #1, 70f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "ssubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x1, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d29, [x6, #0x80]\n"
+ "ssubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x1, #1, 78f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d12, [x6, #0x88]\n"
+ "ssubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x1, #1, 82f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d21, [x6, #0x90]\n"
+ "ssubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x1, #1, 86f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d8, [x6, #0x98]\n"
+ "ssubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x1, #1, 90f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d9, [x6, #0xa0]\n"
+ "ssubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x1, #1, 94f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "ssubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x1, #1, 98f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d12, [x6, #0xa8]\n"
+ "ssubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x1, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d28, [x6, #0xb0]\n"
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x1, #1, 106f\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d30, [x6, #0xb8]\n"
+ "ssubl v2.8h, v2.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x1, #1, 110f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d8, [x6, #0xc0]\n"
+ "ssubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x1, #1, 114f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "ssubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x1, #1, 118f\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x1, #1, 122f\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..9c92a9dd46
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+class a64_s8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ KernelType kernel = a64_s8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+ public:
+ a64_s8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<int8_t, int8_t, int8_t, int32_t>(9, arm_gemm::VLType::None) {}
+
+ KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..77b7d231e0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ const arm_gemm::Requantize32& qp,
+ const unsigned int n_points,
+ const unsigned int n_channels
+)
+{
+ __asm__ __volatile__(
+ "lsr x9, %x[n_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v5.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "mov x11, #0x0\n"
+ "cbz x9, 6f\n"
+ "1:" // Channel loop
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q23, [%x[bias], x20]\n"
+ "2:" // Channel loop: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "subs x24, x24, #0x1\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ldr s20, [x21, x11]\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 5f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q2, [%x[rq_mul_ptr], x20]\n"
+ "ldr q1, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 5f\n"
+ "ldr q3, [%x[rq_left_shift_ptr], x20]\n"
+ "5:" // Channel loop: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s23, [x28, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s24, [x27, x11]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s25, [x26, x11]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x11]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x11]\n"
+ "str s28, [x23, x11]\n"
+ "str s29, [x22, x11]\n"
+ "str s30, [x21, x11]\n"
+ "str s31, [x20, x11]\n"
+ "add x11, x11, #0x4\n"
+ "cmp x11, x9, LSL #2\n"
+ "blt 1b\n"
+ "6:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 24f\n"
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 9f\n"
+ "add x20, %x[bias], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
+ "b 8f\n"
+ "7:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "8:" // Oddments: Load bias: Bit 1: End
+ "9:" // Oddments: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "11:" // Oddments: Load: Bit 1: End
+ "subs x20, %x[n_points], #0x1\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "ble 15f\n"
+ "12:" // Oddments: Planar loop
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ssubl v0.8h, v0.8b, v5.8b\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 14f\n"
+ "13:" // Oddments: Planar loop: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "14:" // Oddments: Planar loop: Load: Bit 1: End
+ "subs x20, x20, #0x1\n"
+ "ssubl v14.8h, v14.8b, v6.8b\n"
+ "ssubl v15.8h, v15.8b, v6.8b\n"
+ "ssubl v16.8h, v16.8b, v6.8b\n"
+ "ssubl v17.8h, v17.8b, v6.8b\n"
+ "ssubl v18.8h, v18.8b, v6.8b\n"
+ "ssubl v19.8h, v19.8b, v6.8b\n"
+ "ssubl v20.8h, v20.8b, v6.8b\n"
+ "ssubl v21.8h, v21.8b, v6.8b\n"
+ "ssubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 12b\n"
+ "15:" // Oddments: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 21f\n"
+ "add x22, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v2.d }[0], [x22], #0x8\n"
+ "ld1 { v1.d }[0], [x21], #0x8\n"
+ "cbz %x[rq_left_shift_ptr], 16f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v2.s }[2], [x22], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 17f\n"
+ "ld1 { v3.s }[2], [x20], #0x4\n"
+ "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+ "b 20f\n"
+ "18:" // Oddments: Load quantisation parameters: Bit 1: Unset
+ "ld1 { v2.s }[0], [x22], #0x4\n"
+ "ld1 { v1.s }[0], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 19f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+ "20:" // Oddments: Load quantisation parameters: Bit 1: End
+ "21:" // Oddments: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "b 23f\n"
+ "22:" // Oddments: Store: Bit 1: Unset
+ "st1 { v23.b }[0], [x28], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "23:" // Oddments: Store: Bit 1: End
+ "24:" // End
+ : [params] "+&r" (params)
+ : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..14adf8880f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Raw kernel entry point; implemented in hand-written assembly in the matching
+// generic.cpp.  Arguments: input pointer array, output pointer array, packed
+// parameter blob, number of output channels, requantisation parameters.
+void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+// Strategy descriptor for the 3x3, stride-2, s8 requantising depthwise kernel
+// with channel multiplier; the kernel emits a 2x4 output tile per invocation.
+struct a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols) // 2 output rows x 4 output columns
+ {
+ }
+
+ // Fixed-width NEON kernel: no scalable-vector (SVE) variant.
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..be8fbfa0e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Depthwise convolution kernel: 3x3 filter, stride 2, with channel
+// multiplier, int8 in / int8 out with requantisation.  The whole body is one
+// hand-scheduled AArch64 inline-assembly statement; the ".inst" words are
+// directly-encoded SDOT instructions (so the file assembles even when the
+// assembler does not accept the dot-product mnemonics).
+//
+// Parameters:
+//   inptrs            - array of pointers to packed input vectors (the asm
+//                       loads entries at offsets 0x0..0x20).
+//   outptrs           - array of eight output pointers (offsets 0x0..0x30),
+//                       one per element of the 2x4 output tile.
+//   params            - packed bias / weight / requantisation parameter blob,
+//                       consumed sequentially as the kernel advances %[params].
+//   n_output_channels - number of output channels; the main loop handles four
+//                       channels per iteration, the tail stores the remaining
+//                       0-3 channels byte by byte ("Tail: Oddments").
+//   qp                - requantisation parameters; b_offset, c_offset, minval
+//                       and maxval are read via the offsetof() operands below.
+void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ // NOTE(review): the setup below appears to use SDOT against a constant
+ // byte-mask vector (v8) to form per-lane sums of the input bytes for the
+ // b_offset correction, before entering the requantising main loop —
+ // generated code; do not edit by hand.
+ __asm__ __volatile__(
+ "ldr q11, [%x[params], #0x0]\n"
+ "ldr q5, [%x[params], #0x10]\n"
+ "movi v8.16b, #0x1\n"
+ "ushr v8.4s, v8.4s, #0x8\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "ldr q7, [%x[params], #0x30]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "mov v28.16b, v1.16b\n"
+ "mov v23.16b, v1.16b\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v21.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "mov v20.16b, v2.16b\n"
+ "mov v29.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "mov v9.16b, v4.16b\n"
+ "mov v22.16b, v4.16b\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "mov v31.16b, v4.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x4\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x2\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x6\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x4\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov v27.16b, v0.16b\n"
+ "mov v19.16b, v0.16b\n"
+ "cmp %x[n_channels], #0x4\n"
+ "mov x9, #0x0\n"
+ "mov v18.16b, v0.16b\n"
+ "mov v26.16b, v3.16b\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "mov v17.16b, v3.16b\n"
+ "mov v16.16b, v3.16b\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x2\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x4\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "add %x[params], %x[params], #0x40\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x6\n"
+ "zip1 v1.4s, v1.4s, v23.4s\n"
+ "zip1 v28.4s, v28.4s, v30.4s\n"
+ "zip1 v2.4s, v2.4s, v20.4s\n"
+ "zip1 v21.4s, v21.4s, v29.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x2\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v4.4s, v4.4s, v22.4s\n"
+ "zip1 v9.4s, v9.4s, v31.4s\n"
+ "zip1 v0.4s, v0.4s, v19.4s\n"
+ "zip1 v27.4s, v27.4s, v18.4s\n"
+ "zip1 v1.4s, v1.4s, v28.4s\n"
+ "zip1 v2.4s, v2.4s, v21.4s\n"
+ ".inst 0x4f81e118 // sdot v24.4s, v8.16b, v1.4b[0]\n"
+ "zip1 v3.4s, v3.4s, v17.4s\n"
+ "zip1 v26.4s, v26.4s, v16.4s\n"
+ ".inst 0x4fa1e119 // sdot v25.4s, v8.16b, v1.4b[1]\n"
+ "zip1 v4.4s, v4.4s, v9.4s\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x4fa1e916 // sdot v22.4s, v8.16b, v1.4b[3]\n"
+ "movi v19.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x4f82e115 // sdot v21.4s, v8.16b, v2.4b[0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x4fa2e113 // sdot v19.4s, v8.16b, v2.4b[1]\n"
+ "movi v18.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x4f82e909 // sdot v9.4s, v8.16b, v2.4b[2]\n"
+ "movi v16.4s, #0x0\n"
+ "zip1 v0.4s, v0.4s, v27.4s\n"
+ ".inst 0x4fa2e90a // sdot v10.4s, v8.16b, v2.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v26.4s\n"
+ ".inst 0x4f84e114 // sdot v20.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e112 // sdot v18.4s, v8.16b, v4.4b[1]\n"
+ ".inst 0x4f84e911 // sdot v17.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e910 // sdot v16.4s, v8.16b, v4.4b[3]\n"
+ "movi v31.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ ".inst 0x4f80e11f // sdot v31.4s, v8.16b, v0.4b[0]\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x4fa0e11e // sdot v30.4s, v8.16b, v0.4b[1]\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x4f80e91a // sdot v26.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e91b // sdot v27.4s, v8.16b, v0.4b[3]\n"
+ ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e11d // sdot v29.4s, v8.16b, v3.4b[1]\n"
+ "add v24.4s, v24.4s, v21.4s\n"
+ "add v25.4s, v25.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "add v21.4s, v20.4s, v21.4s\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x4f83e914 // sdot v20.4s, v8.16b, v3.4b[2]\n"
+ "add v19.4s, v18.4s, v19.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x4fa3e912 // sdot v18.4s, v8.16b, v3.4b[3]\n"
+ "add v17.4s, v17.4s, v9.4s\n"
+ "add v16.4s, v16.4s, v10.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v23.4s, v26.4s\n"
+ "add v27.4s, v22.4s, v27.4s\n"
+ "add v28.4s, v21.4s, v28.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v17.4s, v20.4s\n"
+ "add v31.4s, v16.4s, v18.4s\n"
+ "neg v12.4s, v12.4s\n"
+ "mul v24.4s, v24.4s, v12.4s\n"
+ "mul v25.4s, v25.4s, v12.4s\n"
+ "mul v26.4s, v26.4s, v12.4s\n"
+ "mul v27.4s, v27.4s, v12.4s\n"
+ "mul v28.4s, v28.4s, v12.4s\n"
+ "mul v29.4s, v29.4s, v12.4s\n"
+ "mul v30.4s, v30.4s, v12.4s\n"
+ "mul v31.4s, v31.4s, v12.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ "ldr q8, [%x[params], #0x0]\n"
+ "ldr q21, [%x[params], #0x10]\n"
+ ".inst 0x4f80e0b8 // sdot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0b9 // sdot v25.4s, v5.16b, v0.4b[1]\n"
+ "ldr q20, [%x[params], #0x20]\n"
+ ".inst 0x4f80e8ba // sdot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8bb // sdot v27.4s, v5.16b, v0.4b[3]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x4f81e0d8 // sdot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x4f81e8da // sdot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8db // sdot v27.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4f82e0bc // sdot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0bd // sdot v29.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x4f82e8be // sdot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8bf // sdot v31.4s, v5.16b, v2.4b[3]\n"
+ "ldr q5, [%x[params], #0x30]\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e0dd // sdot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ ".inst 0x4f83e8de // sdot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [%x[params], #0x40]\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ ".inst 0x4f84e0fc // sdot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e0fd // sdot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x4f84e8fe // sdot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e8ff // sdot v31.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [%x[params], #0x50]\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q21, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x4f80e0b8 // sdot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x4fa0e0b9 // sdot v25.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4f80e8ba // sdot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e8bb // sdot v27.4s, v5.16b, v0.4b[3]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x27, x27, x28\n"
+ ".inst 0x4f81e0d8 // sdot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4fa1e0d9 // sdot v25.4s, v6.16b, v1.4b[1]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x4f81e8da // sdot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4fa1e8db // sdot v27.4s, v6.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x4f82e0bc // sdot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0bd // sdot v29.4s, v5.16b, v2.4b[1]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x4f82e8be // sdot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8bf // sdot v31.4s, v5.16b, v2.4b[3]\n"
+ "add x20, x20, x28\n"
+ "add %x[params], %x[params], #0x20\n"
+ ".inst 0x4f82e0f8 // sdot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4fa2e0f9 // sdot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4fa3e0dd // sdot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x4f83e8de // sdot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4fa3e8df // sdot v31.4s, v6.16b, v3.4b[3]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x4f84e0fc // sdot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4fa4e0fd // sdot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x4f84e8fe // sdot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4fa4e8ff // sdot v31.4s, v7.16b, v4.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "blt 3f\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "beq 4f\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
+ "4:" // Tail: End
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..62b033f48d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Raw kernel entry point; implemented in hand-written assembly in the matching
+// generic.cpp.  Arguments: input pointer array, output pointer array, packed
+// parameter blob, number of output channels, requantisation parameters.
+void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+// Strategy descriptor for the 5x5, stride-1, s8 requantising depthwise kernel
+// with channel multiplier; the kernel emits a 4x2 output tile per invocation.
+struct a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols) // 4 output rows x 2 output columns
+ {
+ }
+
+ // Fixed-width NEON kernel: no scalable-vector (SVE) variant.
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..17afc92e30
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,640 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Hand-scheduled AArch64 NEON implementation of the 5x5/stride-1 int8
+// quantised depthwise kernel (SDOT variant, 4x2 output tile).
+//
+//   inptrs            - pointers to the eight packed input rows (read via
+//                       offsets 0x0..0x38 from %x[inptrs])
+//   outptrs           - pointers to the eight output positions of the tile
+//   params            - packed parameter blob walked linearly: presumably
+//                       bias, interleaved weights, then per-block
+//                       requantisation multiplier/shift (TODO confirm layout
+//                       against the matching interleave routine)
+//   n_output_channels - channel count; processed four at a time with a
+//                       byte-wise oddment tail
+//   qp                - supplies b_offset, c_offset, minval, maxval
+//
+// NOTE(review): the assembly below is auto-generated and its instruction
+// order is load/latency-scheduled — do not hand-edit individual lines.
+void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+  const int8_t *const *const inptrs,
+  int8_t *const *const outptrs,
+  const void *params,
+  unsigned int n_output_channels,
+  const arm_gemm::Requantize32& qp
+)
+{
+  __asm__ __volatile__(
+    // Preamble: load the leading params block, gather/interleave the eight
+    // input rows, and compute per-position input sums with SDOT against
+    // vectors of 1 (v30/v28) for the input-offset correction below.
+    "ldr q12, [%x[params], #0x0]\n"
+    "ldr q8, [%x[params], #0x10]\n"
+    "movi v30.16b, #0x1\n"
+    "movi v17.4s, #0x0\n"
+    "ldr q9, [%x[params], #0x20]\n"
+    "ldr q10, [%x[params], #0x30]\n"
+    "movi v16.4s, #0x0\n"
+    "movi v25.4s, #0x0\n"
+    "ldr q11, [%x[params], #0x40]\n"
+    "ldr x20, [%x[inptrs], #0x18]\n"
+    "movi v24.4s, #0x0\n"
+    "movi v31.4s, #0x0\n"
+    "ld1 { v3.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x20]\n"
+    "mov v26.16b, v3.16b\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    "ld1 { v4.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x10]\n"
+    "mov v21.16b, v4.16b\n"
+    "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+    "ld1 { v2.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x8]\n"
+    "mov v27.16b, v2.16b\n"
+    "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+    "ld1 { v1.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x28]\n"
+    "zip1 v3.2d, v3.2d, v26.2d\n"
+    "zip1 v4.2d, v4.2d, v21.2d\n"
+    "ld1 { v5.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x30]\n"
+    "mov v26.16b, v1.16b\n"
+    "mov v22.16b, v5.16b\n"
+    "ld1 { v6.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x38]\n"
+    "mov v19.16b, v6.16b\n"
+    "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+    "ld1 { v7.16b }, [x20]\n"
+    "ldr x20, [%x[inptrs], #0x0]\n"
+    "mov v21.16b, v7.16b\n"
+    "zip1 v2.2d, v2.2d, v27.2d\n"
+    "ld1 { v0.16b }, [x20]\n"
+    "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+    "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+    ".inst 0x4f83e3d1 // sdot v17.4s, v30.16b, v3.4b[0]\n"
+    "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+    ".inst 0x4f83ebd0 // sdot v16.4s, v30.16b, v3.4b[2]\n"
+    ".inst 0x4f84e3d9 // sdot v25.4s, v30.16b, v4.4b[0]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v23.4s }, [x20]\n"
+    ".inst 0x4f84ebd8 // sdot v24.4s, v30.16b, v4.4b[2]\n"
+    "mov v18.16b, v0.16b\n"
+    ".inst 0x4f82e3df // sdot v31.4s, v30.16b, v2.4b[0]\n"
+    "movi v29.4s, #0x0\n"
+    "movi v28.4s, #0x1\n"
+    ".inst 0x4f82ebdd // sdot v29.4s, v30.16b, v2.4b[2]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v13.4s }, [x20]\n"
+    "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+    "zip1 v1.2d, v1.2d, v26.2d\n"
+    ".inst 0x4fa3e391 // sdot v17.4s, v28.16b, v3.4b[1]\n"
+    "zip1 v5.2d, v5.2d, v22.2d\n"
+    "zip1 v6.2d, v6.2d, v19.2d\n"
+    ".inst 0x4fa3eb90 // sdot v16.4s, v28.16b, v3.4b[3]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ld1r { v14.4s }, [x20]\n"
+    "zip1 v7.2d, v7.2d, v21.2d\n"
+    "movi v22.4s, #0x0\n"
+    ".inst 0x4fa4e399 // sdot v25.4s, v28.16b, v4.4b[1]\n"
+    "movi v21.4s, #0x0\n"
+    ".inst 0x4fa4eb98 // sdot v24.4s, v28.16b, v4.4b[3]\n"
+    ".inst 0x4f81e3d6 // sdot v22.4s, v30.16b, v1.4b[0]\n"
+    "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v15.4s }, [x20]\n"
+    "movi v26.4s, #0x0\n"
+    "movi v27.4s, #0x0\n"
+    ".inst 0x4f81ebd5 // sdot v21.4s, v30.16b, v1.4b[2]\n"
+    "movi v20.4s, #0x0\n"
+    "movi v19.4s, #0x0\n"
+    ".inst 0x4f85e3da // sdot v26.4s, v30.16b, v5.4b[0]\n"
+    "cmp %x[n_channels], #0x4\n"
+    "zip1 v0.2d, v0.2d, v18.2d\n"
+    "movi v18.4s, #0x0\n"
+    ".inst 0x4f85ebdb // sdot v27.4s, v30.16b, v5.4b[2]\n"
+    "mov x9, #0x0\n"
+    ".inst 0x4f86e3d4 // sdot v20.4s, v30.16b, v6.4b[0]\n"
+    ".inst 0x4f86ebd3 // sdot v19.4s, v30.16b, v6.4b[2]\n"
+    "add v17.4s, v17.4s, v25.4s\n"
+    "mov x28, #0x0\n"
+    "movi v25.4s, #0x0\n"
+    ".inst 0x4f87e3d2 // sdot v18.4s, v30.16b, v7.4b[0]\n"
+    ".inst 0x4f87ebd9 // sdot v25.4s, v30.16b, v7.4b[2]\n"
+    "ldp x27, x26, [%x[outptrs], #0x0]\n"
+    ".inst 0x4fa2e39f // sdot v31.4s, v28.16b, v2.4b[1]\n"
+    ".inst 0x4fa2eb9d // sdot v29.4s, v28.16b, v2.4b[3]\n"
+    "add v16.4s, v16.4s, v24.4s\n"
+    "ldp x25, x24, [%x[outptrs], #0x10]\n"
+    "movi v24.4s, #0x0\n"
+    ".inst 0x4f80e3d8 // sdot v24.4s, v30.16b, v0.4b[0]\n"
+    ".inst 0x4fa1e396 // sdot v22.4s, v28.16b, v1.4b[1]\n"
+    "ldp x23, x22, [%x[outptrs], #0x20]\n"
+    ".inst 0x4fa1eb95 // sdot v21.4s, v28.16b, v1.4b[3]\n"
+    ".inst 0x4fa5e39a // sdot v26.4s, v28.16b, v5.4b[1]\n"
+    "add v31.4s, v31.4s, v17.4s\n"
+    "ldp x21, x20, [%x[outptrs], #0x30]\n"
+    ".inst 0x4fa5eb9b // sdot v27.4s, v28.16b, v5.4b[3]\n"
+    ".inst 0x4fa6e394 // sdot v20.4s, v28.16b, v6.4b[1]\n"
+    "add v29.4s, v29.4s, v16.4s\n"
+    "add %x[params], %x[params], #0x50\n"
+    ".inst 0x4fa6eb93 // sdot v19.4s, v28.16b, v6.4b[3]\n"
+    ".inst 0x4fa7e392 // sdot v18.4s, v28.16b, v7.4b[1]\n"
+    "add v22.4s, v22.4s, v31.4s\n"
+    ".inst 0x4fa7eb99 // sdot v25.4s, v28.16b, v7.4b[3]\n"
+    ".inst 0x4fa0e398 // sdot v24.4s, v28.16b, v0.4b[1]\n"
+    "add v21.4s, v21.4s, v29.4s\n"
+    "add v20.4s, v26.4s, v20.4s\n"
+    "add v19.4s, v27.4s, v19.4s\n"
+    "add v18.4s, v18.4s, v17.4s\n"
+    "movi v17.4s, #0x0\n"
+    ".inst 0x4f80ebd1 // sdot v17.4s, v30.16b, v0.4b[2]\n"
+    ".inst 0x4fa0eb91 // sdot v17.4s, v28.16b, v0.4b[3]\n"
+    "add v16.4s, v25.4s, v16.4s\n"
+    "add v24.4s, v22.4s, v24.4s\n"
+    "add v25.4s, v21.4s, v17.4s\n"
+    "add v26.4s, v26.4s, v22.4s\n"
+    "add v27.4s, v27.4s, v21.4s\n"
+    "add v28.4s, v20.4s, v31.4s\n"
+    "add v29.4s, v19.4s, v29.4s\n"
+    "add v30.4s, v20.4s, v18.4s\n"
+    "add v31.4s, v19.4s, v16.4s\n"
+    // Input-offset correction: negate b_offset (v23) and fold
+    // -b_offset * sum(inputs) into each accumulator, then add the first
+    // params block (v12 — presumably the bias; TODO confirm blob layout).
+    "neg v23.4s, v23.4s\n"
+    "mul v24.4s, v24.4s, v23.4s\n"
+    "mul v25.4s, v25.4s, v23.4s\n"
+    "mul v26.4s, v26.4s, v23.4s\n"
+    "mul v27.4s, v27.4s, v23.4s\n"
+    "mul v28.4s, v28.4s, v23.4s\n"
+    "mul v29.4s, v29.4s, v23.4s\n"
+    "mul v30.4s, v30.4s, v23.4s\n"
+    "mul v31.4s, v31.4s, v23.4s\n"
+    "zip1 v19.4s, v24.4s, v26.4s\n"
+    "zip1 v18.4s, v25.4s, v27.4s\n"
+    "zip1 v17.4s, v28.4s, v30.4s\n"
+    "zip1 v16.4s, v29.4s, v31.4s\n"
+    "zip1 v22.4s, v19.4s, v18.4s\n"
+    "zip1 v23.4s, v17.4s, v16.4s\n"
+    "add v24.4s, v24.4s, v12.4s\n"
+    "add v25.4s, v25.4s, v12.4s\n"
+    "add v26.4s, v26.4s, v12.4s\n"
+    "add v27.4s, v27.4s, v12.4s\n"
+    "add v28.4s, v28.4s, v12.4s\n"
+    "add v29.4s, v29.4s, v12.4s\n"
+    "add v30.4s, v30.4s, v12.4s\n"
+    "add v31.4s, v31.4s, v12.4s\n"
+    "ble 2f\n"
+    // Main loop: four channels per iteration. Accumulate 25 SDOT
+    // multiply-adds per output, requantise (sqrdmulh + rounding-shift
+    // srshl with sign-correcting sqadd), add c_offset (v13), clamp to
+    // [minval v14, maxval v15], narrow with uzp1 and store 4 bytes per
+    // output pointer; then re-seed the accumulators from the saved sums.
+    "1:" // Loop
+    "ldr q12, [%x[params], #0x60]\n"
+    "ldr q21, [%x[params], #0x70]\n"
+    ".inst 0x4f80e118 // sdot v24.4s, v8.16b, v0.4b[0]\n"
+    ".inst 0x4f80e919 // sdot v25.4s, v8.16b, v0.4b[2]\n"
+    "ldr q20, [%x[params], #0x80]\n"
+    ".inst 0x4f81e11a // sdot v26.4s, v8.16b, v1.4b[0]\n"
+    ".inst 0x4f81e91b // sdot v27.4s, v8.16b, v1.4b[2]\n"
+    "sub %x[n_channels], %x[n_channels], #0x4\n"
+    ".inst 0x4fa0e138 // sdot v24.4s, v9.16b, v0.4b[1]\n"
+    ".inst 0x4fa0e939 // sdot v25.4s, v9.16b, v0.4b[3]\n"
+    "cmp %x[n_channels], #0x4\n"
+    "add x9, x9, #0x10\n"
+    ".inst 0x4fa1e13a // sdot v26.4s, v9.16b, v1.4b[1]\n"
+    ".inst 0x4fa1e93b // sdot v27.4s, v9.16b, v1.4b[3]\n"
+    ".inst 0x4f82e11c // sdot v28.4s, v8.16b, v2.4b[0]\n"
+    ".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
+    ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
+    ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+    "ldr q17, [%x[params], #0x0]\n"
+    ".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
+    ".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
+    ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+    ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+    ".inst 0x4fa2e13c // sdot v28.4s, v9.16b, v2.4b[1]\n"
+    ".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
+    ".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
+    ".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
+    "ldr q16, [%x[params], #0x10]\n"
+    ".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
+    ".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
+    ".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
+    ".inst 0x4fa2e97b // sdot v27.4s, v11.16b, v2.4b[3]\n"
+    ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
+    ".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
+    ".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
+    ".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
+    "ldr q19, [%x[params], #0x20]\n"
+    ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
+    ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
+    ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
+    ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
+    ".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
+    ".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
+    ".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
+    ".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
+    "ldr q18, [%x[params], #0x30]\n"
+    ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
+    ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
+    ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
+    ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
+    ".inst 0x4f84e23c // sdot v28.4s, v17.16b, v4.4b[0]\n"
+    ".inst 0x4f84ea3d // sdot v29.4s, v17.16b, v4.4b[2]\n"
+    ".inst 0x4f85e23e // sdot v30.4s, v17.16b, v5.4b[0]\n"
+    ".inst 0x4f85ea3f // sdot v31.4s, v17.16b, v5.4b[2]\n"
+    "ldr q17, [%x[params], #0x40]\n"
+    ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
+    ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
+    ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
+    ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
+    ".inst 0x4fa4e21c // sdot v28.4s, v16.16b, v4.4b[1]\n"
+    ".inst 0x4fa4ea1d // sdot v29.4s, v16.16b, v4.4b[3]\n"
+    ".inst 0x4fa5e21e // sdot v30.4s, v16.16b, v5.4b[1]\n"
+    ".inst 0x4fa5ea1f // sdot v31.4s, v16.16b, v5.4b[3]\n"
+    "ldr q16, [%x[params], #0x50]\n"
+    ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
+    ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
+    ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
+    ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
+    ".inst 0x4f85e27c // sdot v28.4s, v19.16b, v5.4b[0]\n"
+    ".inst 0x4f85ea7d // sdot v29.4s, v19.16b, v5.4b[2]\n"
+    ".inst 0x4f86e27e // sdot v30.4s, v19.16b, v6.4b[0]\n"
+    ".inst 0x4f86ea7f // sdot v31.4s, v19.16b, v6.4b[2]\n"
+    "ldr q10, [%x[params], #0xb0]\n"
+    ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
+    ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
+    ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
+    ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
+    ".inst 0x4fa5e25c // sdot v28.4s, v18.16b, v5.4b[1]\n"
+    ".inst 0x4fa5ea5d // sdot v29.4s, v18.16b, v5.4b[3]\n"
+    ".inst 0x4fa6e25e // sdot v30.4s, v18.16b, v6.4b[1]\n"
+    ".inst 0x4fa6ea5f // sdot v31.4s, v18.16b, v6.4b[3]\n"
+    "ldr q11, [%x[params], #0xc0]\n"
+    ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
+    ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
+    "sqrdmulh v24.4s, v24.4s, v12.4s\n"
+    ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
+    ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
+    "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+    ".inst 0x4f86e23c // sdot v28.4s, v17.16b, v6.4b[0]\n"
+    ".inst 0x4f86ea3d // sdot v29.4s, v17.16b, v6.4b[2]\n"
+    "sqrdmulh v26.4s, v26.4s, v12.4s\n"
+    ".inst 0x4f87e23e // sdot v30.4s, v17.16b, v7.4b[0]\n"
+    ".inst 0x4f87ea3f // sdot v31.4s, v17.16b, v7.4b[2]\n"
+    "ldr q8, [%x[params], #0x90]\n"
+    "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+    ".inst 0x4fa6e21c // sdot v28.4s, v16.16b, v6.4b[1]\n"
+    ".inst 0x4fa6ea1d // sdot v29.4s, v16.16b, v6.4b[3]\n"
+    "and v19.16b, v24.16b, v21.16b\n"
+    ".inst 0x4fa7e21e // sdot v30.4s, v16.16b, v7.4b[1]\n"
+    ".inst 0x4fa7ea1f // sdot v31.4s, v16.16b, v7.4b[3]\n"
+    "ldr q9, [%x[params], #0xa0]\n"
+    "and v18.16b, v25.16b, v21.16b\n"
+    "and v17.16b, v26.16b, v21.16b\n"
+    "and v16.16b, v27.16b, v21.16b\n"
+    "add %x[params], %x[params], #0xd0\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v28.4s, v28.4s, v12.4s\n"
+    "sqrdmulh v29.4s, v29.4s, v12.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v12.4s\n"
+    "sqrdmulh v31.4s, v31.4s, v12.4s\n"
+    "sqadd v24.4s, v24.4s, v19.4s\n"
+    "sqadd v25.4s, v25.4s, v18.4s\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "and v19.16b, v28.16b, v21.16b\n"
+    "and v18.16b, v29.16b, v21.16b\n"
+    "and v17.16b, v30.16b, v21.16b\n"
+    "and v16.16b, v31.16b, v21.16b\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v28.4s, v28.4s, v19.4s\n"
+    "sqadd v29.4s, v29.4s, v18.4s\n"
+    "sqadd v30.4s, v30.4s, v17.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "srshl v24.4s, v24.4s, v21.4s\n"
+    "srshl v25.4s, v25.4s, v21.4s\n"
+    "srshl v26.4s, v26.4s, v21.4s\n"
+    "srshl v27.4s, v27.4s, v21.4s\n"
+    "srshl v28.4s, v28.4s, v21.4s\n"
+    "srshl v29.4s, v29.4s, v21.4s\n"
+    "srshl v30.4s, v30.4s, v21.4s\n"
+    "srshl v31.4s, v31.4s, v21.4s\n"
+    "add v24.4s, v24.4s, v13.4s\n"
+    "add v25.4s, v25.4s, v13.4s\n"
+    "add v26.4s, v26.4s, v13.4s\n"
+    "add v27.4s, v27.4s, v13.4s\n"
+    "add v28.4s, v28.4s, v13.4s\n"
+    "add v29.4s, v29.4s, v13.4s\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "add v31.4s, v31.4s, v13.4s\n"
+    "smin v24.4s, v24.4s, v15.4s\n"
+    "smin v25.4s, v25.4s, v15.4s\n"
+    "smin v26.4s, v26.4s, v15.4s\n"
+    "smin v27.4s, v27.4s, v15.4s\n"
+    "smin v28.4s, v28.4s, v15.4s\n"
+    "smin v29.4s, v29.4s, v15.4s\n"
+    "smin v30.4s, v30.4s, v15.4s\n"
+    "smin v31.4s, v31.4s, v15.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "str s24, [x27, x28]\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "str s25, [x26, x28]\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "str s26, [x25, x28]\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s27, [x24, x28]\n"
+    "str s28, [x23, x28]\n"
+    "dup v24.4s, v22.s[0]\n"
+    "dup v25.4s, v22.s[1]\n"
+    "str s29, [x22, x28]\n"
+    "dup v26.4s, v22.s[2]\n"
+    "dup v27.4s, v22.s[3]\n"
+    "str s30, [x21, x28]\n"
+    "dup v28.4s, v23.s[0]\n"
+    "dup v29.4s, v23.s[1]\n"
+    "str s31, [x20, x28]\n"
+    "dup v30.4s, v23.s[2]\n"
+    "dup v31.4s, v23.s[3]\n"
+    "add x28, x28, #0x4\n"
+    "add v24.4s, v24.4s, v20.4s\n"
+    "add v25.4s, v25.4s, v20.4s\n"
+    "add v26.4s, v26.4s, v20.4s\n"
+    "add v27.4s, v27.4s, v20.4s\n"
+    "add v28.4s, v28.4s, v20.4s\n"
+    "add v29.4s, v29.4s, v20.4s\n"
+    "add v30.4s, v30.4s, v20.4s\n"
+    "add v31.4s, v31.4s, v20.4s\n"
+    "bgt 1b\n"
+    // Tail: same MAC + requantise sequence for the final (<= 4-channel)
+    // block; output pointers are advanced to their absolute positions.
+    "2:" // Tail
+    "ldr q21, [%x[params], #0x60]\n"
+    "ldr q20, [%x[params], #0x70]\n"
+    ".inst 0x4f80e118 // sdot v24.4s, v8.16b, v0.4b[0]\n"
+    ".inst 0x4f80e919 // sdot v25.4s, v8.16b, v0.4b[2]\n"
+    ".inst 0x4f81e11a // sdot v26.4s, v8.16b, v1.4b[0]\n"
+    ".inst 0x4f81e91b // sdot v27.4s, v8.16b, v1.4b[2]\n"
+    "cmp %x[n_channels], #0x4\n"
+    "add x27, x27, x28\n"
+    ".inst 0x4fa0e138 // sdot v24.4s, v9.16b, v0.4b[1]\n"
+    ".inst 0x4fa0e939 // sdot v25.4s, v9.16b, v0.4b[3]\n"
+    "add x26, x26, x28\n"
+    "add x25, x25, x28\n"
+    ".inst 0x4fa1e13a // sdot v26.4s, v9.16b, v1.4b[1]\n"
+    ".inst 0x4fa1e93b // sdot v27.4s, v9.16b, v1.4b[3]\n"
+    "add x24, x24, x28\n"
+    "add x23, x23, x28\n"
+    ".inst 0x4f82e11c // sdot v28.4s, v8.16b, v2.4b[0]\n"
+    ".inst 0x4f82e91d // sdot v29.4s, v8.16b, v2.4b[2]\n"
+    "add x22, x22, x28\n"
+    "add x21, x21, x28\n"
+    ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
+    ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+    "ldr q17, [%x[params], #0x0]\n"
+    "add x20, x20, x28\n"
+    ".inst 0x4f81e158 // sdot v24.4s, v10.16b, v1.4b[0]\n"
+    ".inst 0x4f81e959 // sdot v25.4s, v10.16b, v1.4b[2]\n"
+    ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+    ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+    ".inst 0x4fa2e13c // sdot v28.4s, v9.16b, v2.4b[1]\n"
+    ".inst 0x4fa2e93d // sdot v29.4s, v9.16b, v2.4b[3]\n"
+    ".inst 0x4fa3e13e // sdot v30.4s, v9.16b, v3.4b[1]\n"
+    ".inst 0x4fa3e93f // sdot v31.4s, v9.16b, v3.4b[3]\n"
+    "ldr q16, [%x[params], #0x10]\n"
+    ".inst 0x4fa1e178 // sdot v24.4s, v11.16b, v1.4b[1]\n"
+    ".inst 0x4fa1e979 // sdot v25.4s, v11.16b, v1.4b[3]\n"
+    ".inst 0x4fa2e17a // sdot v26.4s, v11.16b, v2.4b[1]\n"
+    ".inst 0x4fa2e97b // sdot v27.4s, v11.16b, v2.4b[3]\n"
+    ".inst 0x4f83e15c // sdot v28.4s, v10.16b, v3.4b[0]\n"
+    ".inst 0x4f83e95d // sdot v29.4s, v10.16b, v3.4b[2]\n"
+    ".inst 0x4f84e15e // sdot v30.4s, v10.16b, v4.4b[0]\n"
+    ".inst 0x4f84e95f // sdot v31.4s, v10.16b, v4.4b[2]\n"
+    "ldr q19, [%x[params], #0x20]\n"
+    ".inst 0x4f82e238 // sdot v24.4s, v17.16b, v2.4b[0]\n"
+    ".inst 0x4f82ea39 // sdot v25.4s, v17.16b, v2.4b[2]\n"
+    ".inst 0x4f83e23a // sdot v26.4s, v17.16b, v3.4b[0]\n"
+    ".inst 0x4f83ea3b // sdot v27.4s, v17.16b, v3.4b[2]\n"
+    ".inst 0x4fa3e17c // sdot v28.4s, v11.16b, v3.4b[1]\n"
+    ".inst 0x4fa3e97d // sdot v29.4s, v11.16b, v3.4b[3]\n"
+    ".inst 0x4fa4e17e // sdot v30.4s, v11.16b, v4.4b[1]\n"
+    ".inst 0x4fa4e97f // sdot v31.4s, v11.16b, v4.4b[3]\n"
+    "ldr q18, [%x[params], #0x30]\n"
+    ".inst 0x4fa2e218 // sdot v24.4s, v16.16b, v2.4b[1]\n"
+    ".inst 0x4fa2ea19 // sdot v25.4s, v16.16b, v2.4b[3]\n"
+    ".inst 0x4fa3e21a // sdot v26.4s, v16.16b, v3.4b[1]\n"
+    ".inst 0x4fa3ea1b // sdot v27.4s, v16.16b, v3.4b[3]\n"
+    ".inst 0x4f84e23c // sdot v28.4s, v17.16b, v4.4b[0]\n"
+    ".inst 0x4f84ea3d // sdot v29.4s, v17.16b, v4.4b[2]\n"
+    ".inst 0x4f85e23e // sdot v30.4s, v17.16b, v5.4b[0]\n"
+    ".inst 0x4f85ea3f // sdot v31.4s, v17.16b, v5.4b[2]\n"
+    "ldr q17, [%x[params], #0x40]\n"
+    ".inst 0x4f83e278 // sdot v24.4s, v19.16b, v3.4b[0]\n"
+    ".inst 0x4f83ea79 // sdot v25.4s, v19.16b, v3.4b[2]\n"
+    ".inst 0x4f84e27a // sdot v26.4s, v19.16b, v4.4b[0]\n"
+    ".inst 0x4f84ea7b // sdot v27.4s, v19.16b, v4.4b[2]\n"
+    ".inst 0x4fa4e21c // sdot v28.4s, v16.16b, v4.4b[1]\n"
+    ".inst 0x4fa4ea1d // sdot v29.4s, v16.16b, v4.4b[3]\n"
+    ".inst 0x4fa5e21e // sdot v30.4s, v16.16b, v5.4b[1]\n"
+    ".inst 0x4fa5ea1f // sdot v31.4s, v16.16b, v5.4b[3]\n"
+    "ldr q16, [%x[params], #0x50]\n"
+    "add %x[params], %x[params], #0x80\n"
+    ".inst 0x4fa3e258 // sdot v24.4s, v18.16b, v3.4b[1]\n"
+    ".inst 0x4fa3ea59 // sdot v25.4s, v18.16b, v3.4b[3]\n"
+    ".inst 0x4fa4e25a // sdot v26.4s, v18.16b, v4.4b[1]\n"
+    ".inst 0x4fa4ea5b // sdot v27.4s, v18.16b, v4.4b[3]\n"
+    ".inst 0x4f85e27c // sdot v28.4s, v19.16b, v5.4b[0]\n"
+    ".inst 0x4f85ea7d // sdot v29.4s, v19.16b, v5.4b[2]\n"
+    ".inst 0x4f86e27e // sdot v30.4s, v19.16b, v6.4b[0]\n"
+    ".inst 0x4f86ea7f // sdot v31.4s, v19.16b, v6.4b[2]\n"
+    ".inst 0x4f84e238 // sdot v24.4s, v17.16b, v4.4b[0]\n"
+    ".inst 0x4f84ea39 // sdot v25.4s, v17.16b, v4.4b[2]\n"
+    ".inst 0x4f85e23a // sdot v26.4s, v17.16b, v5.4b[0]\n"
+    ".inst 0x4f85ea3b // sdot v27.4s, v17.16b, v5.4b[2]\n"
+    ".inst 0x4fa5e25c // sdot v28.4s, v18.16b, v5.4b[1]\n"
+    ".inst 0x4fa5ea5d // sdot v29.4s, v18.16b, v5.4b[3]\n"
+    ".inst 0x4fa6e25e // sdot v30.4s, v18.16b, v6.4b[1]\n"
+    ".inst 0x4fa6ea5f // sdot v31.4s, v18.16b, v6.4b[3]\n"
+    ".inst 0x4fa4e218 // sdot v24.4s, v16.16b, v4.4b[1]\n"
+    ".inst 0x4fa4ea19 // sdot v25.4s, v16.16b, v4.4b[3]\n"
+    "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+    ".inst 0x4fa5e21a // sdot v26.4s, v16.16b, v5.4b[1]\n"
+    ".inst 0x4fa5ea1b // sdot v27.4s, v16.16b, v5.4b[3]\n"
+    "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+    ".inst 0x4f86e23c // sdot v28.4s, v17.16b, v6.4b[0]\n"
+    ".inst 0x4f86ea3d // sdot v29.4s, v17.16b, v6.4b[2]\n"
+    "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+    ".inst 0x4f87e23e // sdot v30.4s, v17.16b, v7.4b[0]\n"
+    ".inst 0x4f87ea3f // sdot v31.4s, v17.16b, v7.4b[2]\n"
+    "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+    ".inst 0x4fa6e21c // sdot v28.4s, v16.16b, v6.4b[1]\n"
+    ".inst 0x4fa6ea1d // sdot v29.4s, v16.16b, v6.4b[3]\n"
+    "and v19.16b, v24.16b, v20.16b\n"
+    ".inst 0x4fa7e21e // sdot v30.4s, v16.16b, v7.4b[1]\n"
+    ".inst 0x4fa7ea1f // sdot v31.4s, v16.16b, v7.4b[3]\n"
+    "and v18.16b, v25.16b, v20.16b\n"
+    "and v17.16b, v26.16b, v20.16b\n"
+    "and v16.16b, v27.16b, v20.16b\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+    "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+    "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+    "sqadd v24.4s, v24.4s, v19.4s\n"
+    "sqadd v25.4s, v25.4s, v18.4s\n"
+    "sqadd v26.4s, v26.4s, v17.4s\n"
+    "sqadd v27.4s, v27.4s, v16.4s\n"
+    "and v19.16b, v28.16b, v20.16b\n"
+    "and v18.16b, v29.16b, v20.16b\n"
+    "and v17.16b, v30.16b, v20.16b\n"
+    "and v16.16b, v31.16b, v20.16b\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v28.4s, v28.4s, v19.4s\n"
+    "sqadd v29.4s, v29.4s, v18.4s\n"
+    "sqadd v30.4s, v30.4s, v17.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "srshl v24.4s, v24.4s, v20.4s\n"
+    "srshl v25.4s, v25.4s, v20.4s\n"
+    "srshl v26.4s, v26.4s, v20.4s\n"
+    "srshl v27.4s, v27.4s, v20.4s\n"
+    "srshl v28.4s, v28.4s, v20.4s\n"
+    "srshl v29.4s, v29.4s, v20.4s\n"
+    "srshl v30.4s, v30.4s, v20.4s\n"
+    "srshl v31.4s, v31.4s, v20.4s\n"
+    "add v24.4s, v24.4s, v13.4s\n"
+    "add v25.4s, v25.4s, v13.4s\n"
+    "add v26.4s, v26.4s, v13.4s\n"
+    "add v27.4s, v27.4s, v13.4s\n"
+    "add v28.4s, v28.4s, v13.4s\n"
+    "add v29.4s, v29.4s, v13.4s\n"
+    "add v30.4s, v30.4s, v13.4s\n"
+    "add v31.4s, v31.4s, v13.4s\n"
+    "smin v24.4s, v24.4s, v15.4s\n"
+    "smin v25.4s, v25.4s, v15.4s\n"
+    "smin v26.4s, v26.4s, v15.4s\n"
+    "smin v27.4s, v27.4s, v15.4s\n"
+    "smin v28.4s, v28.4s, v15.4s\n"
+    "smin v29.4s, v29.4s, v15.4s\n"
+    "smin v30.4s, v30.4s, v15.4s\n"
+    "smin v31.4s, v31.4s, v15.4s\n"
+    "smax v24.4s, v24.4s, v14.4s\n"
+    "smax v25.4s, v25.4s, v14.4s\n"
+    "smax v26.4s, v26.4s, v14.4s\n"
+    "smax v27.4s, v27.4s, v14.4s\n"
+    "smax v28.4s, v28.4s, v14.4s\n"
+    "smax v29.4s, v29.4s, v14.4s\n"
+    "smax v30.4s, v30.4s, v14.4s\n"
+    "smax v31.4s, v31.4s, v14.4s\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "blt 3f\n"
+    "str s24, [x27, #0x0]\n"
+    "str s25, [x26, #0x0]\n"
+    "str s26, [x25, #0x0]\n"
+    "str s27, [x24, #0x0]\n"
+    "str s28, [x23, #0x0]\n"
+    "str s29, [x22, #0x0]\n"
+    "str s30, [x21, #0x0]\n"
+    "str s31, [x20, #0x0]\n"
+    "b 4f\n"
+    // Oddments: fewer than four channels remain — store the surviving
+    // result bytes one lane at a time to each output pointer.
+    "3:" // Tail: Oddments
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v24.b }[0], [x27], #0x1\n"
+    "st1 { v25.b }[0], [x26], #0x1\n"
+    "st1 { v26.b }[0], [x25], #0x1\n"
+    "st1 { v27.b }[0], [x24], #0x1\n"
+    "st1 { v28.b }[0], [x23], #0x1\n"
+    "st1 { v29.b }[0], [x22], #0x1\n"
+    "st1 { v30.b }[0], [x21], #0x1\n"
+    "st1 { v31.b }[0], [x20], #0x1\n"
+    "beq 4f\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v24.b }[1], [x27], #0x1\n"
+    "st1 { v25.b }[1], [x26], #0x1\n"
+    "st1 { v26.b }[1], [x25], #0x1\n"
+    "st1 { v27.b }[1], [x24], #0x1\n"
+    "st1 { v28.b }[1], [x23], #0x1\n"
+    "st1 { v29.b }[1], [x22], #0x1\n"
+    "st1 { v30.b }[1], [x21], #0x1\n"
+    "st1 { v31.b }[1], [x20], #0x1\n"
+    "beq 4f\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v24.b }[2], [x27], #0x1\n"
+    "st1 { v25.b }[2], [x26], #0x1\n"
+    "st1 { v26.b }[2], [x25], #0x1\n"
+    "st1 { v27.b }[2], [x24], #0x1\n"
+    "st1 { v28.b }[2], [x23], #0x1\n"
+    "st1 { v29.b }[2], [x22], #0x1\n"
+    "st1 { v30.b }[2], [x21], #0x1\n"
+    "st1 { v31.b }[2], [x20], #0x1\n"
+    "beq 4f\n"
+    "st1 { v24.b }[3], [x27], #0x1\n"
+    "subs %x[n_channels], %x[n_channels], #0x1\n"
+    "st1 { v25.b }[3], [x26], #0x1\n"
+    "st1 { v26.b }[3], [x25], #0x1\n"
+    "st1 { v27.b }[3], [x24], #0x1\n"
+    "st1 { v28.b }[3], [x23], #0x1\n"
+    "st1 { v29.b }[3], [x22], #0x1\n"
+    "st1 { v30.b }[3], [x21], #0x1\n"
+    "st1 { v31.b }[3], [x20], #0x1\n"
+    "4:" // Tail: End
+    : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..3f71c5fb64
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Raw kernel entry point (defined in the accompanying generic.cpp).
+// Generic (runtime kernel-point count) int8 quantised depthwise kernel with
+// channel multiplier; takes input/output pointer arrays, weights, bias,
+// kernel-point count, output-channel count, per-channel left shifts /
+// multipliers / right shifts, and the requantisation parameters.
+void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+// Strategy descriptor: MLA-based fallback for arbitrary kernel sizes,
+// producing a 2-row x 8-column output tile per invocation.
+struct a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+  using Parent = GenericDepthfirstMultiplierKernelStrategy<int8_t, int8_t, int8_t, int32_t>;
+  // CPUInfo is accepted for interface uniformity but unused; 2x8 tile,
+  // fixed-width NEON (no SVE vector-length dependence).
+  a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+  : Parent(2, 8, arm_gemm::VLType::None)
+  {
+  }
+  Parent::KernelType kernel = a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..b21ad484e5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1480 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const int8_t *weights,
+ const int32_t *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const int32_t *per_channel_left_shifts,
+ const int32_t *per_channel_muls,
+ const int32_t *per_channel_right_shifts,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "lsr x10, %x[n_output_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "mov x9, #0x0\n"
+ "cbz x10, 9f\n"
+ "1:" // Output channel loop
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 3f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q9, [%x[rq_mul_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 3f\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
+ "3:" // Output channel loop: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "beq 5f\n"
+ "4:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 4b\n"
+ "5:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 6f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "8:" // Output channel loop: Done
+ "add x9, x9, #0x4\n"
+ "cmp x9, x10, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 26f\n"
+ "9:" // Output channel oddments
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 12f\n"
+ "add x20, %x[bias], x9, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 10f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: Unset
+ "ld1 { v31.s }[0], [x20]\n"
+ "11:" // Output channel oddments: Load bias: Bit 1: End
+ "12:" // Output channel oddments: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 18f\n"
+ "add x22, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "cbz %x[rq_left_shift_ptr], 15f\n"
+ "tbz %x[n_output_channels], #1, 13f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 14f\n"
+ "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+ "b 18f\n"
+ "15:" // Output channel oddments: Load quantization parameters: No left shift
+ "tbz %x[n_output_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "b 17f\n"
+ "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+ "18:" // Output channel oddments: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "beq 20f\n"
+ "19:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 19b\n"
+ "20:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 21f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "b 23f\n"
+ "21:" // Output channel oddments: Odd tail
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
+ "b 23f\n"
+ "22:" // Output channel oddments: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "23:" // Output channel oddments: Done
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_output_channels], #1, 24f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "add x9, x9, #0x2\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
+ "b 25f\n"
+ "24:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
+ "25:" // Output channel oddments: Done: Store: Bit 1: End
+ "26:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..3190cbfbf0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int , const int8_t *const *const , const int8_t *, const int32_t *, const arm_gemm::Requantize32& , const int32_t *, const int32_t *, int8_t *const *const );
+
+// Strategy wrapper for the s8 (symmetric-quantized) NHWC 3x3/stride-1 depthwise kernel
+// producing a 2x2 output tile per iteration via dot-product (SDOT) arithmetic.
+class a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>  // in/weight/out = int8_t, accumulator = int32_t
+{
+  using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}  // args presumably (out_rows, out_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) -- confirm against Parent's ctor
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // plain NEON: fixed-length vectors, no SVE
+
+  Parent::KernelType kernel = a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;  // free function declared above in this header
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  size_t get_storage_size(const DepthwiseArgs &args) const override  // bytes needed for the packed weight/bias buffer
+  {
+    return interleave_a64_s8q_3x3_dot::get_packed_size(args);  // sizing must match the pack_parameters() layout below
+  }
+
+  // Interleaves weights and biases into the dot-product-friendly layout consumed by the asm kernel.
+  void pack_parameters(
+    const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+    const void *weights, size_t ld_weight_col, size_t ld_weight_row
+  ) const override
+  {
+    interleave_a64_s8q_3x3_dot::pack_parameters(
+      args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),  // total output channels
+      reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+    );
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..aad34c4c25
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1484 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
+{
+ __asm__ __volatile__(
+ "lsr x15, %x[n_channels], #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "ldp x14, x13, [%x[inptrs], #0x0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov x12, #0x0\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
+ "cbz x15, 3f\n"
+ "ldr q11, [x14, x12]\n"
+ "ldr q20, [x13, x12]\n"
+ "subs x15, x15, #0x1\n"
+ "ldr q16, [x10, x12]\n"
+ "ldr q14, [x9, x12]\n"
+ "zip2 v19.16b, v11.16b, v16.16b\n"
+ "zip1 v11.16b, v11.16b, v16.16b\n"
+ "ldr q13, [x28, x12]\n"
+ "ldr q18, [x27, x12]\n"
+ "zip1 v17.16b, v20.16b, v14.16b\n"
+ "zip2 v14.16b, v20.16b, v14.16b\n"
+ "ldr q16, [x26, x12]\n"
+ "ldr q27, [x21, x12]\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "ldr q24, [%x[params], #0x10]\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "zip1 v3.16b, v19.16b, v14.16b\n"
+ "zip2 v14.16b, v19.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "zip2 v30.16b, v13.16b, v16.16b\n"
+ "zip1 v13.16b, v13.16b, v16.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q5, [x21, x12]\n"
+ "zip1 v16.16b, v18.16b, v27.16b\n"
+ "zip2 v27.16b, v18.16b, v27.16b\n"
+ "ldr q17, [x20, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v28.16b, v13.16b, v16.16b\n"
+ "zip1 v13.16b, v13.16b, v16.16b\n"
+ "ldr q16, [x21, x12]\n"
+ "ldr q7, [x20, x12]\n"
+ "zip2 v20.16b, v5.16b, v16.16b\n"
+ "zip1 v5.16b, v5.16b, v16.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q16, [x21, x12]\n"
+ "zip1 v22.16b, v17.16b, v7.16b\n"
+ "zip2 v7.16b, v17.16b, v7.16b\n"
+ "ldr q19, [x20, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v21.16b, v30.16b, v27.16b\n"
+ "zip2 v27.16b, v30.16b, v27.16b\n"
+ "ldr q30, [x21, x12]\n"
+ "ldr q1, [x20, x12]\n"
+ "zip2 v17.16b, v16.16b, v30.16b\n"
+ "zip1 v16.16b, v16.16b, v30.16b\n"
+ "zip1 v18.16b, v19.16b, v1.16b\n"
+ "zip2 v1.16b, v19.16b, v1.16b\n"
+ "ldp x14, x13, [%x[inptrs], #0x0]\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v29.16b, v5.16b, v22.16b\n"
+ "zip1 v5.16b, v5.16b, v22.16b\n"
+ "zip1 v0.16b, v20.16b, v7.16b\n"
+ "zip2 v7.16b, v20.16b, v7.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip2 v30.16b, v16.16b, v18.16b\n"
+ "zip1 v16.16b, v16.16b, v18.16b\n"
+ "zip1 v2.16b, v17.16b, v1.16b\n"
+ "zip2 v1.16b, v17.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v4.16b, v31.16b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n"
+ ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
+ "add x12, x12, #0x10\n"
+ ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n"
+ ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n"
+ ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n"
+ ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "ldr q5, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "ldr q13, [%x[params], #0x40]\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x50]\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "ldr q22, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s31, [x25, x11]\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s18, [x23, x11]\n"
+ "mov v26.16b, v24.16b\n"
+ "str s4, [x22, x11]\n"
+ "mov v25.16b, v24.16b\n"
+ "mov v23.16b, v24.16b\n"
+ ".inst 0x4e8a9618 // sdot v24.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9619 // sdot v25.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c95b8 // sdot v24.4s, v13.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a961a // sdot v26.4s, v16.16b, v10.16b\n"
+ "ldr q10, [x13, x12]\n"
+ ".inst 0x4e9c9617 // sdot v23.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9d95b9 // sdot v25.4s, v13.16b, v29.16b\n"
+ ".inst 0x4e9d9638 // sdot v24.4s, v17.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c95ba // sdot v26.4s, v13.16b, v28.16b\n"
+ "ldr q20, [x27, x12]\n"
+ ".inst 0x4e9d95b7 // sdot v23.4s, v13.16b, v29.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v5.4s\n"
+ ".inst 0x4e9e9639 // sdot v25.4s, v17.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e9d963a // sdot v26.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9e9637 // sdot v23.4s, v17.16b, v30.16b\n"
+ "and v16.16b, v24.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v5.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v5.4s\n"
+ "ldr q19, [%x[params], #0xc0]\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "and v18.16b, v26.16b, v22.16b\n"
+ "and v17.16b, v25.16b, v22.16b\n"
+ "and v16.16b, v23.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqadd v26.4s, v26.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0xa0]\n"
+ "sqadd v25.4s, v25.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0xb0]\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x90]\n"
+ "add v24.4s, v24.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "srshl v23.4s, v23.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0xd0]\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x11]\n"
+ "ldr q24, [%x[params], #0x80]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s25, [x23, x11]\n"
+ "str s23, [x22, x11]\n"
+ "mov v23.16b, v24.16b\n"
+ "mov v31.16b, v24.16b\n"
+ ".inst 0x4e95961f // sdot v31.4s, v16.16b, v21.16b\n"
+ "mov v13.16b, v24.16b\n"
+ ".inst 0x4e839618 // sdot v24.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e959658 // sdot v24.4s, v18.16b, v21.16b\n"
+ "add x11, x11, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e839617 // sdot v23.4s, v16.16b, v3.16b\n"
+ "ldr q3, [x10, x12]\n"
+ ".inst 0x4e95960d // sdot v13.4s, v16.16b, v21.16b\n"
+ ".inst 0x4e80965f // sdot v31.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e809638 // sdot v24.4s, v17.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e959657 // sdot v23.4s, v18.16b, v21.16b\n"
+ "ldr q4, [x26, x12]\n"
+ ".inst 0x4e80964d // sdot v13.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e82963f // sdot v31.4s, v17.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v24.4s, v24.4s, v19.4s\n"
+ ".inst 0x4e809637 // sdot v23.4s, v17.16b, v0.16b\n"
+ ".inst 0x4e82962d // sdot v13.4s, v17.16b, v2.16b\n"
+ "and v16.16b, v24.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v23.4s, v23.4s, v19.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v19.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v19.4s\n"
+ "ldr q19, [%x[params], #0x120]\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "and v18.16b, v23.16b, v22.16b\n"
+ "and v17.16b, v31.16b, v22.16b\n"
+ "and v16.16b, v13.16b, v22.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x100]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x110]\n"
+ "sqadd v13.4s, v13.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "add v24.4s, v24.4s, v15.4s\n"
+ "srshl v23.4s, v23.4s, v22.4s\n"
+ "srshl v31.4s, v31.4s, v22.4s\n"
+ "srshl v13.4s, v13.4s, v22.4s\n"
+ "ldr q22, [%x[params], #0x130]\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "add v23.4s, v23.4s, v15.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v13.4s, v13.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v12.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v13.4s, v13.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v12.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v13.4s, v13.4s, v12.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str s24, [x25, x11]\n"
+ "ldr q2, [%x[params], #0xe0]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s23, [x24, x11]\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str s31, [x23, x11]\n"
+ "mov v25.16b, v2.16b\n"
+ "str s13, [x22, x11]\n"
+ "mov v21.16b, v2.16b\n"
+ "mov v30.16b, v2.16b\n"
+ ".inst 0x4e8e9602 // sdot v2.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9615 // sdot v21.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b9642 // sdot v2.4s, v18.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e9619 // sdot v25.4s, v16.16b, v14.16b\n"
+ "ldr q14, [x9, x12]\n"
+ ".inst 0x4e9b961e // sdot v30.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e879655 // sdot v21.4s, v18.16b, v7.16b\n"
+ ".inst 0x4e879622 // sdot v2.4s, v17.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b9659 // sdot v25.4s, v18.16b, v27.16b\n"
+ "ldr q27, [x21, x12]\n"
+ ".inst 0x4e87965e // sdot v30.4s, v18.16b, v7.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v19.4s\n"
+ ".inst 0x4e819635 // sdot v21.4s, v17.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e879639 // sdot v25.4s, v17.16b, v7.16b\n"
+ ".inst 0x4e81963e // sdot v30.4s, v17.16b, v1.16b\n"
+ "and v16.16b, v2.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v25.4s, v25.4s, v19.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v19.4s\n"
+ "ldr q11, [x14, x12]\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q5, [x21, x12]\n"
+ "ldr q29, [x20, x12]\n"
+ "sqadd v2.4s, v2.4s, v16.4s\n"
+ "and v19.16b, v25.16b, v22.16b\n"
+ "and v17.16b, v21.16b, v22.16b\n"
+ "and v16.16b, v30.16b, v22.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q26, [x21, x12]\n"
+ "ldr q7, [x20, x12]\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v2.4s, v2.4s, v22.4s\n"
+ "sqadd v25.4s, v25.4s, v19.4s\n"
+ "ldr q9, [%x[params], #0x160]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x170]\n"
+ "sqadd v30.4s, v30.4s, v16.4s\n"
+ "ldr q24, [%x[params], #0x150]\n"
+ "add v2.4s, v2.4s, v15.4s\n"
+ "srshl v25.4s, v25.4s, v22.4s\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "ldr q13, [x28, x12]\n"
+ "smax v2.4s, v2.4s, v8.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q16, [x21, x12]\n"
+ "ldr q28, [x20, x12]\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "add v30.4s, v30.4s, v15.4s\n"
+ "smin v2.4s, v2.4s, v12.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x12]\n"
+ "ldr q1, [x20, x12]\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v21.4s, v21.4s, v8.4s\n"
+ "ldp x14, x13, [%x[inptrs], #0x0]\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "ldp x10, x9, [%x[inptrs], #0x10]\n"
+ "ldp x28, x27, [%x[inptrs], #0x20]\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "smin v30.4s, v30.4s, v12.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str s2, [x25, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "zip2 v18.16b, v11.16b, v3.16b\n"
+ "zip1 v11.16b, v11.16b, v3.16b\n"
+ "zip1 v17.16b, v10.16b, v14.16b\n"
+ "zip2 v14.16b, v10.16b, v14.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x24, x11]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s21, [x23, x11]\n"
+ "str s30, [x22, x11]\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "add x11, x11, #0x4\n"
+ "zip1 v3.16b, v18.16b, v14.16b\n"
+ "zip2 v14.16b, v18.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "add %x[params], %x[params], #0x180\n"
+ "zip2 v22.16b, v13.16b, v4.16b\n"
+ "zip1 v13.16b, v13.16b, v4.16b\n"
+ "zip1 v2.16b, v20.16b, v27.16b\n"
+ "zip2 v27.16b, v20.16b, v27.16b\n"
+ "zip2 v19.16b, v5.16b, v26.16b\n"
+ "zip1 v5.16b, v5.16b, v26.16b\n"
+ "zip1 v18.16b, v29.16b, v7.16b\n"
+ "zip2 v7.16b, v29.16b, v7.16b\n"
+ "zip2 v4.16b, v16.16b, v23.16b\n"
+ "zip1 v16.16b, v16.16b, v23.16b\n"
+ "zip1 v17.16b, v28.16b, v1.16b\n"
+ "zip2 v1.16b, v28.16b, v1.16b\n"
+ "zip2 v28.16b, v13.16b, v2.16b\n"
+ "zip1 v13.16b, v13.16b, v2.16b\n"
+ "zip1 v21.16b, v22.16b, v27.16b\n"
+ "zip2 v27.16b, v22.16b, v27.16b\n"
+ "zip2 v29.16b, v5.16b, v18.16b\n"
+ "zip1 v5.16b, v5.16b, v18.16b\n"
+ "zip1 v0.16b, v19.16b, v7.16b\n"
+ "zip2 v7.16b, v19.16b, v7.16b\n"
+ "zip2 v30.16b, v16.16b, v17.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ "zip1 v2.16b, v4.16b, v1.16b\n"
+ "zip2 v1.16b, v4.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v4.16b, v31.16b\n"
+ "bgt 1b\n"
+ "2:" // Detached iteration
+ ".inst 0x4e8b971f // sdot v31.4s, v24.16b, v11.16b\n"
+ ".inst 0x4e8d9712 // sdot v18.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
+ "tst %x[n_channels], #0xf\n"
+ ".inst 0x4e8d953f // sdot v31.4s, v9.16b, v13.16b\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b971a // sdot v26.4s, v24.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x4e8d9704 // sdot v4.4s, v24.16b, v13.16b\n"
+ ".inst 0x4e859532 // sdot v18.4s, v9.16b, v5.16b\n"
+ "add x12, x12, #0x10\n"
+ ".inst 0x4e8594df // sdot v31.4s, v6.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d953a // sdot v26.4s, v9.16b, v13.16b\n"
+ "ldr q19, [%x[params], #0x10]\n"
+ ".inst 0x4e859524 // sdot v4.4s, v9.16b, v5.16b\n"
+ ".inst 0x4e9094d2 // sdot v18.4s, v6.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8594da // sdot v26.4s, v6.16b, v5.16b\n"
+ ".inst 0x4e9094c4 // sdot v4.4s, v6.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v19.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v20.16b, v26.16b, v19.16b\n"
+ "and v17.16b, v18.16b, v19.16b\n"
+ "and v16.16b, v4.16b, v19.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v19.4s\n"
+ "sqadd v26.4s, v26.4s, v20.4s\n"
+ "ldr q5, [%x[params], #0x40]\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x50]\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "srshl v26.4s, v26.4s, v19.4s\n"
+ "srshl v18.4s, v18.4s, v19.4s\n"
+ "srshl v4.4s, v4.4s, v19.4s\n"
+ "ldr q23, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s31, [x25, x11]\n"
+ "ldr q25, [%x[params], #0x20]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s26, [x24, x11]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s18, [x23, x11]\n"
+ "mov v22.16b, v25.16b\n"
+ "str s4, [x22, x11]\n"
+ "mov v20.16b, v25.16b\n"
+ "mov v19.16b, v25.16b\n"
+ ".inst 0x4e8a9619 // sdot v25.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9614 // sdot v20.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9c94b9 // sdot v25.4s, v5.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a9616 // sdot v22.4s, v16.16b, v10.16b\n"
+ ".inst 0x4e9c9613 // sdot v19.4s, v16.16b, v28.16b\n"
+ ".inst 0x4e9d94b4 // sdot v20.4s, v5.16b, v29.16b\n"
+ ".inst 0x4e9d9639 // sdot v25.4s, v17.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c94b6 // sdot v22.4s, v5.16b, v28.16b\n"
+ ".inst 0x4e9d94b3 // sdot v19.4s, v5.16b, v29.16b\n"
+ "sqrdmulh v25.4s, v25.4s, v24.4s\n"
+ ".inst 0x4e9e9634 // sdot v20.4s, v17.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ ".inst 0x4e9d9636 // sdot v22.4s, v17.16b, v29.16b\n"
+ ".inst 0x4e9e9633 // sdot v19.4s, v17.16b, v30.16b\n"
+ "and v16.16b, v25.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "ldr q24, [%x[params], #0xc0]\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v18.16b, v22.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "sqadd v22.4s, v22.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0xa0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0xb0]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0x90]\n"
+ "add v25.4s, v25.4s, v15.4s\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0xd0]\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "add v22.4s, v22.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v12.4s\n"
+ "smax v22.4s, v22.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s25, [x25, x11]\n"
+ "ldr q10, [%x[params], #0x80]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s22, [x24, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s20, [x23, x11]\n"
+ "str s19, [x22, x11]\n"
+ "mov v28.16b, v10.16b\n"
+ "mov v20.16b, v10.16b\n"
+ ".inst 0x4e959614 // sdot v20.4s, v16.16b, v21.16b\n"
+ "mov v19.16b, v10.16b\n"
+ ".inst 0x4e83960a // sdot v10.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e95964a // sdot v10.4s, v18.16b, v21.16b\n"
+ "add x11, x11, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e83961c // sdot v28.4s, v16.16b, v3.16b\n"
+ ".inst 0x4e959613 // sdot v19.4s, v16.16b, v21.16b\n"
+ ".inst 0x4e809654 // sdot v20.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e80962a // sdot v10.4s, v17.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e95965c // sdot v28.4s, v18.16b, v21.16b\n"
+ ".inst 0x4e809653 // sdot v19.4s, v18.16b, v0.16b\n"
+ ".inst 0x4e829634 // sdot v20.4s, v17.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v10.4s, v10.4s, v24.4s\n"
+ ".inst 0x4e80963c // sdot v28.4s, v17.16b, v0.16b\n"
+ ".inst 0x4e829633 // sdot v19.4s, v17.16b, v2.16b\n"
+ "and v16.16b, v10.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "ldr q24, [%x[params], #0x120]\n"
+ "sqadd v10.4s, v10.4s, v16.4s\n"
+ "and v18.16b, v28.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v10.4s, v10.4s, v23.4s\n"
+ "sqadd v28.4s, v28.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x100]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q17, [%x[params], #0x110]\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "ldr q16, [%x[params], #0xf0]\n"
+ "add v10.4s, v10.4s, v15.4s\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v10.4s, v10.4s, v8.4s\n"
+ "add v28.4s, v28.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smin v10.4s, v10.4s, v12.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v28.4s, v28.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s10, [x25, x11]\n"
+ "ldr q22, [%x[params], #0xe0]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s28, [x24, x11]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s20, [x23, x11]\n"
+ "mov v21.16b, v22.16b\n"
+ "str s19, [x22, x11]\n"
+ "mov v20.16b, v22.16b\n"
+ "mov v19.16b, v22.16b\n"
+ ".inst 0x4e8e9616 // sdot v22.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9614 // sdot v20.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e9b9656 // sdot v22.4s, v18.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "add x11, x11, #0x4\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e9615 // sdot v21.4s, v16.16b, v14.16b\n"
+ ".inst 0x4e9b9613 // sdot v19.4s, v16.16b, v27.16b\n"
+ ".inst 0x4e879654 // sdot v20.4s, v18.16b, v7.16b\n"
+ ".inst 0x4e879636 // sdot v22.4s, v17.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b9655 // sdot v21.4s, v18.16b, v27.16b\n"
+ ".inst 0x4e879653 // sdot v19.4s, v18.16b, v7.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ ".inst 0x4e819634 // sdot v20.4s, v17.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ ".inst 0x4e879635 // sdot v21.4s, v17.16b, v7.16b\n"
+ ".inst 0x4e819633 // sdot v19.4s, v17.16b, v1.16b\n"
+ "and v16.16b, v22.16b, v23.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v24.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v24.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v24.4s\n"
+ "sqadd v22.4s, v22.4s, v16.4s\n"
+ "and v18.16b, v21.16b, v23.16b\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "and v16.16b, v19.16b, v23.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sqadd v19.4s, v19.4s, v16.4s\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "srshl v19.4s, v19.4s, v23.4s\n"
+ "add v22.4s, v22.4s, v15.4s\n"
+ "add v21.4s, v21.4s, v15.4s\n"
+ "add v20.4s, v20.4s, v15.4s\n"
+ "add v19.4s, v19.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v8.4s\n"
+ "smax v21.4s, v21.4s, v8.4s\n"
+ "smax v20.4s, v20.4s, v8.4s\n"
+ "smax v19.4s, v19.4s, v8.4s\n"
+ "smin v22.4s, v22.4s, v12.4s\n"
+ "smin v21.4s, v21.4s, v12.4s\n"
+ "smin v20.4s, v20.4s, v12.4s\n"
+ "smin v19.4s, v19.4s, v12.4s\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x25, x11]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str s21, [x24, x11]\n"
+ "str s20, [x23, x11]\n"
+ "str s19, [x22, x11]\n"
+ "add x11, x11, #0x4\n"
+ "beq 35f\n"
+ "3:" // Oddments
+ "and x20, %x[n_channels], #0xf\n"
+ "add x14, x14, x12\n"
+ "add x13, x13, x12\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
+ "add x27, x27, x12\n"
+ "add x26, x26, x12\n"
+ "add x21, x21, x12\n"
+ "tbz %x[n_channels], #3, 7f\n"
+ "ldr d11, [x14], #0x8\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d3, [x10], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d13, [x28], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d21, [x26], #0x8\n"
+ "ldr d27, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v11.s }[2], [x14], #0x4\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v3.s }[2], [x10], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v13.s }[2], [x28], #0x4\n"
+ "ld1 { v28.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v11.h }[6], [x14], #0x2\n"
+ "ld1 { v10.h }[6], [x13], #0x2\n"
+ "ld1 { v3.h }[6], [x10], #0x2\n"
+ "ld1 { v14.h }[6], [x9], #0x2\n"
+ "ld1 { v13.h }[6], [x28], #0x2\n"
+ "ld1 { v28.h }[6], [x27], #0x2\n"
+ "ld1 { v21.h }[6], [x26], #0x2\n"
+ "ld1 { v27.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[14], [x14], #0x1\n"
+ "ld1 { v10.b }[14], [x13], #0x1\n"
+ "ld1 { v3.b }[14], [x10], #0x1\n"
+ "ld1 { v14.b }[14], [x9], #0x1\n"
+ "ld1 { v13.b }[14], [x28], #0x1\n"
+ "ld1 { v28.b }[14], [x27], #0x1\n"
+ "ld1 { v21.b }[14], [x26], #0x1\n"
+ "ld1 { v27.b }[14], [x21], #0x1\n"
+ "b 11f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[12], [x14], #0x1\n"
+ "ld1 { v10.b }[12], [x13], #0x1\n"
+ "ld1 { v3.b }[12], [x10], #0x1\n"
+ "ld1 { v14.b }[12], [x9], #0x1\n"
+ "ld1 { v13.b }[12], [x28], #0x1\n"
+ "ld1 { v28.b }[12], [x27], #0x1\n"
+ "ld1 { v21.b }[12], [x26], #0x1\n"
+ "ld1 { v27.b }[12], [x21], #0x1\n"
+ "b 11f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v11.h }[4], [x14], #0x2\n"
+ "ld1 { v10.h }[4], [x13], #0x2\n"
+ "ld1 { v3.h }[4], [x10], #0x2\n"
+ "ld1 { v14.h }[4], [x9], #0x2\n"
+ "ld1 { v13.h }[4], [x28], #0x2\n"
+ "ld1 { v28.h }[4], [x27], #0x2\n"
+ "ld1 { v21.h }[4], [x26], #0x2\n"
+ "ld1 { v27.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[10], [x14], #0x1\n"
+ "ld1 { v10.b }[10], [x13], #0x1\n"
+ "ld1 { v3.b }[10], [x10], #0x1\n"
+ "ld1 { v14.b }[10], [x9], #0x1\n"
+ "ld1 { v13.b }[10], [x28], #0x1\n"
+ "ld1 { v28.b }[10], [x27], #0x1\n"
+ "ld1 { v21.b }[10], [x26], #0x1\n"
+ "ld1 { v27.b }[10], [x21], #0x1\n"
+ "b 11f\n"
+ "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[8], [x14], #0x1\n"
+ "ld1 { v10.b }[8], [x13], #0x1\n"
+ "ld1 { v3.b }[8], [x10], #0x1\n"
+ "ld1 { v14.b }[8], [x9], #0x1\n"
+ "ld1 { v13.b }[8], [x28], #0x1\n"
+ "ld1 { v28.b }[8], [x27], #0x1\n"
+ "ld1 { v21.b }[8], [x26], #0x1\n"
+ "ld1 { v27.b }[8], [x21], #0x1\n"
+ "b 11f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 9f\n"
+ "ldr s11, [x14], #0x4\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s3, [x10], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s13, [x28], #0x4\n"
+ "ldr s28, [x27], #0x4\n"
+ "ldr s21, [x26], #0x4\n"
+ "ldr s27, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.h }[2], [x14], #0x2\n"
+ "ld1 { v10.h }[2], [x13], #0x2\n"
+ "ld1 { v3.h }[2], [x10], #0x2\n"
+ "ld1 { v14.h }[2], [x9], #0x2\n"
+ "ld1 { v13.h }[2], [x28], #0x2\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n"
+ "ld1 { v21.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[6], [x14], #0x1\n"
+ "ld1 { v10.b }[6], [x13], #0x1\n"
+ "ld1 { v3.b }[6], [x10], #0x1\n"
+ "ld1 { v14.b }[6], [x9], #0x1\n"
+ "ld1 { v13.b }[6], [x28], #0x1\n"
+ "ld1 { v28.b }[6], [x27], #0x1\n"
+ "ld1 { v21.b }[6], [x26], #0x1\n"
+ "ld1 { v27.b }[6], [x21], #0x1\n"
+ "b 11f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[4], [x14], #0x1\n"
+ "ld1 { v10.b }[4], [x13], #0x1\n"
+ "ld1 { v3.b }[4], [x10], #0x1\n"
+ "ld1 { v14.b }[4], [x9], #0x1\n"
+ "ld1 { v13.b }[4], [x28], #0x1\n"
+ "ld1 { v28.b }[4], [x27], #0x1\n"
+ "ld1 { v21.b }[4], [x26], #0x1\n"
+ "ld1 { v27.b }[4], [x21], #0x1\n"
+ "b 11f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h11, [x14], #0x2\n"
+ "ldr h10, [x13], #0x2\n"
+ "ldr h3, [x10], #0x2\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h13, [x28], #0x2\n"
+ "ldr h28, [x27], #0x2\n"
+ "ldr h21, [x26], #0x2\n"
+ "ldr h27, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v11.b }[2], [x14], #0x1\n"
+ "ld1 { v10.b }[2], [x13], #0x1\n"
+ "ld1 { v3.b }[2], [x10], #0x1\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v13.b }[2], [x28], #0x1\n"
+ "ld1 { v28.b }[2], [x27], #0x1\n"
+ "ld1 { v21.b }[2], [x26], #0x1\n"
+ "ld1 { v27.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b11, [x14], #0x1\n"
+ "ldr b10, [x13], #0x1\n"
+ "ldr b3, [x10], #0x1\n"
+ "ldr b14, [x9], #0x1\n"
+ "ldr b13, [x28], #0x1\n"
+ "ldr b28, [x27], #0x1\n"
+ "ldr b21, [x26], #0x1\n"
+ "ldr b27, [x21], #0x1\n"
+ "11:" // Oddments: Load (A): Bit 3: End
+ "ldp x14, x13, [%x[inptrs], #0x40]\n"
+ "ldp x10, x9, [%x[inptrs], #0x50]\n"
+ "add x14, x14, x12\n"
+ "add x13, x13, x12\n"
+ "ldp x28, x27, [%x[inptrs], #0x60]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
+ "add x10, x10, x12\n"
+ "add x9, x9, x12\n"
+ "add x28, x28, x12\n"
+ "add x27, x27, x12\n"
+ "add x26, x26, x12\n"
+ "add x21, x21, x12\n"
+ "tbz %x[n_channels], #3, 15f\n"
+ "ldr d5, [x14], #0x8\n"
+ "ldr d29, [x13], #0x8\n"
+ "ldr d0, [x10], #0x8\n"
+ "ldr d7, [x9], #0x8\n"
+ "ldr d16, [x28], #0x8\n"
+ "ldr d30, [x27], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d1, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v5.s }[2], [x14], #0x4\n"
+ "ld1 { v29.s }[2], [x13], #0x4\n"
+ "ld1 { v0.s }[2], [x10], #0x4\n"
+ "ld1 { v7.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x28], #0x4\n"
+ "ld1 { v30.s }[2], [x27], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v5.h }[6], [x14], #0x2\n"
+ "ld1 { v29.h }[6], [x13], #0x2\n"
+ "ld1 { v0.h }[6], [x10], #0x2\n"
+ "ld1 { v7.h }[6], [x9], #0x2\n"
+ "ld1 { v16.h }[6], [x28], #0x2\n"
+ "ld1 { v30.h }[6], [x27], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v1.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[14], [x14], #0x1\n"
+ "ld1 { v29.b }[14], [x13], #0x1\n"
+ "ld1 { v0.b }[14], [x10], #0x1\n"
+ "ld1 { v7.b }[14], [x9], #0x1\n"
+ "ld1 { v16.b }[14], [x28], #0x1\n"
+ "ld1 { v30.b }[14], [x27], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v1.b }[14], [x21], #0x1\n"
+ "b 19f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[12], [x14], #0x1\n"
+ "ld1 { v29.b }[12], [x13], #0x1\n"
+ "ld1 { v0.b }[12], [x10], #0x1\n"
+ "ld1 { v7.b }[12], [x9], #0x1\n"
+ "ld1 { v16.b }[12], [x28], #0x1\n"
+ "ld1 { v30.b }[12], [x27], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v1.b }[12], [x21], #0x1\n"
+ "b 19f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v5.h }[4], [x14], #0x2\n"
+ "ld1 { v29.h }[4], [x13], #0x2\n"
+ "ld1 { v0.h }[4], [x10], #0x2\n"
+ "ld1 { v7.h }[4], [x9], #0x2\n"
+ "ld1 { v16.h }[4], [x28], #0x2\n"
+ "ld1 { v30.h }[4], [x27], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v1.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[10], [x14], #0x1\n"
+ "ld1 { v29.b }[10], [x13], #0x1\n"
+ "ld1 { v0.b }[10], [x10], #0x1\n"
+ "ld1 { v7.b }[10], [x9], #0x1\n"
+ "ld1 { v16.b }[10], [x28], #0x1\n"
+ "ld1 { v30.b }[10], [x27], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v1.b }[10], [x21], #0x1\n"
+ "b 19f\n"
+ "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[8], [x14], #0x1\n"
+ "ld1 { v29.b }[8], [x13], #0x1\n"
+ "ld1 { v0.b }[8], [x10], #0x1\n"
+ "ld1 { v7.b }[8], [x9], #0x1\n"
+ "ld1 { v16.b }[8], [x28], #0x1\n"
+ "ld1 { v30.b }[8], [x27], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v1.b }[8], [x21], #0x1\n"
+ "b 19f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr s5, [x14], #0x4\n"
+ "ldr s29, [x13], #0x4\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s7, [x9], #0x4\n"
+ "ldr s16, [x28], #0x4\n"
+ "ldr s30, [x27], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s1, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v5.h }[2], [x14], #0x2\n"
+ "ld1 { v29.h }[2], [x13], #0x2\n"
+ "ld1 { v0.h }[2], [x10], #0x2\n"
+ "ld1 { v7.h }[2], [x9], #0x2\n"
+ "ld1 { v16.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[6], [x14], #0x1\n"
+ "ld1 { v29.b }[6], [x13], #0x1\n"
+ "ld1 { v0.b }[6], [x10], #0x1\n"
+ "ld1 { v7.b }[6], [x9], #0x1\n"
+ "ld1 { v16.b }[6], [x28], #0x1\n"
+ "ld1 { v30.b }[6], [x27], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v1.b }[6], [x21], #0x1\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[4], [x14], #0x1\n"
+ "ld1 { v29.b }[4], [x13], #0x1\n"
+ "ld1 { v0.b }[4], [x10], #0x1\n"
+ "ld1 { v7.b }[4], [x9], #0x1\n"
+ "ld1 { v16.b }[4], [x28], #0x1\n"
+ "ld1 { v30.b }[4], [x27], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v1.b }[4], [x21], #0x1\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr h5, [x14], #0x2\n"
+ "ldr h29, [x13], #0x2\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h7, [x9], #0x2\n"
+ "ldr h16, [x28], #0x2\n"
+ "ldr h30, [x27], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h1, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v5.b }[2], [x14], #0x1\n"
+ "ld1 { v29.b }[2], [x13], #0x1\n"
+ "ld1 { v0.b }[2], [x10], #0x1\n"
+ "ld1 { v7.b }[2], [x9], #0x1\n"
+ "ld1 { v16.b }[2], [x28], #0x1\n"
+ "ld1 { v30.b }[2], [x27], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v1.b }[2], [x21], #0x1\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b5, [x14], #0x1\n"
+ "ldr b29, [x13], #0x1\n"
+ "ldr b0, [x10], #0x1\n"
+ "ldr b7, [x9], #0x1\n"
+ "ldr b16, [x28], #0x1\n"
+ "ldr b30, [x27], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b1, [x21], #0x1\n"
+ "19:" // Oddments: Load (B): Bit 3: End
+ "ldr q25, [%x[params], #0x10]\n"
+ "ldr q24, [%x[params], #0x20]\n"
+ "zip2 v18.16b, v11.16b, v3.16b\n"
+ "zip1 v11.16b, v11.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x30]\n"
+ "zip1 v17.16b, v10.16b, v14.16b\n"
+ "zip2 v14.16b, v10.16b, v14.16b\n"
+ "cmp x20, #0x4\n"
+ "zip2 v10.16b, v11.16b, v17.16b\n"
+ "zip1 v11.16b, v11.16b, v17.16b\n"
+ "zip1 v3.16b, v18.16b, v14.16b\n"
+ "zip2 v14.16b, v18.16b, v14.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v22.16b, v13.16b, v21.16b\n"
+ "zip1 v13.16b, v13.16b, v21.16b\n"
+ "zip1 v21.16b, v28.16b, v27.16b\n"
+ "zip2 v27.16b, v28.16b, v27.16b\n"
+ "zip2 v20.16b, v5.16b, v0.16b\n"
+ "zip1 v5.16b, v5.16b, v0.16b\n"
+ "zip1 v19.16b, v29.16b, v7.16b\n"
+ "zip2 v7.16b, v29.16b, v7.16b\n"
+ "zip2 v18.16b, v16.16b, v2.16b\n"
+ "zip1 v16.16b, v16.16b, v2.16b\n"
+ "zip1 v17.16b, v30.16b, v1.16b\n"
+ "zip2 v1.16b, v30.16b, v1.16b\n"
+ "zip2 v28.16b, v13.16b, v21.16b\n"
+ "zip1 v13.16b, v13.16b, v21.16b\n"
+ "zip1 v21.16b, v22.16b, v27.16b\n"
+ "zip2 v27.16b, v22.16b, v27.16b\n"
+ "zip2 v29.16b, v5.16b, v19.16b\n"
+ "zip1 v5.16b, v5.16b, v19.16b\n"
+ "zip1 v0.16b, v20.16b, v7.16b\n"
+ "zip2 v7.16b, v20.16b, v7.16b\n"
+ "zip2 v30.16b, v16.16b, v17.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v1.16b\n"
+ "zip2 v1.16b, v18.16b, v1.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ ".inst 0x4e8d9732 // sdot v18.4s, v25.16b, v13.16b\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8b973f // sdot v31.4s, v25.16b, v11.16b\n"
+ ".inst 0x4e8d971f // sdot v31.4s, v24.16b, v13.16b\n"
+ "ext v11.16b, v11.16b, v11.16b, #0x1\n"
+ "ext v13.16b, v13.16b, v13.16b, #0x1\n"
+ ".inst 0x4e8b973a // sdot v26.4s, v25.16b, v11.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x4e8d9724 // sdot v4.4s, v25.16b, v13.16b\n"
+ ".inst 0x4e859712 // sdot v18.4s, v24.16b, v5.16b\n"
+ ".inst 0x4e8596ff // sdot v31.4s, v23.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x4e8d971a // sdot v26.4s, v24.16b, v13.16b\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e859704 // sdot v4.4s, v24.16b, v5.16b\n"
+ ".inst 0x4e9096f2 // sdot v18.4s, v23.16b, v16.16b\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e8596fa // sdot v26.4s, v23.16b, v5.16b\n"
+ ".inst 0x4e9096e4 // sdot v4.4s, v23.16b, v16.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "blt 20f\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
+ "b 23f\n"
+ "20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "tbz x20, #1, 21f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 22f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
+ "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+ "23:" // Oddments: Unroll 0: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x11, x11, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8a96ff // sdot v31.4s, v23.16b, v10.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e9c96f2 // sdot v18.4s, v23.16b, v28.16b\n"
+ ".inst 0x4e9c96df // sdot v31.4s, v22.16b, v28.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x1\n"
+ ".inst 0x4e8a96fa // sdot v26.4s, v23.16b, v10.16b\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e9c96e4 // sdot v4.4s, v23.16b, v28.16b\n"
+ ".inst 0x4e9d96d2 // sdot v18.4s, v22.16b, v29.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e9d961f // sdot v31.4s, v16.16b, v29.16b\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x1\n"
+ ".inst 0x4e9c96da // sdot v26.4s, v22.16b, v28.16b\n"
+ ".inst 0x4e9d96c4 // sdot v4.4s, v22.16b, v29.16b\n"
+ ".inst 0x4e9e9612 // sdot v18.4s, v16.16b, v30.16b\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e9d961a // sdot v26.4s, v16.16b, v29.16b\n"
+ ".inst 0x4e9e9604 // sdot v4.4s, v16.16b, v30.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "blt 24f\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
+ "b 27f\n"
+ "24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "tbz x20, #1, 25f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
+ "b 26f\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
+ "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+ "27:" // Oddments: Unroll 1: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x11, x11, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q22, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8396ff // sdot v31.4s, v23.16b, v3.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ ".inst 0x4e9596f2 // sdot v18.4s, v23.16b, v21.16b\n"
+ ".inst 0x4e9596df // sdot v31.4s, v22.16b, v21.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x4e8396fa // sdot v26.4s, v23.16b, v3.16b\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x4e9596e4 // sdot v4.4s, v23.16b, v21.16b\n"
+ ".inst 0x4e8096d2 // sdot v18.4s, v22.16b, v0.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e80961f // sdot v31.4s, v16.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x4e9596da // sdot v26.4s, v22.16b, v21.16b\n"
+ ".inst 0x4e8096c4 // sdot v4.4s, v22.16b, v0.16b\n"
+ ".inst 0x4e829612 // sdot v18.4s, v16.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e80961a // sdot v26.4s, v16.16b, v0.16b\n"
+ ".inst 0x4e829604 // sdot v4.4s, v16.16b, v2.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v19.16b, v26.16b, v20.16b\n"
+ "and v17.16b, v18.16b, v20.16b\n"
+ "and v16.16b, v4.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v19.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v18.4s, v18.4s, v20.4s\n"
+ "srshl v4.4s, v4.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "blt 28f\n"
+ "str s31, [x25, x11]\n"
+ "str s26, [x24, x11]\n"
+ "str s18, [x23, x11]\n"
+ "str s4, [x22, x11]\n"
+ "b 31f\n"
+ "28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 30f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
+ "b 30f\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
+ "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+ "31:" // Oddments: Unroll 2: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x11, x11, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ "ldr q16, [%x[params], #0x30]\n"
+ "mov v4.16b, v31.16b\n"
+ ".inst 0x4e8e969f // sdot v31.4s, v20.16b, v14.16b\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ "ldr q22, [%x[params], #0x50]\n"
+ ".inst 0x4e9b9692 // sdot v18.4s, v20.16b, v27.16b\n"
+ ".inst 0x4e9b967f // sdot v31.4s, v19.16b, v27.16b\n"
+ "ext v14.16b, v14.16b, v14.16b, #0x1\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x4e8e969a // sdot v26.4s, v20.16b, v14.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x4e9b9684 // sdot v4.4s, v20.16b, v27.16b\n"
+ ".inst 0x4e879672 // sdot v18.4s, v19.16b, v7.16b\n"
+ ".inst 0x4e87961f // sdot v31.4s, v16.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x4e9b967a // sdot v26.4s, v19.16b, v27.16b\n"
+ ".inst 0x4e879664 // sdot v4.4s, v19.16b, v7.16b\n"
+ ".inst 0x4e819612 // sdot v18.4s, v16.16b, v1.16b\n"
+ "ext v1.16b, v1.16b, v1.16b, #0x1\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x4e87961a // sdot v26.4s, v16.16b, v7.16b\n"
+ ".inst 0x4e819604 // sdot v4.4s, v16.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v22.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v23.16b, v26.16b, v22.16b\n"
+ "and v17.16b, v18.16b, v22.16b\n"
+ "and v16.16b, v4.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v23.4s\n"
+ "sqadd v18.4s, v18.4s, v17.4s\n"
+ "sqadd v4.4s, v4.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v22.4s\n"
+ "srshl v26.4s, v26.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "srshl v4.4s, v4.4s, v22.4s\n"
+ "add v31.4s, v31.4s, v15.4s\n"
+ "add v26.4s, v26.4s, v15.4s\n"
+ "add v18.4s, v18.4s, v15.4s\n"
+ "add v4.4s, v4.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v18.4s, v18.4s, v8.4s\n"
+ "smax v4.4s, v4.4s, v8.4s\n"
+ "smin v31.4s, v31.4s, v12.4s\n"
+ "smin v26.4s, v26.4s, v12.4s\n"
+ "smin v18.4s, v18.4s, v12.4s\n"
+ "smin v4.4s, v4.4s, v12.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v26.h }[0], [x24], #0x2\n"
+ "st1 { v18.h }[0], [x23], #0x2\n"
+ "st1 { v4.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 34f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v26.b }[2], [x24], #0x1\n"
+ "st1 { v18.b }[2], [x23], #0x1\n"
+ "st1 { v4.b }[2], [x22], #0x1\n"
+ "b 34f\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v26.b }[0], [x24], #0x1\n"
+ "st1 { v18.b }[0], [x23], #0x1\n"
+ "st1 { v4.b }[0], [x22], #0x1\n"
+ "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+ "35:" // End
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..4026855617
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int, const uint8_t *const *const, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *const);
+
+class a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>  // Strategy descriptor binding the hand-written AArch64 dot-product kernel (uint8 quantized, NHWC, 3x3 filter, stride 1, 2x2 output tile) into the depthfirst depthwise framework.
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;  // input/weight/output are uint8_t, accumulators are int32_t
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;  // unit stride in both spatial dimensions
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}  // presumably (output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) -- consistent with the constants above and the 2x2 tile in the class name; CPUInfo is unused here
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // fixed-width NEON kernel, not a scalable-vector (SVE) variant
+
+ Parent::KernelType kernel = a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;  // the assembly implementation declared above
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override  // bytes needed for the packed weight/bias buffer used by pack_parameters()
+ {
+ return interleave_a64_u8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(  // interleave weights, biases and requantisation parameters into the layout the assembly kernel consumes
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_a64_u8q_3x3_dot::pack_parameters(
+ args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),  // total output channels = input channels x multiplier
+ reinterpret_cast<const uint8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..5a28daffbf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,1658 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, uint8_t *const *const outptrs)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x1\n"
+ "orr x20, x20, #0x100\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "orr x20, x20, #0x10000\n"
+ "lsr x11, %x[n_channels], #0x4\n"
+ "dup v12.4s, w20\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "ldp x25, x24, [%x[outptrs], #0x0]\n"
+ "ldp x23, x22, [%x[outptrs], #0x10]\n"
+ "cbz x11, 3f\n"
+ "ldr q15, [x15, x28]\n"
+ "ldr q28, [x14, x28]\n"
+ "subs x11, x11, #0x1\n"
+ "ldr q30, [x13, x28]\n"
+ "ldr q8, [x12, x28]\n"
+ "zip2 v19.16b, v15.16b, v30.16b\n"
+ "zip1 v15.16b, v15.16b, v30.16b\n"
+ "ldr q26, [x10, x28]\n"
+ "ldr q0, [x9, x28]\n"
+ "zip1 v7.16b, v28.16b, v8.16b\n"
+ "zip2 v8.16b, v28.16b, v8.16b\n"
+ "ldr q29, [x26, x28]\n"
+ "ldr q10, [x21, x28]\n"
+ "zip2 v25.16b, v15.16b, v7.16b\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "ldr q1, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip1 v7.16b, v19.16b, v8.16b\n"
+ "zip2 v8.16b, v19.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x30]\n"
+ "zip2 v21.16b, v26.16b, v29.16b\n"
+ "zip1 v26.16b, v26.16b, v29.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "zip1 v27.16b, v0.16b, v10.16b\n"
+ "zip2 v10.16b, v0.16b, v10.16b\n"
+ "ldr q17, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "zip2 v23.16b, v26.16b, v27.16b\n"
+ "zip1 v26.16b, v26.16b, v27.16b\n"
+ "ldr q9, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "zip2 v28.16b, v22.16b, v9.16b\n"
+ "zip1 v22.16b, v22.16b, v9.16b\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "zip1 v24.16b, v17.16b, v5.16b\n"
+ "zip2 v5.16b, v17.16b, v5.16b\n"
+ "ldr q18, [x20, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip1 v3.16b, v21.16b, v10.16b\n"
+ "zip2 v10.16b, v21.16b, v10.16b\n"
+ "ldr q4, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "zip2 v17.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v4.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "zip2 v19.16b, v22.16b, v24.16b\n"
+ "zip1 v22.16b, v22.16b, v24.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "add %x[params], %x[params], #0x40\n"
+ "zip2 v24.16b, v27.16b, v4.16b\n"
+ "zip1 v27.16b, v27.16b, v4.16b\n"
+ "zip1 v2.16b, v17.16b, v9.16b\n"
+ "zip2 v9.16b, v17.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6e9a9595 // udot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e8f943f // udot v31.4s, v1.16b, v15.16b\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6e969595 // udot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9a943d // udot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x6e9b9591 // udot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9595 // udot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9592 // udot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96969f // udot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f943e // udot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e9a943c // udot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x6e969592 // udot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b969d // udot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q26, [%x[params], #0x10]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x6e9b9595 // udot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9592 // udot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e96969e // udot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x6e9b969c // udot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v15.16b, v31.16b, v26.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v15.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v21.16b, v29.16b, v26.16b\n"
+ "and v17.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v21.4s\n"
+ "ldr q27, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e979596 // udot v22.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x6e939596 // udot v22.4s, v12.16b, v19.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v6.16b, v22.16b\n .inst 0x6e989586 // udot v6.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v30.16b, v26.16b\n"
+ ".inst 0x6e999596 // udot v22.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v29.16b, v26.16b\n"
+ "mov v21.16b, v26.16b\n"
+ ".inst 0x6e9995fa // udot v26.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e9795fd // udot v29.4s, v15.16b, v23.16b\n"
+ ".inst 0x6e97965a // udot v26.4s, v18.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x6e9995fe // udot v30.4s, v15.16b, v25.16b\n"
+ ".inst 0x6e9795f5 // udot v21.4s, v15.16b, v23.16b\n"
+ ".inst 0x6e97959c // udot v28.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e93965d // udot v29.4s, v18.16b, v19.16b\n"
+ ".inst 0x6e93977a // udot v26.4s, v27.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97965e // udot v30.4s, v18.16b, v23.16b\n"
+ "ldr q4, [x9, x28]\n"
+ ".inst 0x6e939655 // udot v21.4s, v18.16b, v19.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e93959c // udot v28.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e98977d // udot v29.4s, v27.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e93977e // udot v30.4s, v27.16b, v19.16b\n"
+ ".inst 0x6e989775 // udot v21.4s, v27.16b, v24.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v1.4s\n"
+ "mov v17.16b, v28.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e99959c // udot v28.4s, v12.16b, v25.16b\n"
+ "ldr q31, [x14, x28]\n"
+ "mls v30.4s, v28.4s, v16.4s\n"
+ "mls v29.4s, v6.4s, v16.4s\n"
+ "mls v21.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v1.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v1.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v1.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v30.16b, v20.16b\n"
+ "and v6.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v21.16b, v20.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0xa0]\n"
+ "sqadd v29.4s, v29.4s, v6.4s\n"
+ "ldr q24, [%x[params], #0xb0]\n"
+ "sqadd v21.4s, v21.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x90]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v21.4s, v21.4s, v20.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e839596 // udot v22.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809596 // udot v22.4s, v12.16b, v0.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q26, [%x[params], #0x80]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "mov v18.16b, v22.16b\n .inst 0x6e829592 // udot v18.4s, v12.16b, v2.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ ".inst 0x6e879596 // udot v22.4s, v12.16b, v7.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v6.16b, v26.16b\n"
+ "str s21, [x22, x27]\n"
+ "mov v25.16b, v26.16b\n"
+ "mov v20.16b, v26.16b\n"
+ ".inst 0x6e8795fa // udot v26.4s, v15.16b, v7.16b\n"
+ ".inst 0x6e8395f9 // udot v25.4s, v15.16b, v3.16b\n"
+ ".inst 0x6e83979a // udot v26.4s, v28.16b, v3.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6e8795e6 // udot v6.4s, v15.16b, v7.16b\n"
+ ".inst 0x6e8395f4 // udot v20.4s, v15.16b, v3.16b\n"
+ ".inst 0x6e839597 // udot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809799 // udot v25.4s, v28.16b, v0.16b\n"
+ ".inst 0x6e80971a // udot v26.4s, v24.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e839786 // udot v6.4s, v28.16b, v3.16b\n"
+ "ldr q19, [x26, x28]\n"
+ ".inst 0x6e809794 // udot v20.4s, v28.16b, v0.16b\n"
+ "mls v26.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e809597 // udot v23.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e829719 // udot v25.4s, v24.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e809706 // udot v6.4s, v24.16b, v0.16b\n"
+ ".inst 0x6e829714 // udot v20.4s, v24.16b, v2.16b\n"
+ "sqrdmulh v26.4s, v26.4s, v27.4s\n"
+ "mov v17.16b, v23.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879597 // udot v23.4s, v12.16b, v7.16b\n"
+ "ldr q21, [x13, x28]\n"
+ "mls v6.4s, v23.4s, v16.4s\n"
+ "mls v25.4s, v18.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v26.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v6.4s, v6.4s, v27.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q15, [%x[params], #0x120]\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "and v18.16b, v6.16b, v1.16b\n"
+ "and v22.16b, v25.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "ldr q30, [%x[params], #0x100]\n"
+ "sqadd v25.4s, v25.4s, v22.4s\n"
+ "ldr q27, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0xf0]\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "srshl v6.4s, v6.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q23, [%x[params], #0x130]\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "add v6.4s, v6.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v11.4s\n"
+ "smax v6.4s, v6.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v6.4s, v6.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "movi v0.4s, #0x0\n"
+ ".inst 0x6e8a9580 // udot v0.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e859580 // udot v0.4s, v12.16b, v5.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str s26, [x25, x27]\n"
+ "ldr q28, [%x[params], #0xe0]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v0.16b\n .inst 0x6e899596 // udot v22.4s, v12.16b, v9.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s6, [x24, x27]\n"
+ ".inst 0x6e889580 // udot v0.4s, v12.16b, v8.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s25, [x23, x27]\n"
+ "mov v29.16b, v28.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v25.16b, v28.16b\n"
+ "mov v7.16b, v28.16b\n"
+ ".inst 0x6e88971c // udot v28.4s, v24.16b, v8.16b\n"
+ ".inst 0x6e8a9719 // udot v25.4s, v24.16b, v10.16b\n"
+ ".inst 0x6e8a97dc // udot v28.4s, v30.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e88971d // udot v29.4s, v24.16b, v8.16b\n"
+ ".inst 0x6e8a9707 // udot v7.4s, v24.16b, v10.16b\n"
+ ".inst 0x6e8a9591 // udot v17.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8597d9 // udot v25.4s, v30.16b, v5.16b\n"
+ ".inst 0x6e85977c // udot v28.4s, v27.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a97dd // udot v29.4s, v30.16b, v10.16b\n"
+ "ldr q10, [x21, x28]\n"
+ ".inst 0x6e8597c7 // udot v7.4s, v30.16b, v5.16b\n"
+ "mls v28.4s, v0.4s, v16.4s\n"
+ ".inst 0x6e859591 // udot v17.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e899779 // udot v25.4s, v27.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e85977d // udot v29.4s, v27.16b, v5.16b\n"
+ ".inst 0x6e899767 // udot v7.4s, v27.16b, v9.16b\n"
+ "sqrdmulh v28.4s, v28.4s, v15.4s\n"
+ "mov v18.16b, v17.16b\n .inst 0x6e899592 // udot v18.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889591 // udot v17.4s, v12.16b, v8.16b\n"
+ "ldr q8, [x12, x28]\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v25.4s, v22.4s, v16.4s\n"
+ "mls v7.4s, v18.4s, v16.4s\n"
+ "and v17.16b, v28.16b, v23.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v15.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "ldr q15, [x15, x28]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x40]\n"
+ "ldr q22, [x21, x28]\n"
+ "ldr q3, [x20, x28]\n"
+ "and v24.16b, v29.16b, v23.16b\n"
+ "and v20.16b, v25.16b, v23.16b\n"
+ "and v17.16b, v7.16b, v23.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "ldr q2, [x21, x28]\n"
+ "ldr q5, [x20, x28]\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "sqadd v29.4s, v29.4s, v24.4s\n"
+ "ldr q6, [%x[params], #0x160]\n"
+ "sqadd v25.4s, v25.4s, v20.4s\n"
+ "ldr q20, [%x[params], #0x170]\n"
+ "sqadd v7.4s, v7.4s, v17.4s\n"
+ "ldr q1, [%x[params], #0x150]\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v23.4s\n"
+ "srshl v25.4s, v25.4s, v23.4s\n"
+ "srshl v7.4s, v7.4s, v23.4s\n"
+ "ldr q26, [x10, x28]\n"
+ "ldp x21, x20, [%x[inptrs], #0x60]\n"
+ "ldr q27, [x21, x28]\n"
+ "ldr q30, [x20, x28]\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v7.4s, v7.4s, v14.4s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "ldr q23, [x21, x28]\n"
+ "ldr q9, [x20, x28]\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "ldp x15, x14, [%x[inptrs], #0x0]\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v7.4s, v7.4s, v13.4s\n"
+ "ldp x13, x12, [%x[inptrs], #0x10]\n"
+ "ldp x10, x9, [%x[inptrs], #0x20]\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v25.4s, v25.4s, v11.4s\n"
+ "ldp x26, x21, [%x[inptrs], #0x30]\n"
+ "smin v7.4s, v7.4s, v11.4s\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s28, [x25, x27]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "zip2 v17.16b, v15.16b, v21.16b\n"
+ "zip1 v15.16b, v15.16b, v21.16b\n"
+ "zip1 v18.16b, v31.16b, v8.16b\n"
+ "zip2 v8.16b, v31.16b, v8.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s29, [x24, x27]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str s25, [x23, x27]\n"
+ "zip2 v25.16b, v15.16b, v18.16b\n"
+ "str s7, [x22, x27]\n"
+ "zip1 v15.16b, v15.16b, v18.16b\n"
+ "zip1 v7.16b, v17.16b, v8.16b\n"
+ "add x27, x27, #0x4\n"
+ "zip2 v8.16b, v17.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x140]\n"
+ "zip2 v29.16b, v26.16b, v19.16b\n"
+ "add %x[params], %x[params], #0x180\n"
+ "zip1 v26.16b, v26.16b, v19.16b\n"
+ "zip1 v28.16b, v4.16b, v10.16b\n"
+ "zip2 v10.16b, v4.16b, v10.16b\n"
+ "zip2 v24.16b, v22.16b, v2.16b\n"
+ "zip1 v22.16b, v22.16b, v2.16b\n"
+ "zip1 v21.16b, v3.16b, v5.16b\n"
+ "zip2 v5.16b, v3.16b, v5.16b\n"
+ "zip2 v18.16b, v27.16b, v23.16b\n"
+ "zip1 v27.16b, v27.16b, v23.16b\n"
+ "zip1 v17.16b, v30.16b, v9.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "zip2 v23.16b, v26.16b, v28.16b\n"
+ "zip1 v26.16b, v26.16b, v28.16b\n"
+ "zip1 v3.16b, v29.16b, v10.16b\n"
+ "zip2 v10.16b, v29.16b, v10.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v0.16b, v24.16b, v5.16b\n"
+ "zip2 v5.16b, v24.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v17.16b\n"
+ "zip1 v27.16b, v27.16b, v17.16b\n"
+ "zip1 v2.16b, v18.16b, v9.16b\n"
+ "zip2 v9.16b, v18.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "bgt 1b\n"
+ "2:" // Detached iteration
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6e9a9595 // udot v21.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e8f943f // udot v31.4s, v1.16b, v15.16b\n"
+ "tst %x[n_channels], #0xf\n"
+ ".inst 0x6e969595 // udot v21.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9a943d // udot v29.4s, v1.16b, v26.16b\n"
+ "movi v18.4s, #0x0\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "mov v17.16b, v21.16b\n .inst 0x6e9b9591 // udot v17.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9595 // udot v21.4s, v12.16b, v15.16b\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9592 // udot v18.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96969f // udot v31.4s, v20.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f943e // udot v30.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e9a943c // udot v28.4s, v1.16b, v26.16b\n"
+ "mls v31.4s, v21.4s, v16.4s\n"
+ ".inst 0x6e969592 // udot v18.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b969d // udot v29.4s, v20.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q4, [%x[params], #0x10]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mov v21.16b, v18.16b\n .inst 0x6e9b9595 // udot v21.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9592 // udot v18.4s, v12.16b, v15.16b\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "sqrdmulh v31.4s, v31.4s, v17.4s\n"
+ ".inst 0x6e96969e // udot v30.4s, v20.16b, v22.16b\n"
+ ".inst 0x6e9b969c // udot v28.4s, v20.16b, v27.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v21.4s, v16.4s\n"
+ "and v27.16b, v31.16b, v4.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v17.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v17.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v17.4s\n"
+ "ldr q15, [%x[params], #0x60]\n"
+ "sqadd v31.4s, v31.4s, v27.4s\n"
+ "and v20.16b, v30.16b, v4.16b\n"
+ "and v18.16b, v29.16b, v4.16b\n"
+ "and v17.16b, v28.16b, v4.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v30.4s, v30.4s, v20.4s\n"
+ "ldr q27, [%x[params], #0x40]\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "ldr q6, [%x[params], #0x30]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
+ "ldr q4, [%x[params], #0x70]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x6e979581 // udot v1.4s, v12.16b, v23.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x20]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ ".inst 0x6e939581 // udot v1.4s, v12.16b, v19.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s30, [x24, x27]\n"
+ "mov v22.16b, v1.16b\n .inst 0x6e989596 // udot v22.4s, v12.16b, v24.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s29, [x23, x27]\n"
+ "mov v29.16b, v31.16b\n"
+ ".inst 0x6e999581 // udot v1.4s, v12.16b, v25.16b\n"
+ "str s28, [x22, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x6e9994df // udot v31.4s, v6.16b, v25.16b\n"
+ ".inst 0x6e9794d5 // udot v21.4s, v6.16b, v23.16b\n"
+ ".inst 0x6e97977f // udot v31.4s, v27.16b, v23.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e9994dd // udot v29.4s, v6.16b, v25.16b\n"
+ ".inst 0x6e9794d4 // udot v20.4s, v6.16b, v23.16b\n"
+ ".inst 0x6e979592 // udot v18.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e939775 // udot v21.4s, v27.16b, v19.16b\n"
+ ".inst 0x6e93975f // udot v31.4s, v26.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97977d // udot v29.4s, v27.16b, v23.16b\n"
+ ".inst 0x6e939774 // udot v20.4s, v27.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x6e939592 // udot v18.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e989755 // udot v21.4s, v26.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e93975d // udot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x6e989754 // udot v20.4s, v26.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v15.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e999592 // udot v18.4s, v12.16b, v25.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v4.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v29.4s, v29.4s, v15.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v15.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v15.4s\n"
+ "ldr q27, [%x[params], #0xc0]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v29.16b, v4.16b\n"
+ "and v18.16b, v21.16b, v4.16b\n"
+ "and v17.16b, v20.16b, v4.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "sqadd v29.4s, v29.4s, v19.4s\n"
+ "ldr q26, [%x[params], #0xa0]\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "ldr q25, [%x[params], #0xb0]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q24, [%x[params], #0x90]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "ldr q1, [%x[params], #0xd0]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6e839597 // udot v23.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809597 // udot v23.4s, v12.16b, v0.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q31, [%x[params], #0x80]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v22.16b, v23.16b\n .inst 0x6e829596 // udot v22.4s, v12.16b, v2.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s29, [x24, x27]\n"
+ ".inst 0x6e879597 // udot v23.4s, v12.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s21, [x23, x27]\n"
+ "mov v21.16b, v31.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v4.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ ".inst 0x6e87971f // udot v31.4s, v24.16b, v7.16b\n"
+ ".inst 0x6e839704 // udot v4.4s, v24.16b, v3.16b\n"
+ ".inst 0x6e83975f // udot v31.4s, v26.16b, v3.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e879715 // udot v21.4s, v24.16b, v7.16b\n"
+ ".inst 0x6e839714 // udot v20.4s, v24.16b, v3.16b\n"
+ ".inst 0x6e839592 // udot v18.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e809744 // udot v4.4s, v26.16b, v0.16b\n"
+ ".inst 0x6e80973f // udot v31.4s, v25.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e839755 // udot v21.4s, v26.16b, v3.16b\n"
+ ".inst 0x6e809754 // udot v20.4s, v26.16b, v0.16b\n"
+ "mls v31.4s, v23.4s, v16.4s\n"
+ ".inst 0x6e809592 // udot v18.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e829724 // udot v4.4s, v25.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e809735 // udot v21.4s, v25.16b, v0.16b\n"
+ ".inst 0x6e829734 // udot v20.4s, v25.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v27.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879592 // udot v18.4s, v12.16b, v7.16b\n"
+ "mls v21.4s, v18.4s, v16.4s\n"
+ "mls v4.4s, v22.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v1.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v21.4s, v21.4s, v27.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v27.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v27.4s\n"
+ "ldr q30, [%x[params], #0x120]\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v21.16b, v1.16b\n"
+ "and v18.16b, v4.16b, v1.16b\n"
+ "and v17.16b, v20.16b, v1.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "ldr q29, [%x[params], #0x100]\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "ldr q28, [%x[params], #0x110]\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "ldr q27, [%x[params], #0xf0]\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v4.4s, v4.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "ldr q26, [%x[params], #0x130]\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v4.4s, v4.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v4.4s, v4.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v4.4s, v4.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x6e8a9599 // udot v25.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e859599 // udot v25.4s, v12.16b, v5.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s31, [x25, x27]\n"
+ "ldr q24, [%x[params], #0xe0]\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "mov v23.16b, v25.16b\n .inst 0x6e899597 // udot v23.4s, v12.16b, v9.16b\n"
+ "add %x[params], %x[params], #0x140\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v4.16b, v4.16b, v4.16b\n"
+ "str s21, [x24, x27]\n"
+ ".inst 0x6e889599 // udot v25.4s, v12.16b, v8.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s4, [x23, x27]\n"
+ "mov v22.16b, v24.16b\n"
+ "str s20, [x22, x27]\n"
+ "mov v21.16b, v24.16b\n"
+ "mov v20.16b, v24.16b\n"
+ ".inst 0x6e889778 // udot v24.4s, v27.16b, v8.16b\n"
+ ".inst 0x6e8a9775 // udot v21.4s, v27.16b, v10.16b\n"
+ ".inst 0x6e8a97b8 // udot v24.4s, v29.16b, v10.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ "add x27, x27, #0x4\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6e889776 // udot v22.4s, v27.16b, v8.16b\n"
+ ".inst 0x6e8a9774 // udot v20.4s, v27.16b, v10.16b\n"
+ ".inst 0x6e8a9592 // udot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8597b5 // udot v21.4s, v29.16b, v5.16b\n"
+ ".inst 0x6e859798 // udot v24.4s, v28.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a97b6 // udot v22.4s, v29.16b, v10.16b\n"
+ ".inst 0x6e8597b4 // udot v20.4s, v29.16b, v5.16b\n"
+ "mls v24.4s, v25.4s, v16.4s\n"
+ ".inst 0x6e859592 // udot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e899795 // udot v21.4s, v28.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e859796 // udot v22.4s, v28.16b, v5.16b\n"
+ ".inst 0x6e899794 // udot v20.4s, v28.16b, v9.16b\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "mov v17.16b, v18.16b\n .inst 0x6e899591 // udot v17.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889592 // udot v18.4s, v12.16b, v8.16b\n"
+ "mls v22.4s, v18.4s, v16.4s\n"
+ "mls v21.4s, v23.4s, v16.4s\n"
+ "mls v20.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v24.16b, v26.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v22.4s, v22.4s, v30.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "and v19.16b, v22.16b, v26.16b\n"
+ "and v18.16b, v21.16b, v26.16b\n"
+ "and v17.16b, v20.16b, v26.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "sqadd v21.4s, v21.4s, v18.4s\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "srshl v24.4s, v24.4s, v26.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "srshl v21.4s, v21.4s, v26.4s\n"
+ "srshl v20.4s, v20.4s, v26.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v22.4s, v22.4s, v14.4s\n"
+ "add v21.4s, v21.4s, v14.4s\n"
+ "add v20.4s, v20.4s, v14.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v22.4s, v22.4s, v13.4s\n"
+ "smax v21.4s, v21.4s, v13.4s\n"
+ "smax v20.4s, v20.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v11.4s\n"
+ "smin v22.4s, v22.4s, v11.4s\n"
+ "smin v21.4s, v21.4s, v11.4s\n"
+ "smin v20.4s, v20.4s, v11.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x25, x27]\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "str s22, [x24, x27]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s21, [x23, x27]\n"
+ "str s20, [x22, x27]\n"
+ "add x27, x27, #0x4\n"
+ "beq 35f\n"
+ "3:" // Oddments
+ "and x20, %x[n_channels], #0xf\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x21, x21, x28\n"
+ "tbz %x[n_channels], #3, 7f\n"
+ "ldr d15, [x15], #0x8\n"
+ "ldr d25, [x14], #0x8\n"
+ "ldr d7, [x13], #0x8\n"
+ "ldr d8, [x12], #0x8\n"
+ "ldr d26, [x10], #0x8\n"
+ "ldr d23, [x9], #0x8\n"
+ "ldr d3, [x26], #0x8\n"
+ "ldr d10, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 5f\n"
+ "ld1 { v15.s }[2], [x15], #0x4\n"
+ "ld1 { v25.s }[2], [x14], #0x4\n"
+ "ld1 { v7.s }[2], [x13], #0x4\n"
+ "ld1 { v8.s }[2], [x12], #0x4\n"
+ "ld1 { v26.s }[2], [x10], #0x4\n"
+ "ld1 { v23.s }[2], [x9], #0x4\n"
+ "ld1 { v3.s }[2], [x26], #0x4\n"
+ "ld1 { v10.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v15.h }[6], [x15], #0x2\n"
+ "ld1 { v25.h }[6], [x14], #0x2\n"
+ "ld1 { v7.h }[6], [x13], #0x2\n"
+ "ld1 { v8.h }[6], [x12], #0x2\n"
+ "ld1 { v26.h }[6], [x10], #0x2\n"
+ "ld1 { v23.h }[6], [x9], #0x2\n"
+ "ld1 { v3.h }[6], [x26], #0x2\n"
+ "ld1 { v10.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[14], [x15], #0x1\n"
+ "ld1 { v25.b }[14], [x14], #0x1\n"
+ "ld1 { v7.b }[14], [x13], #0x1\n"
+ "ld1 { v8.b }[14], [x12], #0x1\n"
+ "ld1 { v26.b }[14], [x10], #0x1\n"
+ "ld1 { v23.b }[14], [x9], #0x1\n"
+ "ld1 { v3.b }[14], [x26], #0x1\n"
+ "ld1 { v10.b }[14], [x21], #0x1\n"
+ "b 11f\n"
+ "4:" // Oddments: Load (A): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[12], [x15], #0x1\n"
+ "ld1 { v25.b }[12], [x14], #0x1\n"
+ "ld1 { v7.b }[12], [x13], #0x1\n"
+ "ld1 { v8.b }[12], [x12], #0x1\n"
+ "ld1 { v26.b }[12], [x10], #0x1\n"
+ "ld1 { v23.b }[12], [x9], #0x1\n"
+ "ld1 { v3.b }[12], [x26], #0x1\n"
+ "ld1 { v10.b }[12], [x21], #0x1\n"
+ "b 11f\n"
+ "5:" // Oddments: Load (A): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v15.h }[4], [x15], #0x2\n"
+ "ld1 { v25.h }[4], [x14], #0x2\n"
+ "ld1 { v7.h }[4], [x13], #0x2\n"
+ "ld1 { v8.h }[4], [x12], #0x2\n"
+ "ld1 { v26.h }[4], [x10], #0x2\n"
+ "ld1 { v23.h }[4], [x9], #0x2\n"
+ "ld1 { v3.h }[4], [x26], #0x2\n"
+ "ld1 { v10.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[10], [x15], #0x1\n"
+ "ld1 { v25.b }[10], [x14], #0x1\n"
+ "ld1 { v7.b }[10], [x13], #0x1\n"
+ "ld1 { v8.b }[10], [x12], #0x1\n"
+ "ld1 { v26.b }[10], [x10], #0x1\n"
+ "ld1 { v23.b }[10], [x9], #0x1\n"
+ "ld1 { v3.b }[10], [x26], #0x1\n"
+ "ld1 { v10.b }[10], [x21], #0x1\n"
+ "b 11f\n"
+ "6:" // Oddments: Load (A): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[8], [x15], #0x1\n"
+ "ld1 { v25.b }[8], [x14], #0x1\n"
+ "ld1 { v7.b }[8], [x13], #0x1\n"
+ "ld1 { v8.b }[8], [x12], #0x1\n"
+ "ld1 { v26.b }[8], [x10], #0x1\n"
+ "ld1 { v23.b }[8], [x9], #0x1\n"
+ "ld1 { v3.b }[8], [x26], #0x1\n"
+ "ld1 { v10.b }[8], [x21], #0x1\n"
+ "b 11f\n"
+ "7:" // Oddments: Load (A): Bit 3: Unset
+ "tbz %x[n_channels], #2, 9f\n"
+ "ldr s15, [x15], #0x4\n"
+ "ldr s25, [x14], #0x4\n"
+ "ldr s7, [x13], #0x4\n"
+ "ldr s8, [x12], #0x4\n"
+ "ldr s26, [x10], #0x4\n"
+ "ldr s23, [x9], #0x4\n"
+ "ldr s3, [x26], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v15.h }[2], [x15], #0x2\n"
+ "ld1 { v25.h }[2], [x14], #0x2\n"
+ "ld1 { v7.h }[2], [x13], #0x2\n"
+ "ld1 { v8.h }[2], [x12], #0x2\n"
+ "ld1 { v26.h }[2], [x10], #0x2\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "ld1 { v3.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[6], [x15], #0x1\n"
+ "ld1 { v25.b }[6], [x14], #0x1\n"
+ "ld1 { v7.b }[6], [x13], #0x1\n"
+ "ld1 { v8.b }[6], [x12], #0x1\n"
+ "ld1 { v26.b }[6], [x10], #0x1\n"
+ "ld1 { v23.b }[6], [x9], #0x1\n"
+ "ld1 { v3.b }[6], [x26], #0x1\n"
+ "ld1 { v10.b }[6], [x21], #0x1\n"
+ "b 11f\n"
+ "8:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[4], [x15], #0x1\n"
+ "ld1 { v25.b }[4], [x14], #0x1\n"
+ "ld1 { v7.b }[4], [x13], #0x1\n"
+ "ld1 { v8.b }[4], [x12], #0x1\n"
+ "ld1 { v26.b }[4], [x10], #0x1\n"
+ "ld1 { v23.b }[4], [x9], #0x1\n"
+ "ld1 { v3.b }[4], [x26], #0x1\n"
+ "ld1 { v10.b }[4], [x21], #0x1\n"
+ "b 11f\n"
+ "9:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h15, [x15], #0x2\n"
+ "ldr h25, [x14], #0x2\n"
+ "ldr h7, [x13], #0x2\n"
+ "ldr h8, [x12], #0x2\n"
+ "ldr h26, [x10], #0x2\n"
+ "ldr h23, [x9], #0x2\n"
+ "ldr h3, [x26], #0x2\n"
+ "ldr h10, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v15.b }[2], [x15], #0x1\n"
+ "ld1 { v25.b }[2], [x14], #0x1\n"
+ "ld1 { v7.b }[2], [x13], #0x1\n"
+ "ld1 { v8.b }[2], [x12], #0x1\n"
+ "ld1 { v26.b }[2], [x10], #0x1\n"
+ "ld1 { v23.b }[2], [x9], #0x1\n"
+ "ld1 { v3.b }[2], [x26], #0x1\n"
+ "ld1 { v10.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load (A): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b15, [x15], #0x1\n"
+ "ldr b25, [x14], #0x1\n"
+ "ldr b7, [x13], #0x1\n"
+ "ldr b8, [x12], #0x1\n"
+ "ldr b26, [x10], #0x1\n"
+ "ldr b23, [x9], #0x1\n"
+ "ldr b3, [x26], #0x1\n"
+ "ldr b10, [x21], #0x1\n"
+ "11:" // Oddments: Load (A): Bit 3: End
+ "ldp x15, x14, [%x[inptrs], #0x40]\n"
+ "ldp x13, x12, [%x[inptrs], #0x50]\n"
+ "add x15, x15, x28\n"
+ "add x14, x14, x28\n"
+ "ldp x10, x9, [%x[inptrs], #0x60]\n"
+ "ldp x26, x21, [%x[inptrs], #0x70]\n"
+ "add x13, x13, x28\n"
+ "add x12, x12, x28\n"
+ "add x10, x10, x28\n"
+ "add x9, x9, x28\n"
+ "add x26, x26, x28\n"
+ "add x21, x21, x28\n"
+ "tbz %x[n_channels], #3, 15f\n"
+ "ldr d22, [x15], #0x8\n"
+ "ldr d19, [x14], #0x8\n"
+ "ldr d0, [x13], #0x8\n"
+ "ldr d5, [x12], #0x8\n"
+ "ldr d27, [x10], #0x8\n"
+ "ldr d24, [x9], #0x8\n"
+ "ldr d2, [x26], #0x8\n"
+ "ldr d9, [x21], #0x8\n"
+ "tbz %x[n_channels], #2, 13f\n"
+ "ld1 { v22.s }[2], [x15], #0x4\n"
+ "ld1 { v19.s }[2], [x14], #0x4\n"
+ "ld1 { v0.s }[2], [x13], #0x4\n"
+ "ld1 { v5.s }[2], [x12], #0x4\n"
+ "ld1 { v27.s }[2], [x10], #0x4\n"
+ "ld1 { v24.s }[2], [x9], #0x4\n"
+ "ld1 { v2.s }[2], [x26], #0x4\n"
+ "ld1 { v9.s }[2], [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v22.h }[6], [x15], #0x2\n"
+ "ld1 { v19.h }[6], [x14], #0x2\n"
+ "ld1 { v0.h }[6], [x13], #0x2\n"
+ "ld1 { v5.h }[6], [x12], #0x2\n"
+ "ld1 { v27.h }[6], [x10], #0x2\n"
+ "ld1 { v24.h }[6], [x9], #0x2\n"
+ "ld1 { v2.h }[6], [x26], #0x2\n"
+ "ld1 { v9.h }[6], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[14], [x15], #0x1\n"
+ "ld1 { v19.b }[14], [x14], #0x1\n"
+ "ld1 { v0.b }[14], [x13], #0x1\n"
+ "ld1 { v5.b }[14], [x12], #0x1\n"
+ "ld1 { v27.b }[14], [x10], #0x1\n"
+ "ld1 { v24.b }[14], [x9], #0x1\n"
+ "ld1 { v2.b }[14], [x26], #0x1\n"
+ "ld1 { v9.b }[14], [x21], #0x1\n"
+ "b 19f\n"
+ "12:" // Oddments: Load (B): Bit 3: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[12], [x15], #0x1\n"
+ "ld1 { v19.b }[12], [x14], #0x1\n"
+ "ld1 { v0.b }[12], [x13], #0x1\n"
+ "ld1 { v5.b }[12], [x12], #0x1\n"
+ "ld1 { v27.b }[12], [x10], #0x1\n"
+ "ld1 { v24.b }[12], [x9], #0x1\n"
+ "ld1 { v2.b }[12], [x26], #0x1\n"
+ "ld1 { v9.b }[12], [x21], #0x1\n"
+ "b 19f\n"
+ "13:" // Oddments: Load (B): Bit 3: Bit 2: Unset
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v22.h }[4], [x15], #0x2\n"
+ "ld1 { v19.h }[4], [x14], #0x2\n"
+ "ld1 { v0.h }[4], [x13], #0x2\n"
+ "ld1 { v5.h }[4], [x12], #0x2\n"
+ "ld1 { v27.h }[4], [x10], #0x2\n"
+ "ld1 { v24.h }[4], [x9], #0x2\n"
+ "ld1 { v2.h }[4], [x26], #0x2\n"
+ "ld1 { v9.h }[4], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[10], [x15], #0x1\n"
+ "ld1 { v19.b }[10], [x14], #0x1\n"
+ "ld1 { v0.b }[10], [x13], #0x1\n"
+ "ld1 { v5.b }[10], [x12], #0x1\n"
+ "ld1 { v27.b }[10], [x10], #0x1\n"
+ "ld1 { v24.b }[10], [x9], #0x1\n"
+ "ld1 { v2.b }[10], [x26], #0x1\n"
+ "ld1 { v9.b }[10], [x21], #0x1\n"
+ "b 19f\n"
+ "14:" // Oddments: Load (B): Bit 3: Bit 2: Unset: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[8], [x15], #0x1\n"
+ "ld1 { v19.b }[8], [x14], #0x1\n"
+ "ld1 { v0.b }[8], [x13], #0x1\n"
+ "ld1 { v5.b }[8], [x12], #0x1\n"
+ "ld1 { v27.b }[8], [x10], #0x1\n"
+ "ld1 { v24.b }[8], [x9], #0x1\n"
+ "ld1 { v2.b }[8], [x26], #0x1\n"
+ "ld1 { v9.b }[8], [x21], #0x1\n"
+ "b 19f\n"
+ "15:" // Oddments: Load (B): Bit 3: Unset
+ "tbz %x[n_channels], #2, 17f\n"
+ "ldr s22, [x15], #0x4\n"
+ "ldr s19, [x14], #0x4\n"
+ "ldr s0, [x13], #0x4\n"
+ "ldr s5, [x12], #0x4\n"
+ "ldr s27, [x10], #0x4\n"
+ "ldr s24, [x9], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s9, [x21], #0x4\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v22.h }[2], [x15], #0x2\n"
+ "ld1 { v19.h }[2], [x14], #0x2\n"
+ "ld1 { v0.h }[2], [x13], #0x2\n"
+ "ld1 { v5.h }[2], [x12], #0x2\n"
+ "ld1 { v27.h }[2], [x10], #0x2\n"
+ "ld1 { v24.h }[2], [x9], #0x2\n"
+ "ld1 { v2.h }[2], [x26], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[6], [x15], #0x1\n"
+ "ld1 { v19.b }[6], [x14], #0x1\n"
+ "ld1 { v0.b }[6], [x13], #0x1\n"
+ "ld1 { v5.b }[6], [x12], #0x1\n"
+ "ld1 { v27.b }[6], [x10], #0x1\n"
+ "ld1 { v24.b }[6], [x9], #0x1\n"
+ "ld1 { v2.b }[6], [x26], #0x1\n"
+ "ld1 { v9.b }[6], [x21], #0x1\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Bit 1: Unset
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[4], [x15], #0x1\n"
+ "ld1 { v19.b }[4], [x14], #0x1\n"
+ "ld1 { v0.b }[4], [x13], #0x1\n"
+ "ld1 { v5.b }[4], [x12], #0x1\n"
+ "ld1 { v27.b }[4], [x10], #0x1\n"
+ "ld1 { v24.b }[4], [x9], #0x1\n"
+ "ld1 { v2.b }[4], [x26], #0x1\n"
+ "ld1 { v9.b }[4], [x21], #0x1\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset
+ "tbz %x[n_channels], #1, 18f\n"
+ "ldr h22, [x15], #0x2\n"
+ "ldr h19, [x14], #0x2\n"
+ "ldr h0, [x13], #0x2\n"
+ "ldr h5, [x12], #0x2\n"
+ "ldr h27, [x10], #0x2\n"
+ "ldr h24, [x9], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h9, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v22.b }[2], [x15], #0x1\n"
+ "ld1 { v19.b }[2], [x14], #0x1\n"
+ "ld1 { v0.b }[2], [x13], #0x1\n"
+ "ld1 { v5.b }[2], [x12], #0x1\n"
+ "ld1 { v27.b }[2], [x10], #0x1\n"
+ "ld1 { v24.b }[2], [x9], #0x1\n"
+ "ld1 { v2.b }[2], [x26], #0x1\n"
+ "ld1 { v9.b }[2], [x21], #0x1\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (B): Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
+ "ldr b22, [x15], #0x1\n"
+ "ldr b19, [x14], #0x1\n"
+ "ldr b0, [x13], #0x1\n"
+ "ldr b5, [x12], #0x1\n"
+ "ldr b27, [x10], #0x1\n"
+ "ldr b24, [x9], #0x1\n"
+ "ldr b2, [x26], #0x1\n"
+ "ldr b9, [x21], #0x1\n"
+ "19:" // Oddments: Load (B): Bit 3: End
+ "ldr q20, [%x[params], #0x10]\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "zip2 v1.16b, v26.16b, v3.16b\n"
+ "zip1 v26.16b, v26.16b, v3.16b\n"
+ "ldr q4, [%x[params], #0x30]\n"
+ "zip1 v18.16b, v23.16b, v10.16b\n"
+ "zip2 v30.16b, v15.16b, v7.16b\n"
+ "cmp x20, #0x4\n"
+ "zip1 v15.16b, v15.16b, v7.16b\n"
+ "zip1 v29.16b, v25.16b, v8.16b\n"
+ "zip2 v8.16b, v25.16b, v8.16b\n"
+ "zip2 v10.16b, v23.16b, v10.16b\n"
+ "zip2 v23.16b, v26.16b, v18.16b\n"
+ "zip1 v26.16b, v26.16b, v18.16b\n"
+ "zip2 v28.16b, v22.16b, v0.16b\n"
+ "zip1 v22.16b, v22.16b, v0.16b\n"
+ "zip1 v21.16b, v19.16b, v5.16b\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6e9a9591 // udot v17.4s, v12.16b, v26.16b\n"
+ "zip2 v25.16b, v15.16b, v29.16b\n"
+ "zip1 v15.16b, v15.16b, v29.16b\n"
+ "zip1 v7.16b, v30.16b, v8.16b\n"
+ "zip2 v8.16b, v30.16b, v8.16b\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "zip2 v5.16b, v19.16b, v5.16b\n"
+ "zip2 v30.16b, v27.16b, v2.16b\n"
+ "zip1 v27.16b, v27.16b, v2.16b\n"
+ "zip1 v18.16b, v24.16b, v9.16b\n"
+ "zip2 v9.16b, v24.16b, v9.16b\n"
+ "zip2 v19.16b, v22.16b, v21.16b\n"
+ "zip1 v22.16b, v22.16b, v21.16b\n"
+ "zip1 v3.16b, v1.16b, v10.16b\n"
+ ".inst 0x6e969591 // udot v17.4s, v12.16b, v22.16b\n"
+ "zip2 v10.16b, v1.16b, v10.16b\n"
+ "zip1 v0.16b, v28.16b, v5.16b\n"
+ "zip2 v5.16b, v28.16b, v5.16b\n"
+ "zip2 v24.16b, v27.16b, v18.16b\n"
+ "zip1 v27.16b, v27.16b, v18.16b\n"
+ "zip1 v2.16b, v30.16b, v9.16b\n"
+ "mov v18.16b, v17.16b\n .inst 0x6e9b9592 // udot v18.4s, v12.16b, v27.16b\n"
+ "zip2 v9.16b, v30.16b, v9.16b\n"
+ "mov v30.16b, v31.16b\n"
+ ".inst 0x6e8f9591 // udot v17.4s, v12.16b, v15.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e8f969f // udot v31.4s, v20.16b, v15.16b\n"
+ ".inst 0x6e9a969d // udot v29.4s, v20.16b, v26.16b\n"
+ ".inst 0x6e9a94df // udot v31.4s, v6.16b, v26.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "movi v1.4s, #0x0\n"
+ "ext v15.16b, v15.16b, v15.16b, #0x1\n"
+ ".inst 0x6e9a9581 // udot v1.4s, v12.16b, v26.16b\n"
+ ".inst 0x6e9694dd // udot v29.4s, v6.16b, v22.16b\n"
+ ".inst 0x6e96949f // udot v31.4s, v4.16b, v22.16b\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ ".inst 0x6e8f969e // udot v30.4s, v20.16b, v15.16b\n"
+ ".inst 0x6e9a969c // udot v28.4s, v20.16b, v26.16b\n"
+ "mls v31.4s, v17.4s, v16.4s\n"
+ ".inst 0x6e969581 // udot v1.4s, v12.16b, v22.16b\n"
+ ".inst 0x6e9b949d // udot v29.4s, v4.16b, v27.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ ".inst 0x6e9a94de // udot v30.4s, v6.16b, v26.16b\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ ".inst 0x6e9694dc // udot v28.4s, v6.16b, v22.16b\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mov v20.16b, v1.16b\n .inst 0x6e9b9594 // udot v20.4s, v12.16b, v27.16b\n"
+ ".inst 0x6e8f9581 // udot v1.4s, v12.16b, v15.16b\n"
+ "ldr q18, [%x[params], #0x40]\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ ".inst 0x6e96949e // udot v30.4s, v4.16b, v22.16b\n"
+ ".inst 0x6e9b949c // udot v28.4s, v4.16b, v27.16b\n"
+ "mls v30.4s, v1.4s, v16.4s\n"
+ "add %x[params], %x[params], #0x60\n"
+ "mls v28.4s, v20.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v18.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v18.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v26.16b, v28.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v26.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 20f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 23f\n"
+ "20:" // Oddments: Unroll 0: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 21f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 22f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 22f\n"
+ "21:" // Oddments: Unroll 0: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "22:" // Oddments: Unroll 0: Oddment store: Bit 1: End
+ "23:" // Oddments: Unroll 0: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q27, [%x[params], #0x10]\n"
+ "movi v1.4s, #0x0\n"
+ ".inst 0x6e979581 // udot v1.4s, v12.16b, v23.16b\n"
+ "ldr q26, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q4, [%x[params], #0x40]\n"
+ "ldr q21, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e99977f // udot v31.4s, v27.16b, v25.16b\n"
+ ".inst 0x6e939581 // udot v1.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e97977d // udot v29.4s, v27.16b, v23.16b\n"
+ "movi v20.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x6e97975f // udot v31.4s, v26.16b, v23.16b\n"
+ "mov v18.16b, v1.16b\n .inst 0x6e989592 // udot v18.4s, v12.16b, v24.16b\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e999581 // udot v1.4s, v12.16b, v25.16b\n"
+ "ext v25.16b, v25.16b, v25.16b, #0x1\n"
+ ".inst 0x6e99977e // udot v30.4s, v27.16b, v25.16b\n"
+ ".inst 0x6e97977c // udot v28.4s, v27.16b, v23.16b\n"
+ ".inst 0x6e979594 // udot v20.4s, v12.16b, v23.16b\n"
+ ".inst 0x6e93975d // udot v29.4s, v26.16b, v19.16b\n"
+ ".inst 0x6e9396df // udot v31.4s, v22.16b, v19.16b\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6e97975e // udot v30.4s, v26.16b, v23.16b\n"
+ ".inst 0x6e93975c // udot v28.4s, v26.16b, v19.16b\n"
+ "mls v31.4s, v1.4s, v16.4s\n"
+ ".inst 0x6e939594 // udot v20.4s, v12.16b, v19.16b\n"
+ ".inst 0x6e9896dd // udot v29.4s, v22.16b, v24.16b\n"
+ "ext v24.16b, v24.16b, v24.16b, #0x1\n"
+ ".inst 0x6e9396de // udot v30.4s, v22.16b, v19.16b\n"
+ ".inst 0x6e9896dc // udot v28.4s, v22.16b, v24.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "mov v17.16b, v20.16b\n .inst 0x6e989591 // udot v17.4s, v12.16b, v24.16b\n"
+ ".inst 0x6e999594 // udot v20.4s, v12.16b, v25.16b\n"
+ "mls v30.4s, v20.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v21.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v28.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 24f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 27f\n"
+ "24:" // Oddments: Unroll 1: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 25f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 26f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 26f\n"
+ "25:" // Oddments: Unroll 1: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "26:" // Oddments: Unroll 1: Oddment store: Bit 1: End
+ "27:" // Oddments: Unroll 1: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q25, [%x[params], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x6e839598 // udot v24.4s, v12.16b, v3.16b\n"
+ "ldr q23, [%x[params], #0x20]\n"
+ "ldr q22, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q21, [%x[params], #0x40]\n"
+ "ldr q20, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e87973f // udot v31.4s, v25.16b, v7.16b\n"
+ ".inst 0x6e809598 // udot v24.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e83973d // udot v29.4s, v25.16b, v3.16b\n"
+ "movi v19.4s, #0x0\n"
+ "cmp x20, #0x4\n"
+ ".inst 0x6e8396ff // udot v31.4s, v23.16b, v3.16b\n"
+ "mov v18.16b, v24.16b\n .inst 0x6e829592 // udot v18.4s, v12.16b, v2.16b\n"
+ "ext v3.16b, v3.16b, v3.16b, #0x1\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e879598 // udot v24.4s, v12.16b, v7.16b\n"
+ "ext v7.16b, v7.16b, v7.16b, #0x1\n"
+ ".inst 0x6e87973e // udot v30.4s, v25.16b, v7.16b\n"
+ ".inst 0x6e83973c // udot v28.4s, v25.16b, v3.16b\n"
+ ".inst 0x6e839593 // udot v19.4s, v12.16b, v3.16b\n"
+ ".inst 0x6e8096fd // udot v29.4s, v23.16b, v0.16b\n"
+ ".inst 0x6e8096df // udot v31.4s, v22.16b, v0.16b\n"
+ "ext v0.16b, v0.16b, v0.16b, #0x1\n"
+ ".inst 0x6e8396fe // udot v30.4s, v23.16b, v3.16b\n"
+ ".inst 0x6e8096fc // udot v28.4s, v23.16b, v0.16b\n"
+ "mls v31.4s, v24.4s, v16.4s\n"
+ ".inst 0x6e809593 // udot v19.4s, v12.16b, v0.16b\n"
+ ".inst 0x6e8296dd // udot v29.4s, v22.16b, v2.16b\n"
+ "ext v2.16b, v2.16b, v2.16b, #0x1\n"
+ ".inst 0x6e8096de // udot v30.4s, v22.16b, v0.16b\n"
+ ".inst 0x6e8296dc // udot v28.4s, v22.16b, v2.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "mov v17.16b, v19.16b\n .inst 0x6e829591 // udot v17.4s, v12.16b, v2.16b\n"
+ ".inst 0x6e879593 // udot v19.4s, v12.16b, v7.16b\n"
+ "mls v30.4s, v19.4s, v16.4s\n"
+ "mls v29.4s, v18.4s, v16.4s\n"
+ "mls v28.4s, v17.4s, v16.4s\n"
+ "and v17.16b, v31.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqadd v31.4s, v31.4s, v17.4s\n"
+ "and v19.16b, v30.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v28.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "blt 28f\n"
+ "str s31, [x25, x27]\n"
+ "str s30, [x24, x27]\n"
+ "str s29, [x23, x27]\n"
+ "str s28, [x22, x27]\n"
+ "b 31f\n"
+ "28:" // Oddments: Unroll 2: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 29f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 30f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 30f\n"
+ "29:" // Oddments: Unroll 2: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "30:" // Oddments: Unroll 2: Oddment store: Bit 1: End
+ "31:" // Oddments: Unroll 2: After oddment store
+ "subs x20, x20, #0x4\n"
+ "add x27, x27, #0x4\n"
+ "ble 35f\n"
+ "ldr q31, [%x[params], #0x0]\n"
+ "ldr q23, [%x[params], #0x10]\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6e8a9596 // udot v22.4s, v12.16b, v10.16b\n"
+ "ldr q21, [%x[params], #0x20]\n"
+ "ldr q19, [%x[params], #0x30]\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "ldr q20, [%x[params], #0x40]\n"
+ "ldr q26, [%x[params], #0x50]\n"
+ "mov v28.16b, v31.16b\n"
+ ".inst 0x6e8896ff // udot v31.4s, v23.16b, v8.16b\n"
+ ".inst 0x6e859596 // udot v22.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e8a96fd // udot v29.4s, v23.16b, v10.16b\n"
+ "movi v18.4s, #0x0\n"
+ "add %x[params], %x[params], #0x60\n"
+ ".inst 0x6e8a96bf // udot v31.4s, v21.16b, v10.16b\n"
+ "mov v17.16b, v22.16b\n .inst 0x6e899591 // udot v17.4s, v12.16b, v9.16b\n"
+ "ext v10.16b, v10.16b, v10.16b, #0x1\n"
+ ".inst 0x6e889596 // udot v22.4s, v12.16b, v8.16b\n"
+ "ext v8.16b, v8.16b, v8.16b, #0x1\n"
+ ".inst 0x6e8896fe // udot v30.4s, v23.16b, v8.16b\n"
+ ".inst 0x6e8a96fc // udot v28.4s, v23.16b, v10.16b\n"
+ ".inst 0x6e8a9592 // udot v18.4s, v12.16b, v10.16b\n"
+ ".inst 0x6e8596bd // udot v29.4s, v21.16b, v5.16b\n"
+ ".inst 0x6e85967f // udot v31.4s, v19.16b, v5.16b\n"
+ "ext v5.16b, v5.16b, v5.16b, #0x1\n"
+ ".inst 0x6e8a96be // udot v30.4s, v21.16b, v10.16b\n"
+ ".inst 0x6e8596bc // udot v28.4s, v21.16b, v5.16b\n"
+ "mls v31.4s, v22.4s, v16.4s\n"
+ ".inst 0x6e859592 // udot v18.4s, v12.16b, v5.16b\n"
+ ".inst 0x6e89967d // udot v29.4s, v19.16b, v9.16b\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x1\n"
+ ".inst 0x6e85967e // udot v30.4s, v19.16b, v5.16b\n"
+ ".inst 0x6e89967c // udot v28.4s, v19.16b, v9.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v20.4s\n"
+ "mov v7.16b, v18.16b\n .inst 0x6e899587 // udot v7.4s, v12.16b, v9.16b\n"
+ ".inst 0x6e889592 // udot v18.4s, v12.16b, v8.16b\n"
+ "mls v30.4s, v18.4s, v16.4s\n"
+ "mls v29.4s, v17.4s, v16.4s\n"
+ "mls v28.4s, v7.4s, v16.4s\n"
+ "and v16.16b, v31.16b, v26.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v20.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v20.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "and v18.16b, v30.16b, v26.16b\n"
+ "and v17.16b, v29.16b, v26.16b\n"
+ "and v16.16b, v28.16b, v26.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v30.4s, v30.4s, v18.4s\n"
+ "sqadd v29.4s, v29.4s, v17.4s\n"
+ "sqadd v28.4s, v28.4s, v16.4s\n"
+ "srshl v31.4s, v31.4s, v26.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "srshl v29.4s, v29.4s, v26.4s\n"
+ "srshl v28.4s, v28.4s, v26.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smin v31.4s, v31.4s, v11.4s\n"
+ "smin v30.4s, v30.4s, v11.4s\n"
+ "smin v29.4s, v29.4s, v11.4s\n"
+ "smin v28.4s, v28.4s, v11.4s\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "32:" // Oddments: Unroll 3: Oddment store
+ "add x25, x25, x27\n"
+ "add x24, x24, x27\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "tbz x20, #1, 33f\n"
+ "st1 { v31.h }[0], [x25], #0x2\n"
+ "st1 { v30.h }[0], [x24], #0x2\n"
+ "st1 { v29.h }[0], [x23], #0x2\n"
+ "st1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x20, #0, 34f\n"
+ "st1 { v31.b }[2], [x25], #0x1\n"
+ "st1 { v30.b }[2], [x24], #0x1\n"
+ "st1 { v29.b }[2], [x23], #0x1\n"
+ "st1 { v28.b }[2], [x22], #0x1\n"
+ "b 34f\n"
+ "33:" // Oddments: Unroll 3: Oddment store: Bit 1: Unset
+ "st1 { v31.b }[0], [x25], #0x1\n"
+ "st1 { v30.b }[0], [x24], #0x1\n"
+ "st1 { v29.b }[0], [x23], #0x1\n"
+ "st1 { v28.b }[0], [x22], #0x1\n"
+ "34:" // Oddments: Unroll 3: Oddment store: Bit 1: End
+ "35:" // End
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..5ae0be1054
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(  // asm kernel entry point (defined in generic.cpp)
+  const unsigned int,                                    // n_channels
+  const uint8_t *const *const,                           // inptrs: input row pointers
+  const uint8_t *const,                                  // weights
+  const int32_t *const,                                  // bias
+  const arm_gemm::Requantize32 &,                        // quantization parameters (offsets, min/max)
+  const int32_t *const,                                  // requant_muls: per-channel multipliers
+  const int32_t *const,                                  // requant_shifts: per-channel shifts
+  uint8_t *const *const                                  // outptrs: output row pointers
+);
+
+class a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>  // strategy descriptor: u8 in/weights/out, s32 accumulators
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 convolution window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}  // args presumably (out_rows, out_cols, kern_rows, kern_cols, stride_rows, stride_cols) — matches the constants above
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // fixed-length NEON kernel (no scalable vectors)
+
+  Parent::KernelType kernel = a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;  // hand-written asm implementation
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }  // two vector-lengths of channels per accumulator pass — confirm against impl
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d5b55cb9c5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1166 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+  struct Params  // argument block read by the inline-asm kernel via [%x[params], #offset] loads
+  {
+    uint64_t n_channels;
+    const void *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;  // quantization offsets and clamp bounds
+    const int32_t *const requant_muls;      // per-channel requantize multipliers
+    const int32_t *const requant_shifts;    // per-channel requantize shifts
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[16];              // input row pointers, permuted into the kernel's access order
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const void *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      inptrs[0] = inptrs_raw[5];   // fixed permutation matching the order the asm body consumes rows in
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[3];
+      inptrs[3] = inptrs_raw[6];
+      inptrs[4] = inptrs_raw[9];
+      inptrs[5] = inptrs_raw[12];
+      inptrs[6] = inptrs_raw[15];
+      inptrs[7] = inptrs_raw[1];
+      inptrs[8] = inptrs_raw[2];
+      inptrs[9] = inptrs_raw[10];
+      inptrs[10] = inptrs_raw[4];
+      inptrs[11] = inptrs_raw[7];
+      inptrs[12] = inptrs_raw[8];
+      inptrs[13] = inptrs_raw[11];
+      inptrs[14] = inptrs_raw[13];
+      inptrs[15] = inptrs_raw[14];
+
+    }
+  };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v14.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 64f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "usubl v23.8h, v23.8b, v19.8b\n"
+ "usubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "usubl v1.8h, v1.8b, v19.8b\n"
+ "usubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "usubl v26.8h, v26.8b, v19.8b\n"
+ "usubl v18.8h, v18.8b, v19.8b\n"
+ "usubl v31.8h, v31.8b, v19.8b\n"
+ "usubl v25.8h, v25.8b, v19.8b\n"
+ "usubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
+ "st1 { v9.s }[0], [x11], #0x4\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "st1 { v9.h }[2], [x11], #0x2\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[6], [x11], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[4], [x11], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "st1 { v9.h }[0], [x11], #0x2\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[2], [x11], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[0], [x11], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..52280ebe70
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(  // defined in the accompanying generic.cpp; parameter names below taken from that definition
+  const unsigned int,               // n_channels: number of channels to process
+  const uint8_t *const *const,      // inptrs: array of input pointers (one per input sample point)
+  const uint8_t *const,             // weights: kernel weights (layout as produced by the matching packing routine)
+  const int32_t *const,             // bias: per-channel accumulator initialisers
+  const arm_gemm::Requantize32 &,   // qp: requantisation parameters (a/b/c offsets, minval/maxval)
+  const int32_t *const,             // requant_muls: requantisation multipliers
+  const int32_t *const,             // requant_shifts: requantisation shifts
+  uint8_t *const *const             // outptrs: array of output pointers (one per output sample point)
+);
+
+class a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>  // uint8 input/weights/output, int32 accumulator
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 convolution window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;  // stride 2 in both spatial dimensions
+  constexpr static unsigned int stride_cols = 2;
+
+  a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}  // presumably (out_rows, out_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) -- confirm against DepthwiseDepthfirstStrategy
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // no scalable vector length (fixed-width NEON kernel)
+
+  Parent::KernelType kernel = a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;  // hand-written A64 kernel declared above
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }  // accumulator depth in vector lengths -- NOTE(review): confirm meaning in base strategy
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c4184622b0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x12, x12, #0x20\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d25, [x27, x17]\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "tst x7, #0x7\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 88f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "usubl v11.8h, v11.8b, v15.8b\n"
+ "usubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v14.8h, v14.8b, v15.8b\n"
+ "usubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v18.8h, v18.8b, v15.8b\n"
+ "usubl v9.8h, v9.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "usubl v26.8h, v26.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 66f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x7, #1, 70f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x7, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x7, #1, 78f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 82f\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 86f\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..07f66fb482
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(  // hand-written asm kernel; defined in the companion generic.cpp
+ const unsigned int,  // n_channels
+ const uint8_t *const *const,  // inptrs: array of input-point pointers
+ const uint8_t *const,  // weights
+ const int32_t *const,  // bias
+ const arm_gemm::Requantize32 &,  // qp: requantisation offsets and clamp limits
+ const int32_t *const,  // requant_muls: per-channel requantisation multipliers
+ const int32_t *const,  // requant_shifts: per-channel requantisation shifts
+ uint8_t *const *const  // outptrs: array of output pointers
+);
+
+class a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>  // strategy descriptor binding the asm kernel above into the depthfirst framework (u8 quantized, NHWC)
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 5;  // 5x5 filter window
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;  // unit stride in both spatial dimensions
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}  // presumably Parent(out_rows, out_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) -- confirm against Parent ctor
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // fixed-length (Advanced SIMD) kernel, not scalable-vector
+
+ Parent::KernelType kernel = a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;  // the asm implementation declared above
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }  // NOTE(review): presumably two vector-lengths of channels per accumulator pass -- verify in framework
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..a3fa93df9c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2187 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params  // flat argument block; the inline asm reads every field via offsetof(Params, ...)
+ {
+ uint64_t n_channels;  // number of channels to process per spatial point
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;  // a/b/c offsets and min/max clamp values
+ const int32_t *const requant_muls;  // per-channel requantisation multipliers
+ const int32_t *const requant_shifts;  // per-channel requantisation shifts
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];  // 36 = 6x6 input patch (5x5 kernel producing a 2x2 output tile)
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[0];  // early entries are permuted into the order the asm visits them; later entries copy straight through
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];  // from here on the mapping is the identity
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x2, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
+ "cbz x2, 3f\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "subs x2, x2, #0x1\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "usubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
+ "beq 124f\n"
+ "add x6, x6, #0xc8\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
+ "tbz x1, #1, 4f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x1, #1, 6f\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "usubl v10.8h, v10.8b, v13.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x1, #1, 10f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x1, #1, 14f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x1, #1, 18f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x1, #1, 22f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d14, [x6, #0x28]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "usubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x1, #1, 26f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d21, [x6, #0x30]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x1, #1, 30f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d9, [x6, #0x38]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x1, #1, 34f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d31, [x6, #0x40]\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x1, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d16, [x6, #0x48]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x1, #1, 42f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d21, [x6, #0x50]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x1, #1, 46f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x1, #1, 50f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d2, [x6, #0x58]\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x1, #1, 54f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d25, [x6, #0x60]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x1, #1, 58f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d1, [x6, #0x68]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x1, #1, 62f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[0], [x20]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d16, [x6, #0x70]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x1, #1, 66f\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d17, [x6, #0x78]\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
+ "tbz x1, #2, 69f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 68f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x1, #1, 70f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x1, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d29, [x6, #0x80]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x1, #1, 78f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d12, [x6, #0x88]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x1, #1, 82f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d21, [x6, #0x90]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x1, #1, 86f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d8, [x6, #0x98]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x1, #1, 90f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d9, [x6, #0xa0]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x1, #1, 94f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x1, #1, 98f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d12, [x6, #0xa8]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "usubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x1, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d28, [x6, #0xb0]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x1, #1, 106f\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d30, [x6, #0xb8]\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x1, #1, 110f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d8, [x6, #0xc0]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x1, #1, 114f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x1, #1, 118f\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x1, #1, 122f\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..814efe006e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
+
+// Strategy wrapper binding the hand-written AArch64 assembly kernel
+// (a64_u8q_nhwc_generic_output9_mla_depthfirst_impl) into the generic
+// depthfirst-depthwise framework. Template arguments: uint8_t input,
+// uint8_t weights, uint8_t output, int32_t accumulator.
+class a64_u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  // The kernel entry point; defined in the accompanying generic.cpp as an
+  // inline-assembly implementation.
+  KernelType kernel = a64_u8q_nhwc_generic_output9_mla_depthfirst_impl;
+
+  public:
+  // Constructed with (9, VLType::None): 9 presumably matches the
+  // "output9" in the kernel name (number of output points handled per
+  // call — TODO confirm against GenericDepthfirstKernelStrategy), and
+  // VLType::None indicates fixed-width NEON rather than SVE.
+  // The CPUInfo argument is accepted for interface uniformity but unused.
+  a64_u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>(9, arm_gemm::VLType::None) {}
+
+  KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f7aa889b56
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ const arm_gemm::Requantize32& qp,
+ const unsigned int n_points,
+ const unsigned int n_channels
+)
+{
+ __asm__ __volatile__(
+ "lsr x9, %x[n_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v7.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v5.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "mov x11, #0x0\n"
+ "cbz x9, 6f\n"
+ "1:" // Channel loop
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q23, [%x[bias], x20]\n"
+ "2:" // Channel loop: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x25, %x[inptrs]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "subs x24, %x[n_points], #0x1\n"
+ "ldr s14, [x21, x11]\n"
+ "ldr s15, [x20, x11]\n"
+ "mov v24.16b, v23.16b\n"
+ "mov v25.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s16, [x21, x11]\n"
+ "mov v26.16b, v23.16b\n"
+ "mov v27.16b, v23.16b\n"
+ "ldr s17, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "ldr s18, [x21, x11]\n"
+ "ldr s19, [x20, x11]\n"
+ "mov v30.16b, v23.16b\n"
+ "mov v31.16b, v23.16b\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s20, [x21, x11]\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x23, x22, [x25], #0x10\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldr s14, [x23, x11]\n"
+ "ldr s15, [x22, x11]\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "ldr s16, [x21, x11]\n"
+ "ldr s17, [x20, x11]\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "ldr s18, [x21, x11]\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "ldr s19, [x20, x11]\n"
+ "ldp x21, x20, [x25], #0x10\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "subs x24, x24, #0x1\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "ldr s20, [x21, x11]\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "ldr s21, [x20, x11]\n"
+ "ldr x20, [x25], #0x8\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr s22, [x20, x11]\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 5f\n"
+ "lsl x20, x11, #0x2\n"
+ "ldr q2, [%x[rq_mul_ptr], x20]\n"
+ "ldr q1, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 5f\n"
+ "ldr q3, [%x[rq_left_shift_ptr], x20]\n"
+ "5:" // Channel loop: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s23, [x28, x11]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s24, [x27, x11]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s25, [x26, x11]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s26, [x25, x11]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x11]\n"
+ "str s28, [x23, x11]\n"
+ "str s29, [x22, x11]\n"
+ "str s30, [x21, x11]\n"
+ "str s31, [x20, x11]\n"
+ "add x11, x11, #0x4\n"
+ "cmp x11, x9, LSL #2\n"
+ "blt 1b\n"
+ "6:" // Oddments
+ "tst %x[n_channels], #0x3\n"
+ "beq 24f\n"
+ "movi v23.4s, #0x0\n"
+ "cbz %x[bias], 9f\n"
+ "add x20, %x[bias], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ld1 { v23.d }[0], [x20], #0x8\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v23.s }[2], [x20], #0x4\n"
+ "b 8f\n"
+ "7:" // Oddments: Load bias: Bit 1: Unset
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "8:" // Oddments: Load bias: Bit 1: End
+ "9:" // Oddments: Load bias: Done
+ "ldr s0, [%x[params]], #0x4\n"
+ "mov x10, %x[inptrs]\n"
+ "ldp x9, x28, [x10], #0x10\n"
+ "mov v24.16b, v23.16b\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "mov v25.16b, v23.16b\n"
+ "mov v26.16b, v23.16b\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "ldr x21, [x10], #0x8\n"
+ "mov v27.16b, v23.16b\n"
+ "mov v28.16b, v23.16b\n"
+ "mov v29.16b, v23.16b\n"
+ "mov v30.16b, v23.16b\n"
+ "add x9, x9, x11\n"
+ "add x28, x28, x11\n"
+ "mov v31.16b, v23.16b\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 11f\n"
+ "10:" // Oddments: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "11:" // Oddments: Load: Bit 1: End
+ "subs x20, %x[n_points], #0x1\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "ble 15f\n"
+ "12:" // Oddments: Planar loop
+ "ldp x9, x28, [x10], #0x10\n"
+ "ldp x27, x26, [x10], #0x10\n"
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "ldp x25, x24, [x10], #0x10\n"
+ "ldp x23, x22, [x10], #0x10\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "ldr x21, [x10], #0x8\n"
+ "add x9, x9, x11\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "add x28, x28, x11\n"
+ "add x27, x27, x11\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "ldr s0, [%x[params]], #0x4\n"
+ "usubl v0.8h, v0.8b, v5.8b\n"
+ "add x26, x26, x11\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "add x21, x21, x11\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr h14, [x9], #0x2\n"
+ "ldr h15, [x28], #0x2\n"
+ "ldr h16, [x27], #0x2\n"
+ "ldr h17, [x26], #0x2\n"
+ "ldr h18, [x25], #0x2\n"
+ "ldr h19, [x24], #0x2\n"
+ "ldr h20, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h22, [x21], #0x2\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.b }[2], [x9], #0x1\n"
+ "ld1 { v15.b }[2], [x28], #0x1\n"
+ "ld1 { v16.b }[2], [x27], #0x1\n"
+ "ld1 { v17.b }[2], [x26], #0x1\n"
+ "ld1 { v18.b }[2], [x25], #0x1\n"
+ "ld1 { v19.b }[2], [x24], #0x1\n"
+ "ld1 { v20.b }[2], [x23], #0x1\n"
+ "ld1 { v21.b }[2], [x22], #0x1\n"
+ "ld1 { v22.b }[2], [x21], #0x1\n"
+ "b 14f\n"
+ "13:" // Oddments: Planar loop: Load: Bit 1: Unset
+ "ldr b14, [x9], #0x1\n"
+ "ldr b15, [x28], #0x1\n"
+ "ldr b16, [x27], #0x1\n"
+ "ldr b17, [x26], #0x1\n"
+ "ldr b18, [x25], #0x1\n"
+ "ldr b19, [x24], #0x1\n"
+ "ldr b20, [x23], #0x1\n"
+ "ldr b21, [x22], #0x1\n"
+ "ldr b22, [x21], #0x1\n"
+ "14:" // Oddments: Planar loop: Load: Bit 1: End
+ "subs x20, x20, #0x1\n"
+ "usubl v14.8h, v14.8b, v6.8b\n"
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v17.8h, v17.8b, v6.8b\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "usubl v19.8h, v19.8b, v6.8b\n"
+ "usubl v20.8h, v20.8b, v6.8b\n"
+ "usubl v21.8h, v21.8b, v6.8b\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "bgt 12b\n"
+ "15:" // Oddments: Planar tail
+ "smlal v23.4s, v14.4h, v0.4h\n"
+ "smlal v24.4s, v15.4h, v0.4h\n"
+ "smlal v25.4s, v16.4h, v0.4h\n"
+ "smlal v26.4s, v17.4h, v0.4h\n"
+ "smlal v27.4s, v18.4h, v0.4h\n"
+ "smlal v28.4s, v19.4h, v0.4h\n"
+ "smlal v29.4s, v20.4h, v0.4h\n"
+ "smlal v30.4s, v21.4h, v0.4h\n"
+ "smlal v31.4s, v22.4h, v0.4h\n"
+ "cbz %x[rq_mul_ptr], 21f\n"
+ "add x22, %x[rq_mul_ptr], x11, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x11, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x11, LSL #2\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v2.d }[0], [x22], #0x8\n"
+ "ld1 { v1.d }[0], [x21], #0x8\n"
+ "cbz %x[rq_left_shift_ptr], 16f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v2.s }[2], [x22], #0x4\n"
+ "ld1 { v1.s }[2], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 17f\n"
+ "ld1 { v3.s }[2], [x20], #0x4\n"
+ "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+ "b 20f\n"
+ "18:" // Oddments: Load quantisation parameters: Bit 1: Unset
+ "ld1 { v2.s }[0], [x22], #0x4\n"
+ "ld1 { v1.s }[0], [x21], #0x4\n"
+ "cbz %x[rq_left_shift_ptr], 19f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+ "20:" // Oddments: Load quantisation parameters: Bit 1: End
+ "21:" // Oddments: Load quantisation parameters: Done
+ "sshl v23.4s, v23.4s, v3.4s\n"
+ "sshl v24.4s, v24.4s, v3.4s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "sshl v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "add x28, x28, x11\n"
+ "and v18.16b, v23.16b, v1.16b\n"
+ "and v17.16b, v24.16b, v1.16b\n"
+ "add x27, x27, x11\n"
+ "add x26, x26, x11\n"
+ "and v16.16b, v25.16b, v1.16b\n"
+ "sshl v26.4s, v26.4s, v3.4s\n"
+ "add x25, x25, x11\n"
+ "add x24, x24, x11\n"
+ "sshl v27.4s, v27.4s, v3.4s\n"
+ "sshl v28.4s, v28.4s, v3.4s\n"
+ "add x23, x23, x11\n"
+ "add x22, x22, x11\n"
+ "sshl v29.4s, v29.4s, v3.4s\n"
+ "sshl v30.4s, v30.4s, v3.4s\n"
+ "add x21, x21, x11\n"
+ "add x20, x20, x11\n"
+ "sshl v31.4s, v31.4s, v3.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqadd v23.4s, v23.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v17.4s\n"
+ "sqadd v25.4s, v25.4s, v16.4s\n"
+ "and v21.16b, v26.16b, v1.16b\n"
+ "and v20.16b, v27.16b, v1.16b\n"
+ "and v19.16b, v28.16b, v1.16b\n"
+ "and v18.16b, v29.16b, v1.16b\n"
+ "and v17.16b, v30.16b, v1.16b\n"
+ "and v16.16b, v31.16b, v1.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v21.4s\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "smax v23.4s, v23.4s, v8.4s\n"
+ "smax v24.4s, v24.4s, v8.4s\n"
+ "smax v25.4s, v25.4s, v8.4s\n"
+ "smax v26.4s, v26.4s, v8.4s\n"
+ "smax v27.4s, v27.4s, v8.4s\n"
+ "smax v28.4s, v28.4s, v8.4s\n"
+ "smax v29.4s, v29.4s, v8.4s\n"
+ "smax v30.4s, v30.4s, v8.4s\n"
+ "smax v31.4s, v31.4s, v8.4s\n"
+ "smin v23.4s, v23.4s, v7.4s\n"
+ "smin v24.4s, v24.4s, v7.4s\n"
+ "smin v25.4s, v25.4s, v7.4s\n"
+ "smin v26.4s, v26.4s, v7.4s\n"
+ "smin v27.4s, v27.4s, v7.4s\n"
+ "smin v28.4s, v28.4s, v7.4s\n"
+ "smin v29.4s, v29.4s, v7.4s\n"
+ "smin v30.4s, v30.4s, v7.4s\n"
+ "smin v31.4s, v31.4s, v7.4s\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "st1 { v23.h }[0], [x28], #0x2\n"
+ "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v25.h }[0], [x26], #0x2\n"
+ "st1 { v26.h }[0], [x25], #0x2\n"
+ "st1 { v27.h }[0], [x24], #0x2\n"
+ "st1 { v28.h }[0], [x23], #0x2\n"
+ "st1 { v29.h }[0], [x22], #0x2\n"
+ "st1 { v30.h }[0], [x21], #0x2\n"
+ "st1 { v31.h }[0], [x20], #0x2\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "st1 { v23.b }[2], [x28], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "b 23f\n"
+ "22:" // Oddments: Store: Bit 1: Unset
+ "st1 { v23.b }[0], [x28], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "23:" // Oddments: Store: Bit 1: End
+ "24:" // End
+ : [params] "+&r" (params)
+ : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..76965606f7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d69f391514
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "ldr q11, [%x[params], #0x0]\n"
+ "ldr q5, [%x[params], #0x10]\n"
+ "movi v8.16b, #0x1\n"
+ "ushr v8.4s, v8.4s, #0x8\n"
+ "ldr q6, [%x[params], #0x20]\n"
+ "ldr q7, [%x[params], #0x30]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "mov v28.16b, v1.16b\n"
+ "mov v23.16b, v1.16b\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "mov v30.16b, v1.16b\n"
+ "mov v21.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "mov v20.16b, v2.16b\n"
+ "mov v29.16b, v2.16b\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "mov v9.16b, v4.16b\n"
+ "mov v22.16b, v4.16b\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "mov v31.16b, v4.16b\n"
+ "ext v28.16b, v28.16b, v28.16b, #0x2\n"
+ "ext v23.16b, v23.16b, v23.16b, #0x4\n"
+ "ext v30.16b, v30.16b, v30.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.4s }, [x20]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x2\n"
+ "ext v20.16b, v20.16b, v20.16b, #0x4\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "ext v29.16b, v29.16b, v29.16b, #0x6\n"
+ "ext v9.16b, v9.16b, v9.16b, #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x4\n"
+ "ext v31.16b, v31.16b, v31.16b, #0x6\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "mov v27.16b, v0.16b\n"
+ "mov v19.16b, v0.16b\n"
+ "cmp %x[n_channels], #0x4\n"
+ "mov x9, #0x0\n"
+ "mov v18.16b, v0.16b\n"
+ "mov v26.16b, v3.16b\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "mov v17.16b, v3.16b\n"
+ "mov v16.16b, v3.16b\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x2\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x4\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "add %x[params], %x[params], #0x40\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x6\n"
+ "zip1 v1.4s, v1.4s, v23.4s\n"
+ "zip1 v28.4s, v28.4s, v30.4s\n"
+ "zip1 v2.4s, v2.4s, v20.4s\n"
+ "zip1 v21.4s, v21.4s, v29.4s\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x2\n"
+ "ext v17.16b, v17.16b, v17.16b, #0x4\n"
+ "ext v16.16b, v16.16b, v16.16b, #0x6\n"
+ "zip1 v4.4s, v4.4s, v22.4s\n"
+ "zip1 v9.4s, v9.4s, v31.4s\n"
+ "zip1 v0.4s, v0.4s, v19.4s\n"
+ "zip1 v27.4s, v27.4s, v18.4s\n"
+ "zip1 v1.4s, v1.4s, v28.4s\n"
+ "zip1 v2.4s, v2.4s, v21.4s\n"
+ ".inst 0x6f81e118 // udot v24.4s, v8.16b, v1.4b[0]\n"
+ "zip1 v3.4s, v3.4s, v17.4s\n"
+ "zip1 v26.4s, v26.4s, v16.4s\n"
+ ".inst 0x6fa1e119 // udot v25.4s, v8.16b, v1.4b[1]\n"
+ "zip1 v4.4s, v4.4s, v9.4s\n"
+ "movi v23.4s, #0x0\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ "movi v22.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6fa1e916 // udot v22.4s, v8.16b, v1.4b[3]\n"
+ "movi v19.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ ".inst 0x6f82e115 // udot v21.4s, v8.16b, v2.4b[0]\n"
+ "movi v10.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x6fa2e113 // udot v19.4s, v8.16b, v2.4b[1]\n"
+ "movi v18.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6f82e909 // udot v9.4s, v8.16b, v2.4b[2]\n"
+ "movi v16.4s, #0x0\n"
+ "zip1 v0.4s, v0.4s, v27.4s\n"
+ ".inst 0x6fa2e90a // udot v10.4s, v8.16b, v2.4b[3]\n"
+ "zip1 v3.4s, v3.4s, v26.4s\n"
+ ".inst 0x6f84e114 // udot v20.4s, v8.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e112 // udot v18.4s, v8.16b, v4.4b[1]\n"
+ ".inst 0x6f84e911 // udot v17.4s, v8.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e910 // udot v16.4s, v8.16b, v4.4b[3]\n"
+ "movi v31.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ ".inst 0x6f80e11f // udot v31.4s, v8.16b, v0.4b[0]\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ ".inst 0x6fa0e11e // udot v30.4s, v8.16b, v0.4b[1]\n"
+ "movi v29.4s, #0x0\n"
+ ".inst 0x6f80e91a // udot v26.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e91b // udot v27.4s, v8.16b, v0.4b[3]\n"
+ ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e11d // udot v29.4s, v8.16b, v3.4b[1]\n"
+ "add v24.4s, v24.4s, v21.4s\n"
+ "add v25.4s, v25.4s, v19.4s\n"
+ "add v23.4s, v23.4s, v9.4s\n"
+ "add v22.4s, v22.4s, v10.4s\n"
+ "add v21.4s, v20.4s, v21.4s\n"
+ "movi v20.4s, #0x0\n"
+ ".inst 0x6f83e914 // udot v20.4s, v8.16b, v3.4b[2]\n"
+ "add v19.4s, v18.4s, v19.4s\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6fa3e912 // udot v18.4s, v8.16b, v3.4b[3]\n"
+ "add v17.4s, v17.4s, v9.4s\n"
+ "add v16.4s, v16.4s, v10.4s\n"
+ "add v24.4s, v24.4s, v31.4s\n"
+ "add v25.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v23.4s, v26.4s\n"
+ "add v27.4s, v22.4s, v27.4s\n"
+ "add v28.4s, v21.4s, v28.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v17.4s, v20.4s\n"
+ "add v31.4s, v16.4s, v18.4s\n"
+ "neg v12.4s, v12.4s\n"
+ "mul v24.4s, v24.4s, v12.4s\n"
+ "mul v25.4s, v25.4s, v12.4s\n"
+ "mul v26.4s, v26.4s, v12.4s\n"
+ "mul v27.4s, v27.4s, v12.4s\n"
+ "mul v28.4s, v28.4s, v12.4s\n"
+ "mul v29.4s, v29.4s, v12.4s\n"
+ "mul v30.4s, v30.4s, v12.4s\n"
+ "mul v31.4s, v31.4s, v12.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ "ldr q8, [%x[params], #0x0]\n"
+ "ldr q21, [%x[params], #0x10]\n"
+ ".inst 0x6f80e0b8 // udot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0b9 // udot v25.4s, v5.16b, v0.4b[1]\n"
+ "ldr q20, [%x[params], #0x20]\n"
+ ".inst 0x6f80e8ba // udot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8bb // udot v27.4s, v5.16b, v0.4b[3]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x6f81e0d8 // udot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6f81e8da // udot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8db // udot v27.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6f82e0bc // udot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0bd // udot v29.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x6f82e8be // udot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8bf // udot v31.4s, v5.16b, v2.4b[3]\n"
+ "ldr q5, [%x[params], #0x30]\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v8.4s\n"
+ ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e0dd // udot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v8.4s\n"
+ ".inst 0x6f83e8de // udot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [%x[params], #0x40]\n"
+ "sqrdmulh v27.4s, v27.4s, v8.4s\n"
+ ".inst 0x6f84e0fc // udot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e0fd // udot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x6f84e8fe // udot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e8ff // udot v31.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [%x[params], #0x50]\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0x60\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v8.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v8.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v8.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v8.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q21, [%x[params], #0x0]\n"
+ "ldr q20, [%x[params], #0x10]\n"
+ ".inst 0x6f80e0b8 // udot v24.4s, v5.16b, v0.4b[0]\n"
+ ".inst 0x6fa0e0b9 // udot v25.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6f80e8ba // udot v26.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e8bb // udot v27.4s, v5.16b, v0.4b[3]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x27, x27, x28\n"
+ ".inst 0x6f81e0d8 // udot v24.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6fa1e0d9 // udot v25.4s, v6.16b, v1.4b[1]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x6f81e8da // udot v26.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6fa1e8db // udot v27.4s, v6.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x6f82e0bc // udot v28.4s, v5.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0bd // udot v29.4s, v5.16b, v2.4b[1]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x6f82e8be // udot v30.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8bf // udot v31.4s, v5.16b, v2.4b[3]\n"
+ "add x20, x20, x28\n"
+ "add %x[params], %x[params], #0x20\n"
+ ".inst 0x6f82e0f8 // udot v24.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6fa2e0f9 // udot v25.4s, v7.16b, v2.4b[1]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6fa3e0dd // udot v29.4s, v6.16b, v3.4b[1]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x6f83e8de // udot v30.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e8df // udot v31.4s, v6.16b, v3.4b[3]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x6f84e0fc // udot v28.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6fa4e0fd // udot v29.4s, v7.16b, v4.4b[1]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x6f84e8fe // udot v30.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e8ff // udot v31.4s, v7.16b, v4.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v14.4s\n"
+ "add v25.4s, v25.4s, v14.4s\n"
+ "add v26.4s, v26.4s, v14.4s\n"
+ "add v27.4s, v27.4s, v14.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v13.4s\n"
+ "smax v25.4s, v25.4s, v13.4s\n"
+ "smax v26.4s, v26.4s, v13.4s\n"
+ "smax v27.4s, v27.4s, v13.4s\n"
+ "smax v28.4s, v28.4s, v13.4s\n"
+ "smax v29.4s, v29.4s, v13.4s\n"
+ "smax v30.4s, v30.4s, v13.4s\n"
+ "smax v31.4s, v31.4s, v13.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "blt 3f\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "beq 4f\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
+ "4:" // Tail: End
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..4485aaa735
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..61cec2b66d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,640 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "ldr q12, [%x[params], #0x0]\n"
+ "ldr q8, [%x[params], #0x10]\n"
+ "movi v30.16b, #0x1\n"
+ "movi v17.4s, #0x0\n"
+ "ldr q9, [%x[params], #0x20]\n"
+ "ldr q10, [%x[params], #0x30]\n"
+ "movi v16.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "ldr q11, [%x[params], #0x40]\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "movi v24.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "ld1 { v3.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x20]\n"
+ "mov v26.16b, v3.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "ld1 { v4.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "mov v21.16b, v4.16b\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ "ld1 { v2.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x8]\n"
+ "mov v27.16b, v2.16b\n"
+ "ext v27.16b, v27.16b, v27.16b, #0x1\n"
+ "ld1 { v1.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x28]\n"
+ "zip1 v3.2d, v3.2d, v26.2d\n"
+ "zip1 v4.2d, v4.2d, v21.2d\n"
+ "ld1 { v5.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x30]\n"
+ "mov v26.16b, v1.16b\n"
+ "mov v22.16b, v5.16b\n"
+ "ld1 { v6.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x38]\n"
+ "mov v19.16b, v6.16b\n"
+ "ext v26.16b, v26.16b, v26.16b, #0x1\n"
+ "ld1 { v7.16b }, [x20]\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "mov v21.16b, v7.16b\n"
+ "zip1 v2.2d, v2.2d, v27.2d\n"
+ "ld1 { v0.16b }, [x20]\n"
+ "ext v22.16b, v22.16b, v22.16b, #0x1\n"
+ "ext v19.16b, v19.16b, v19.16b, #0x1\n"
+ ".inst 0x6f83e3d1 // udot v17.4s, v30.16b, v3.4b[0]\n"
+ "ext v21.16b, v21.16b, v21.16b, #0x1\n"
+ ".inst 0x6f83ebd0 // udot v16.4s, v30.16b, v3.4b[2]\n"
+ ".inst 0x6f84e3d9 // udot v25.4s, v30.16b, v4.4b[0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ ".inst 0x6f84ebd8 // udot v24.4s, v30.16b, v4.4b[2]\n"
+ "mov v18.16b, v0.16b\n"
+ ".inst 0x6f82e3df // udot v31.4s, v30.16b, v2.4b[0]\n"
+ "movi v29.4s, #0x0\n"
+ "movi v28.4s, #0x1\n"
+ ".inst 0x6f82ebdd // udot v29.4s, v30.16b, v2.4b[2]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.4s }, [x20]\n"
+ "ext v18.16b, v18.16b, v18.16b, #0x1\n"
+ "zip1 v1.2d, v1.2d, v26.2d\n"
+ ".inst 0x6fa3e391 // udot v17.4s, v28.16b, v3.4b[1]\n"
+ "zip1 v5.2d, v5.2d, v22.2d\n"
+ "zip1 v6.2d, v6.2d, v19.2d\n"
+ ".inst 0x6fa3eb90 // udot v16.4s, v28.16b, v3.4b[3]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "zip1 v7.2d, v7.2d, v21.2d\n"
+ "movi v22.4s, #0x0\n"
+ ".inst 0x6fa4e399 // udot v25.4s, v28.16b, v4.4b[1]\n"
+ "movi v21.4s, #0x0\n"
+ ".inst 0x6fa4eb98 // udot v24.4s, v28.16b, v4.4b[3]\n"
+ ".inst 0x6f81e3d6 // udot v22.4s, v30.16b, v1.4b[0]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ ".inst 0x6f81ebd5 // udot v21.4s, v30.16b, v1.4b[2]\n"
+ "movi v20.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ ".inst 0x6f85e3da // udot v26.4s, v30.16b, v5.4b[0]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "zip1 v0.2d, v0.2d, v18.2d\n"
+ "movi v18.4s, #0x0\n"
+ ".inst 0x6f85ebdb // udot v27.4s, v30.16b, v5.4b[2]\n"
+ "mov x9, #0x0\n"
+ ".inst 0x6f86e3d4 // udot v20.4s, v30.16b, v6.4b[0]\n"
+ ".inst 0x6f86ebd3 // udot v19.4s, v30.16b, v6.4b[2]\n"
+ "add v17.4s, v17.4s, v25.4s\n"
+ "mov x28, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x6f87e3d2 // udot v18.4s, v30.16b, v7.4b[0]\n"
+ ".inst 0x6f87ebd9 // udot v25.4s, v30.16b, v7.4b[2]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ ".inst 0x6fa2e39f // udot v31.4s, v28.16b, v2.4b[1]\n"
+ ".inst 0x6fa2eb9d // udot v29.4s, v28.16b, v2.4b[3]\n"
+ "add v16.4s, v16.4s, v24.4s\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "movi v24.4s, #0x0\n"
+ ".inst 0x6f80e3d8 // udot v24.4s, v30.16b, v0.4b[0]\n"
+ ".inst 0x6fa1e396 // udot v22.4s, v28.16b, v1.4b[1]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ ".inst 0x6fa1eb95 // udot v21.4s, v28.16b, v1.4b[3]\n"
+ ".inst 0x6fa5e39a // udot v26.4s, v28.16b, v5.4b[1]\n"
+ "add v31.4s, v31.4s, v17.4s\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ ".inst 0x6fa5eb9b // udot v27.4s, v28.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e394 // udot v20.4s, v28.16b, v6.4b[1]\n"
+ "add v29.4s, v29.4s, v16.4s\n"
+ "add %x[params], %x[params], #0x50\n"
+ ".inst 0x6fa6eb93 // udot v19.4s, v28.16b, v6.4b[3]\n"
+ ".inst 0x6fa7e392 // udot v18.4s, v28.16b, v7.4b[1]\n"
+ "add v22.4s, v22.4s, v31.4s\n"
+ ".inst 0x6fa7eb99 // udot v25.4s, v28.16b, v7.4b[3]\n"
+ ".inst 0x6fa0e398 // udot v24.4s, v28.16b, v0.4b[1]\n"
+ "add v21.4s, v21.4s, v29.4s\n"
+ "add v20.4s, v26.4s, v20.4s\n"
+ "add v19.4s, v27.4s, v19.4s\n"
+ "add v18.4s, v18.4s, v17.4s\n"
+ "movi v17.4s, #0x0\n"
+ ".inst 0x6f80ebd1 // udot v17.4s, v30.16b, v0.4b[2]\n"
+ ".inst 0x6fa0eb91 // udot v17.4s, v28.16b, v0.4b[3]\n"
+ "add v16.4s, v25.4s, v16.4s\n"
+ "add v24.4s, v22.4s, v24.4s\n"
+ "add v25.4s, v21.4s, v17.4s\n"
+ "add v26.4s, v26.4s, v22.4s\n"
+ "add v27.4s, v27.4s, v21.4s\n"
+ "add v28.4s, v20.4s, v31.4s\n"
+ "add v29.4s, v19.4s, v29.4s\n"
+ "add v30.4s, v20.4s, v18.4s\n"
+ "add v31.4s, v19.4s, v16.4s\n"
+ "neg v23.4s, v23.4s\n"
+ "mul v24.4s, v24.4s, v23.4s\n"
+ "mul v25.4s, v25.4s, v23.4s\n"
+ "mul v26.4s, v26.4s, v23.4s\n"
+ "mul v27.4s, v27.4s, v23.4s\n"
+ "mul v28.4s, v28.4s, v23.4s\n"
+ "mul v29.4s, v29.4s, v23.4s\n"
+ "mul v30.4s, v30.4s, v23.4s\n"
+ "mul v31.4s, v31.4s, v23.4s\n"
+ "zip1 v19.4s, v24.4s, v26.4s\n"
+ "zip1 v18.4s, v25.4s, v27.4s\n"
+ "zip1 v17.4s, v28.4s, v30.4s\n"
+ "zip1 v16.4s, v29.4s, v31.4s\n"
+ "zip1 v22.4s, v19.4s, v18.4s\n"
+ "zip1 v23.4s, v17.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v12.4s\n"
+ "add v25.4s, v25.4s, v12.4s\n"
+ "add v26.4s, v26.4s, v12.4s\n"
+ "add v27.4s, v27.4s, v12.4s\n"
+ "add v28.4s, v28.4s, v12.4s\n"
+ "add v29.4s, v29.4s, v12.4s\n"
+ "add v30.4s, v30.4s, v12.4s\n"
+ "add v31.4s, v31.4s, v12.4s\n"
+ "ble 2f\n"
+ "1:" // Loop
+ "ldr q12, [%x[params], #0x60]\n"
+ "ldr q21, [%x[params], #0x70]\n"
+ ".inst 0x6f80e118 // udot v24.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f80e919 // udot v25.4s, v8.16b, v0.4b[2]\n"
+ "ldr q20, [%x[params], #0x80]\n"
+ ".inst 0x6f81e11a // udot v26.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f81e91b // udot v27.4s, v8.16b, v1.4b[2]\n"
+ "sub %x[n_channels], %x[n_channels], #0x4\n"
+ ".inst 0x6fa0e138 // udot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e939 // udot v25.4s, v9.16b, v0.4b[3]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x9, x9, #0x10\n"
+ ".inst 0x6fa1e13a // udot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e93b // udot v27.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6f82e11c // udot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ ".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e13c // udot v28.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
+ "ldr q16, [%x[params], #0x10]\n"
+ ".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e97b // udot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x6f84e23c // udot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea3d // udot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23e // udot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3f // udot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e21c // udot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea1d // udot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e21e // udot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1f // udot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
+ ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x6f85e27c // udot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea7d // udot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x6f86e27e // udot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea7f // udot v31.4s, v19.16b, v6.4b[2]\n"
+ "ldr q10, [%x[params], #0xb0]\n"
+ ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x6fa5e25c // udot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea5d // udot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e25e // udot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea5f // udot v31.4s, v18.16b, v6.4b[3]\n"
+ "ldr q11, [%x[params], #0xc0]\n"
+ ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v12.4s\n"
+ ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v12.4s\n"
+ ".inst 0x6f86e23c // udot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea3d // udot v29.4s, v17.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v12.4s\n"
+ ".inst 0x6f87e23e // udot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x6f87ea3f // udot v31.4s, v17.16b, v7.4b[2]\n"
+ "ldr q8, [%x[params], #0x90]\n"
+ "sqrdmulh v27.4s, v27.4s, v12.4s\n"
+ ".inst 0x6fa6e21c // udot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea1d // udot v29.4s, v16.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v21.16b\n"
+ ".inst 0x6fa7e21e // udot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x6fa7ea1f // udot v31.4s, v16.16b, v7.4b[3]\n"
+ "ldr q9, [%x[params], #0xa0]\n"
+ "and v18.16b, v25.16b, v21.16b\n"
+ "and v17.16b, v26.16b, v21.16b\n"
+ "and v16.16b, v27.16b, v21.16b\n"
+ "add %x[params], %x[params], #0xd0\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v12.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v12.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v12.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v12.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v21.16b\n"
+ "and v18.16b, v29.16b, v21.16b\n"
+ "and v17.16b, v30.16b, v21.16b\n"
+ "and v16.16b, v31.16b, v21.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v21.4s\n"
+ "srshl v25.4s, v25.4s, v21.4s\n"
+ "srshl v26.4s, v26.4s, v21.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "srshl v28.4s, v28.4s, v21.4s\n"
+ "srshl v29.4s, v29.4s, v21.4s\n"
+ "srshl v30.4s, v30.4s, v21.4s\n"
+ "srshl v31.4s, v31.4s, v21.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "str s24, [x27, x28]\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str s25, [x26, x28]\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "str s26, [x25, x28]\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s27, [x24, x28]\n"
+ "str s28, [x23, x28]\n"
+ "dup v24.4s, v22.s[0]\n"
+ "dup v25.4s, v22.s[1]\n"
+ "str s29, [x22, x28]\n"
+ "dup v26.4s, v22.s[2]\n"
+ "dup v27.4s, v22.s[3]\n"
+ "str s30, [x21, x28]\n"
+ "dup v28.4s, v23.s[0]\n"
+ "dup v29.4s, v23.s[1]\n"
+ "str s31, [x20, x28]\n"
+ "dup v30.4s, v23.s[2]\n"
+ "dup v31.4s, v23.s[3]\n"
+ "add x28, x28, #0x4\n"
+ "add v24.4s, v24.4s, v20.4s\n"
+ "add v25.4s, v25.4s, v20.4s\n"
+ "add v26.4s, v26.4s, v20.4s\n"
+ "add v27.4s, v27.4s, v20.4s\n"
+ "add v28.4s, v28.4s, v20.4s\n"
+ "add v29.4s, v29.4s, v20.4s\n"
+ "add v30.4s, v30.4s, v20.4s\n"
+ "add v31.4s, v31.4s, v20.4s\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q21, [%x[params], #0x60]\n"
+ "ldr q20, [%x[params], #0x70]\n"
+ ".inst 0x6f80e118 // udot v24.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f80e919 // udot v25.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e11a // udot v26.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f81e91b // udot v27.4s, v8.16b, v1.4b[2]\n"
+ "cmp %x[n_channels], #0x4\n"
+ "add x27, x27, x28\n"
+ ".inst 0x6fa0e138 // udot v24.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e939 // udot v25.4s, v9.16b, v0.4b[3]\n"
+ "add x26, x26, x28\n"
+ "add x25, x25, x28\n"
+ ".inst 0x6fa1e13a // udot v26.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e93b // udot v27.4s, v9.16b, v1.4b[3]\n"
+ "add x24, x24, x28\n"
+ "add x23, x23, x28\n"
+ ".inst 0x6f82e11c // udot v28.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f82e91d // udot v29.4s, v8.16b, v2.4b[2]\n"
+ "add x22, x22, x28\n"
+ "add x21, x21, x28\n"
+ ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ "ldr q17, [%x[params], #0x0]\n"
+ "add x20, x20, x28\n"
+ ".inst 0x6f81e158 // udot v24.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f81e959 // udot v25.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa2e13c // udot v28.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e93d // udot v29.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e13e // udot v30.4s, v9.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e93f // udot v31.4s, v9.16b, v3.4b[3]\n"
+ "ldr q16, [%x[params], #0x10]\n"
+ ".inst 0x6fa1e178 // udot v24.4s, v11.16b, v1.4b[1]\n"
+ ".inst 0x6fa1e979 // udot v25.4s, v11.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e17a // udot v26.4s, v11.16b, v2.4b[1]\n"
+ ".inst 0x6fa2e97b // udot v27.4s, v11.16b, v2.4b[3]\n"
+ ".inst 0x6f83e15c // udot v28.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f83e95d // udot v29.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x6f84e15e // udot v30.4s, v10.16b, v4.4b[0]\n"
+ ".inst 0x6f84e95f // udot v31.4s, v10.16b, v4.4b[2]\n"
+ "ldr q19, [%x[params], #0x20]\n"
+ ".inst 0x6f82e238 // udot v24.4s, v17.16b, v2.4b[0]\n"
+ ".inst 0x6f82ea39 // udot v25.4s, v17.16b, v2.4b[2]\n"
+ ".inst 0x6f83e23a // udot v26.4s, v17.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea3b // udot v27.4s, v17.16b, v3.4b[2]\n"
+ ".inst 0x6fa3e17c // udot v28.4s, v11.16b, v3.4b[1]\n"
+ ".inst 0x6fa3e97d // udot v29.4s, v11.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e17e // udot v30.4s, v11.16b, v4.4b[1]\n"
+ ".inst 0x6fa4e97f // udot v31.4s, v11.16b, v4.4b[3]\n"
+ "ldr q18, [%x[params], #0x30]\n"
+ ".inst 0x6fa2e218 // udot v24.4s, v16.16b, v2.4b[1]\n"
+ ".inst 0x6fa2ea19 // udot v25.4s, v16.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e21a // udot v26.4s, v16.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea1b // udot v27.4s, v16.16b, v3.4b[3]\n"
+ ".inst 0x6f84e23c // udot v28.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea3d // udot v29.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23e // udot v30.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3f // udot v31.4s, v17.16b, v5.4b[2]\n"
+ "ldr q17, [%x[params], #0x40]\n"
+ ".inst 0x6f83e278 // udot v24.4s, v19.16b, v3.4b[0]\n"
+ ".inst 0x6f83ea79 // udot v25.4s, v19.16b, v3.4b[2]\n"
+ ".inst 0x6f84e27a // udot v26.4s, v19.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea7b // udot v27.4s, v19.16b, v4.4b[2]\n"
+ ".inst 0x6fa4e21c // udot v28.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea1d // udot v29.4s, v16.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e21e // udot v30.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1f // udot v31.4s, v16.16b, v5.4b[3]\n"
+ "ldr q16, [%x[params], #0x50]\n"
+ "add %x[params], %x[params], #0x80\n"
+ ".inst 0x6fa3e258 // udot v24.4s, v18.16b, v3.4b[1]\n"
+ ".inst 0x6fa3ea59 // udot v25.4s, v18.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e25a // udot v26.4s, v18.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea5b // udot v27.4s, v18.16b, v4.4b[3]\n"
+ ".inst 0x6f85e27c // udot v28.4s, v19.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea7d // udot v29.4s, v19.16b, v5.4b[2]\n"
+ ".inst 0x6f86e27e // udot v30.4s, v19.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea7f // udot v31.4s, v19.16b, v6.4b[2]\n"
+ ".inst 0x6f84e238 // udot v24.4s, v17.16b, v4.4b[0]\n"
+ ".inst 0x6f84ea39 // udot v25.4s, v17.16b, v4.4b[2]\n"
+ ".inst 0x6f85e23a // udot v26.4s, v17.16b, v5.4b[0]\n"
+ ".inst 0x6f85ea3b // udot v27.4s, v17.16b, v5.4b[2]\n"
+ ".inst 0x6fa5e25c // udot v28.4s, v18.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea5d // udot v29.4s, v18.16b, v5.4b[3]\n"
+ ".inst 0x6fa6e25e // udot v30.4s, v18.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea5f // udot v31.4s, v18.16b, v6.4b[3]\n"
+ ".inst 0x6fa4e218 // udot v24.4s, v16.16b, v4.4b[1]\n"
+ ".inst 0x6fa4ea19 // udot v25.4s, v16.16b, v4.4b[3]\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ ".inst 0x6fa5e21a // udot v26.4s, v16.16b, v5.4b[1]\n"
+ ".inst 0x6fa5ea1b // udot v27.4s, v16.16b, v5.4b[3]\n"
+ "sqrdmulh v25.4s, v25.4s, v21.4s\n"
+ ".inst 0x6f86e23c // udot v28.4s, v17.16b, v6.4b[0]\n"
+ ".inst 0x6f86ea3d // udot v29.4s, v17.16b, v6.4b[2]\n"
+ "sqrdmulh v26.4s, v26.4s, v21.4s\n"
+ ".inst 0x6f87e23e // udot v30.4s, v17.16b, v7.4b[0]\n"
+ ".inst 0x6f87ea3f // udot v31.4s, v17.16b, v7.4b[2]\n"
+ "sqrdmulh v27.4s, v27.4s, v21.4s\n"
+ ".inst 0x6fa6e21c // udot v28.4s, v16.16b, v6.4b[1]\n"
+ ".inst 0x6fa6ea1d // udot v29.4s, v16.16b, v6.4b[3]\n"
+ "and v19.16b, v24.16b, v20.16b\n"
+ ".inst 0x6fa7e21e // udot v30.4s, v16.16b, v7.4b[1]\n"
+ ".inst 0x6fa7ea1f // udot v31.4s, v16.16b, v7.4b[3]\n"
+ "and v18.16b, v25.16b, v20.16b\n"
+ "and v17.16b, v26.16b, v20.16b\n"
+ "and v16.16b, v27.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqrdmulh v28.4s, v28.4s, v21.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v21.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v19.4s\n"
+ "sqadd v25.4s, v25.4s, v18.4s\n"
+ "sqadd v26.4s, v26.4s, v17.4s\n"
+ "sqadd v27.4s, v27.4s, v16.4s\n"
+ "and v19.16b, v28.16b, v20.16b\n"
+ "and v18.16b, v29.16b, v20.16b\n"
+ "and v17.16b, v30.16b, v20.16b\n"
+ "and v16.16b, v31.16b, v20.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v19.4s\n"
+ "sqadd v29.4s, v29.4s, v18.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v24.4s, v24.4s, v20.4s\n"
+ "srshl v25.4s, v25.4s, v20.4s\n"
+ "srshl v26.4s, v26.4s, v20.4s\n"
+ "srshl v27.4s, v27.4s, v20.4s\n"
+ "srshl v28.4s, v28.4s, v20.4s\n"
+ "srshl v29.4s, v29.4s, v20.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "srshl v31.4s, v31.4s, v20.4s\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v13.4s\n"
+ "add v29.4s, v29.4s, v13.4s\n"
+ "add v30.4s, v30.4s, v13.4s\n"
+ "add v31.4s, v31.4s, v13.4s\n"
+ "smin v24.4s, v24.4s, v15.4s\n"
+ "smin v25.4s, v25.4s, v15.4s\n"
+ "smin v26.4s, v26.4s, v15.4s\n"
+ "smin v27.4s, v27.4s, v15.4s\n"
+ "smin v28.4s, v28.4s, v15.4s\n"
+ "smin v29.4s, v29.4s, v15.4s\n"
+ "smin v30.4s, v30.4s, v15.4s\n"
+ "smin v31.4s, v31.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v14.4s\n"
+ "smax v25.4s, v25.4s, v14.4s\n"
+ "smax v26.4s, v26.4s, v14.4s\n"
+ "smax v27.4s, v27.4s, v14.4s\n"
+ "smax v28.4s, v28.4s, v14.4s\n"
+ "smax v29.4s, v29.4s, v14.4s\n"
+ "smax v30.4s, v30.4s, v14.4s\n"
+ "smax v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "blt 3f\n"
+ "str s24, [x27, #0x0]\n"
+ "str s25, [x26, #0x0]\n"
+ "str s26, [x25, #0x0]\n"
+ "str s27, [x24, #0x0]\n"
+ "str s28, [x23, #0x0]\n"
+ "str s29, [x22, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "str s31, [x20, #0x0]\n"
+ "b 4f\n"
+ "3:" // Tail: Oddments
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v25.b }[0], [x26], #0x1\n"
+ "st1 { v26.b }[0], [x25], #0x1\n"
+ "st1 { v27.b }[0], [x24], #0x1\n"
+ "st1 { v28.b }[0], [x23], #0x1\n"
+ "st1 { v29.b }[0], [x22], #0x1\n"
+ "st1 { v30.b }[0], [x21], #0x1\n"
+ "st1 { v31.b }[0], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[1], [x27], #0x1\n"
+ "st1 { v25.b }[1], [x26], #0x1\n"
+ "st1 { v26.b }[1], [x25], #0x1\n"
+ "st1 { v27.b }[1], [x24], #0x1\n"
+ "st1 { v28.b }[1], [x23], #0x1\n"
+ "st1 { v29.b }[1], [x22], #0x1\n"
+ "st1 { v30.b }[1], [x21], #0x1\n"
+ "st1 { v31.b }[1], [x20], #0x1\n"
+ "beq 4f\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v25.b }[2], [x26], #0x1\n"
+ "st1 { v26.b }[2], [x25], #0x1\n"
+ "st1 { v27.b }[2], [x24], #0x1\n"
+ "st1 { v28.b }[2], [x23], #0x1\n"
+ "st1 { v29.b }[2], [x22], #0x1\n"
+ "st1 { v30.b }[2], [x21], #0x1\n"
+ "st1 { v31.b }[2], [x20], #0x1\n"
+ "beq 4f\n"
+ "st1 { v24.b }[3], [x27], #0x1\n"
+ "subs %x[n_channels], %x[n_channels], #0x1\n"
+ "st1 { v25.b }[3], [x26], #0x1\n"
+ "st1 { v26.b }[3], [x25], #0x1\n"
+ "st1 { v27.b }[3], [x24], #0x1\n"
+ "st1 { v28.b }[3], [x23], #0x1\n"
+ "st1 { v29.b }[3], [x22], #0x1\n"
+ "st1 { v30.b }[3], [x21], #0x1\n"
+ "st1 { v31.b }[3], [x20], #0x1\n"
+ "4:" // Tail: End
+ : [n_channels] "+&r" (n_output_channels), [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1f2d211be2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Kernel entry point (defined in the accompanying generic.cpp): consumes packed
+// uint8 quantized input/weights and writes NHWC uint8 output, applying
+// per-channel multipliers/shifts and the arm_gemm::Requantize32 parameters.
+void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+// Strategy descriptor binding the hand-written A64 MLA kernel above into the
+// depthfirst-multiplier dispatch framework (uint8 in, uint8 out, int32 accumulators).
+struct a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  using Parent = GenericDepthfirstMultiplierKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+  // (2, 8) matches the "output2x8" tile in the kernel name — presumably
+  // 2 output rows x 8 output points per iteration; VLType::None marks this as a
+  // fixed-width NEON (non-SVE) implementation. CPUInfo is accepted for interface
+  // uniformity with other strategies and is unused here.
+  a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+  : Parent(2, 8, arm_gemm::VLType::None)
+  {
+  }
+  // Fixed kernel pointer; get_kernel() is the framework's virtual accessor for it.
+  Parent::KernelType kernel = a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..0770c126ec
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1480 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const uint8_t *weights,
+ const int32_t *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const int32_t *per_channel_left_shifts,
+ const int32_t *per_channel_muls,
+ const int32_t *per_channel_right_shifts,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "lsr x10, %x[n_output_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "mov x9, #0x0\n"
+ "cbz x10, 9f\n"
+ "1:" // Output channel loop
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 3f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q9, [%x[rq_mul_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 3f\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
+ "3:" // Output channel loop: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "beq 5f\n"
+ "4:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 4b\n"
+ "5:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 6f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "8:" // Output channel loop: Done
+ "add x9, x9, #0x4\n"
+ "cmp x9, x10, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 26f\n"
+ "9:" // Output channel oddments
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 12f\n"
+ "add x20, %x[bias], x9, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 10f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: Unset
+ "ld1 { v31.s }[0], [x20]\n"
+ "11:" // Output channel oddments: Load bias: Bit 1: End
+ "12:" // Output channel oddments: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 18f\n"
+ "add x22, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "cbz %x[rq_left_shift_ptr], 15f\n"
+ "tbz %x[n_output_channels], #1, 13f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 14f\n"
+ "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+ "b 18f\n"
+ "15:" // Output channel oddments: Load quantization parameters: No left shift
+ "tbz %x[n_output_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "b 17f\n"
+ "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+ "18:" // Output channel oddments: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "beq 20f\n"
+ "19:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 19b\n"
+ "20:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 21f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "b 23f\n"
+ "21:" // Output channel oddments: Odd tail
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
+ "b 23f\n"
+ "22:" // Output channel oddments: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "23:" // Output channel oddments: Done
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_output_channels], #1, 24f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "add x9, x9, #0x2\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
+ "b 25f\n"
+ "24:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
+ "25:" // Output channel oddments: Done: Store: Bit 1: End
+ "26:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..db73c88187
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int,  // n_channels: number of channels to process
+  const uint8_t *const *const,  // inptrs: array of input row pointers
+  const uint8_t *const,  // weights: packed kernel weights
+  const int32_t *const,  // bias: per-channel bias, may be null
+  const arm_gemm::Requantize32 &,  // qp: requantization parameters (offsets, muls, shifts)
+  const int32_t *const,  // requant_muls: per-channel requantize multipliers
+  const int32_t *const,  // requant_shifts: per-channel requantize shifts
+  uint8_t *const *const  // outptrs: array of output pointers
+);
+
+// Depthwise-depthfirst strategy binding the hand-written AArch64 NEON kernel
+// (u8 asymmetric-quantized, NHWC, 3x3 kernel, stride 1, 2x2 output tile)
+// into the generic depthwise framework.
+class a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 convolution window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}  // Parent(out_rows, out_cols, kern_rows, kern_cols, stride_rows, stride_cols)
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // fixed-width NEON kernel (not vector-length agnostic)
+
+  Parent::KernelType kernel = a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }  // accumulate over two vector-lengths of channels per pass -- presumably matches the asm register blocking; confirm against the kernel
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d1872c90f8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1164 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x16, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x15, x16, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v5.8h }, [x21]\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x14, #0x0\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x13, #0x0\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x28, x27, [x22, #0x0]\n"
+ "ldp x26, x25, [x22, #0x10]\n"
+ "cbz x15, 3f\n"
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "subs x15, x15, #0x1\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d23, [x23, x14]\n"
+ "ldr d10, [x22, x14]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d11, [x21, x14]\n"
+ "ldr d13, [x20, x14]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr x20, [x12, #0x20]\n"
+ "ldr d27, [x20, x14]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q24, [x10, #0x0]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr q20, [x10, #0x10]\n"
+ "ldr q26, [x9, #0x10]\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ldr x20, [x12, #0x38]\n"
+ "ldr d10, [x20, x14]\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr d15, [x20, x14]\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "ldr d23, [x20, x14]\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "ldr x20, [x12, #0x48]\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "ldr d11, [x20, x14]\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "ldr x21, [x12, #0x50]\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v0.4s, v21.4h, v29.4h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x24, [x12, #0x60]\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "smlal v28.4s, v10.4h, v7.4h\n"
+ "ldr x23, [x12, #0x68]\n"
+ "ldr x22, [x12, #0x70]\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "ldr d13, [x21, x14]\n"
+ "smlal2 v22.4s, v21.8h, v29.8h\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ldr x21, [x12, #0x78]\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal v6.4s, v15.4h, v4.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v9.4s, v10.8h, v7.8h\n"
+ "smlal v28.4s, v23.4h, v1.4h\n"
+ "add x11, x11, #0x48\n"
+ "subs x15, x15, #0x1\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d27, [x24, x14]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v2.4s, v15.8h, v4.8h\n"
+ "ldr d15, [x23, x14]\n"
+ "smlal v3.4s, v10.4h, v19.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v0.4s, v11.4h, v31.4h\n"
+ "smlal v6.4s, v11.4h, v8.4h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "smlal v28.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v10.8h, v19.8h\n"
+ "ldr d10, [x22, x14]\n"
+ "smlal2 v22.4s, v11.8h, v31.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v2.4s, v11.8h, v8.8h\n"
+ "ldr d8, [x21, x14]\n"
+ "smlal v3.4s, v23.4h, v7.4h\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v13.4h, v19.4h\n"
+ "smlal v6.4s, v21.4h, v1.4h\n"
+ "add x14, x14, #0x8\n"
+ "smlal2 v9.4s, v11.8h, v4.8h\n"
+ "smlal v28.4s, v13.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v7.8h\n"
+ "smlal2 v22.4s, v13.8h, v19.8h\n"
+ "smlal2 v2.4s, v21.8h, v1.8h\n"
+ "smlal v3.4s, v11.4h, v16.4h\n"
+ "smlal v0.4s, v27.4h, v17.4h\n"
+ "smlal v6.4s, v15.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v17.8h\n"
+ "smlal v28.4s, v27.4h, v29.4h\n"
+ "sqrdmulh v28.4s, v28.4s, v24.4s\n"
+ "smlal2 v30.4s, v11.8h, v16.8h\n"
+ "smlal2 v22.4s, v27.8h, v17.8h\n"
+ "and v17.16b, v28.16b, v25.16b\n"
+ "smlal2 v2.4s, v15.8h, v31.8h\n"
+ "smlal v3.4s, v21.4h, v31.4h\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "smlal v0.4s, v10.4h, v16.4h\n"
+ "smlal v6.4s, v10.4h, v29.4h\n"
+ "sqadd v28.4s, v28.4s, v17.4s\n"
+ "smlal2 v9.4s, v27.8h, v29.8h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v20.4s\n"
+ "smlal2 v22.4s, v10.8h, v16.8h\n"
+ "smlal2 v2.4s, v10.8h, v29.8h\n"
+ "and v23.16b, v9.16b, v26.16b\n"
+ "smlal v3.4s, v15.4h, v4.4h\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "sqrdmulh v3.4s, v3.4s, v24.4s\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v30.4s, v15.8h, v4.8h\n"
+ "sqrdmulh v0.4s, v0.4s, v24.4s\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "sqrdmulh v6.4s, v6.4s, v24.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v8.16b, v3.16b, v25.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v20.4s\n"
+ "and v11.16b, v0.16b, v25.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v20.4s\n"
+ "and v29.16b, v6.16b, v25.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v20.4s\n"
+ "sqadd v9.4s, v9.4s, v23.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v13.16b, v30.16b, v26.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v22.16b, v26.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v23.16b, v2.16b, v26.16b\n"
+ "sqadd v3.4s, v3.4s, v8.4s\n"
+ "sshr v13.4s, v13.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v11.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v29.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v25.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqadd v30.4s, v30.4s, v13.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v21.4s\n"
+ "srshl v6.4s, v6.4s, v25.4s\n"
+ "sqadd v2.4s, v2.4s, v23.4s\n"
+ "srshl v9.4s, v9.4s, v26.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v26.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v26.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v26.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str d28, [x28, x13]\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "str d3, [x27, x13]\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d0, [x26, x13]\n"
+ "str d6, [x25, x13]\n"
+ "ldr q28, [x20, #0x0]\n"
+ "ldr q9, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "add x13, x13, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "ldr d23, [x23, x14]\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d10, [x22, x14]\n"
+ "ldr d11, [x21, x14]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d13, [x20, x14]\n"
+ "ldr x20, [x12, #0x20]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d27, [x20, x14]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q26, [x10, #0x0]\n"
+ "ldr q25, [x9, #0x0]\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr q24, [x10, #0x10]\n"
+ "ldr q20, [x9, #0x10]\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ldr x20, [x12, #0x38]\n"
+ "ldr d15, [x20, x14]\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr d10, [x20, x14]\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "ldr d23, [x20, x14]\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "ldr x20, [x12, #0x48]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "ldr d11, [x20, x14]\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "ldr x24, [x12, #0x50]\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v0.4s, v21.4h, v29.4h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x23, [x12, #0x60]\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "smlal v28.4s, v15.4h, v7.4h\n"
+ "ldr x22, [x12, #0x68]\n"
+ "ldr x21, [x12, #0x70]\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "ldr d13, [x24, x14]\n"
+ "smlal2 v22.4s, v21.8h, v29.8h\n"
+ "ldr d21, [x20, x14]\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "ldr x20, [x12, #0x78]\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal v6.4s, v10.4h, v4.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "tst x16, #0x7\n"
+ "smlal2 v9.4s, v15.8h, v7.8h\n"
+ "smlal v28.4s, v23.4h, v1.4h\n"
+ "add x10, x10, #0x20\n"
+ "add x9, x9, #0x20\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "ldr d27, [x23, x14]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v2.4s, v10.8h, v4.8h\n"
+ "ldr d10, [x22, x14]\n"
+ "smlal v3.4s, v15.4h, v19.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v0.4s, v11.4h, v31.4h\n"
+ "smlal v6.4s, v11.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "smlal v28.4s, v11.4h, v4.4h\n"
+ "smlal2 v30.4s, v15.8h, v19.8h\n"
+ "ldr d15, [x21, x14]\n"
+ "smlal2 v22.4s, v11.8h, v31.8h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal2 v2.4s, v11.8h, v8.8h\n"
+ "ldr d8, [x20, x14]\n"
+ "smlal v3.4s, v23.4h, v7.4h\n"
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v13.4h, v19.4h\n"
+ "smlal v6.4s, v21.4h, v1.4h\n"
+ "add x14, x14, #0x8\n"
+ "smlal2 v9.4s, v11.8h, v4.8h\n"
+ "smlal v28.4s, v13.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v7.8h\n"
+ "smlal2 v22.4s, v13.8h, v19.8h\n"
+ "smlal2 v2.4s, v21.8h, v1.8h\n"
+ "smlal v3.4s, v11.4h, v16.4h\n"
+ "smlal v0.4s, v27.4h, v17.4h\n"
+ "smlal v6.4s, v10.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v17.8h\n"
+ "smlal v28.4s, v27.4h, v29.4h\n"
+ "sqrdmulh v28.4s, v28.4s, v26.4s\n"
+ "smlal2 v30.4s, v11.8h, v16.8h\n"
+ "smlal2 v22.4s, v27.8h, v17.8h\n"
+ "and v1.16b, v28.16b, v25.16b\n"
+ "smlal2 v2.4s, v10.8h, v31.8h\n"
+ "smlal v3.4s, v21.4h, v31.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v0.4s, v15.4h, v16.4h\n"
+ "smlal v6.4s, v15.4h, v29.4h\n"
+ "sqadd v28.4s, v28.4s, v1.4s\n"
+ "smlal2 v9.4s, v27.8h, v29.8h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "sqrdmulh v9.4s, v9.4s, v24.4s\n"
+ "smlal2 v22.4s, v15.8h, v16.8h\n"
+ "smlal2 v2.4s, v15.8h, v29.8h\n"
+ "and v27.16b, v9.16b, v20.16b\n"
+ "smlal v3.4s, v10.4h, v4.4h\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "sqrdmulh v3.4s, v3.4s, v26.4s\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v30.4s, v10.8h, v4.8h\n"
+ "sqrdmulh v0.4s, v0.4s, v26.4s\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "sqrdmulh v6.4s, v6.4s, v26.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v24.4s\n"
+ "and v4.16b, v0.16b, v25.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v24.4s\n"
+ "and v17.16b, v6.16b, v25.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v24.4s\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v8.16b, v30.16b, v20.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v26.16b, v22.16b, v20.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v11.16b, v2.16b, v20.16b\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v4.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v25.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqadd v30.4s, v30.4s, v8.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqadd v22.4s, v22.4s, v26.4s\n"
+ "srshl v6.4s, v6.4s, v25.4s\n"
+ "sqadd v2.4s, v2.4s, v11.4s\n"
+ "srshl v9.4s, v9.4s, v20.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v20.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v20.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v20.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str d28, [x28, x13]\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "str d3, [x27, x13]\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d0, [x26, x13]\n"
+ "str d6, [x25, x13]\n"
+ "add x13, x13, #0x8\n"
+ "beq 64f\n"
+ "add x11, x11, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x16, #2, 5f\n"
+ "ld1 { v28.4s }, [x20], #0x10\n"
+ "tbz x16, #1, 4f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x16, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x16, #1, 6f\n"
+ "ld1 { v28.d }[0], [x20], #0x8\n"
+ "tbz x16, #0, 7f\n"
+ "ld1 { v28.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 7f\n"
+ "ld1 { v28.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d19, [x11, #0x0]\n"
+ "ldr d7, [x11, #0x8]\n"
+ "mov v3.16b, v28.16b\n"
+ "mov v30.16b, v9.16b\n"
+ "ldr d1, [x11, #0x10]\n"
+ "ldr d17, [x11, #0x18]\n"
+ "mov v0.16b, v28.16b\n"
+ "mov v22.16b, v9.16b\n"
+ "ldr d8, [x11, #0x20]\n"
+ "ldr d31, [x11, #0x28]\n"
+ "mov v6.16b, v28.16b\n"
+ "mov v2.16b, v9.16b\n"
+ "ldr d29, [x11, #0x30]\n"
+ "ldr d16, [x11, #0x38]\n"
+ "usubl v19.8h, v19.8b, v18.8b\n"
+ "usubl v7.8h, v7.8b, v18.8b\n"
+ "ldr d4, [x11, #0x40]\n"
+ "ldp x24, x23, [x12, #0x0]\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldp x22, x21, [x12, #0x10]\n"
+ "ldr x20, [x12, #0x20]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 9f\n"
+ "ld1 { v23.s }[0], [x24], #0x4\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "ld1 { v11.s }[0], [x22], #0x4\n"
+ "ld1 { v13.s }[0], [x21], #0x4\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 8f\n"
+ "ld1 { v23.h }[2], [x24], #0x2\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "ld1 { v11.h }[2], [x22], #0x2\n"
+ "ld1 { v13.h }[2], [x21], #0x2\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[6], [x24]\n"
+ "ld1 { v10.b }[6], [x23]\n"
+ "ld1 { v11.b }[6], [x22]\n"
+ "ld1 { v13.b }[6], [x21]\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[4], [x24]\n"
+ "ld1 { v10.b }[4], [x23]\n"
+ "ld1 { v11.b }[4], [x22]\n"
+ "ld1 { v13.b }[4], [x21]\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x16, #1, 10f\n"
+ "ld1 { v23.h }[0], [x24], #0x2\n"
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "ld1 { v11.h }[0], [x22], #0x2\n"
+ "ld1 { v13.h }[0], [x21], #0x2\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[2], [x24]\n"
+ "ld1 { v10.b }[2], [x23]\n"
+ "ld1 { v11.b }[2], [x22]\n"
+ "ld1 { v13.b }[2], [x21]\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 11f\n"
+ "ld1 { v23.b }[0], [x24]\n"
+ "ld1 { v10.b }[0], [x23]\n"
+ "ld1 { v11.b }[0], [x22]\n"
+ "ld1 { v13.b }[0], [x21]\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v28.4s, v23.4h, v8.4h\n"
+ "smlal2 v9.4s, v23.8h, v8.8h\n"
+ "ldr x20, [x12, #0x28]\n"
+ "smlal v3.4s, v23.4h, v17.4h\n"
+ "smlal2 v30.4s, v23.8h, v17.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v0.4s, v23.4h, v7.4h\n"
+ "smlal2 v22.4s, v23.8h, v7.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v6.4s, v23.4h, v19.4h\n"
+ "smlal2 v2.4s, v23.8h, v19.8h\n"
+ "ushll v13.8h, v13.8b, #0x0\n"
+ "smlal v28.4s, v10.4h, v19.4h\n"
+ "smlal2 v9.4s, v10.8h, v19.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v3.4s, v11.4h, v1.4h\n"
+ "smlal2 v30.4s, v11.8h, v1.8h\n"
+ "smlal v28.4s, v13.4h, v31.4h\n"
+ "smlal2 v9.4s, v13.8h, v31.8h\n"
+ "smlal v3.4s, v13.4h, v8.4h\n"
+ "smlal2 v30.4s, v13.8h, v8.8h\n"
+ "smlal v0.4s, v13.4h, v1.4h\n"
+ "smlal2 v22.4s, v13.8h, v1.8h\n"
+ "smlal v6.4s, v13.4h, v7.4h\n"
+ "smlal2 v2.4s, v13.8h, v7.8h\n"
+ "tbz x16, #2, 13f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 12f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x16, #1, 14f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 15f\n"
+ "ld1 { v26.b }[0], [x20]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v0.4s, v26.4h, v29.4h\n"
+ "smlal2 v22.4s, v26.8h, v29.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "smlal v28.4s, v27.4h, v16.4h\n"
+ "smlal2 v9.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v3.4s, v27.4h, v29.4h\n"
+ "smlal2 v30.4s, v27.8h, v29.8h\n"
+ "smlal v0.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "smlal v6.4s, v27.4h, v17.4h\n"
+ "smlal2 v2.4s, v27.8h, v17.8h\n"
+ "tbz x16, #2, 17f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 16f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x16, #1, 18f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 19f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x12, #0x38]\n"
+ "smlal v6.4s, v23.4h, v4.4h\n"
+ "smlal2 v2.4s, v23.8h, v4.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 21f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 20f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x16, #1, 22f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 23f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x12, #0x40]\n"
+ "smlal v28.4s, v21.4h, v7.4h\n"
+ "smlal2 v9.4s, v21.8h, v7.8h\n"
+ "smlal v3.4s, v21.4h, v19.4h\n"
+ "smlal2 v30.4s, v21.8h, v19.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 25f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 24f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x16, #1, 26f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 27f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x20, [x12, #0x48]\n"
+ "smlal v28.4s, v18.4h, v1.4h\n"
+ "smlal2 v9.4s, v18.8h, v1.8h\n"
+ "smlal v3.4s, v18.4h, v7.4h\n"
+ "smlal2 v30.4s, v18.8h, v7.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 29f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 28f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x16, #1, 30f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 31f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x12, #0x50]\n"
+ "smlal v28.4s, v15.4h, v4.4h\n"
+ "smlal2 v9.4s, v15.8h, v4.8h\n"
+ "smlal v3.4s, v15.4h, v16.4h\n"
+ "smlal2 v30.4s, v15.8h, v16.8h\n"
+ "add x20, x20, x14\n"
+ "smlal v0.4s, v15.4h, v31.4h\n"
+ "smlal2 v22.4s, v15.8h, v31.8h\n"
+ "smlal v6.4s, v15.4h, v8.4h\n"
+ "smlal2 v2.4s, v15.8h, v8.8h\n"
+ "tbz x16, #2, 33f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 32f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x16, #1, 34f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 35f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr x20, [x12, #0x58]\n"
+ "smlal v28.4s, v20.4h, v17.4h\n"
+ "smlal2 v9.4s, v20.8h, v17.8h\n"
+ "smlal v0.4s, v20.4h, v19.4h\n"
+ "smlal2 v22.4s, v20.8h, v19.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 37f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 36f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x16, #1, 38f\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 39f\n"
+ "ld1 { v11.b }[0], [x20]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v3.4s, v11.4h, v31.4h\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "smlal v6.4s, v11.4h, v1.4h\n"
+ "smlal2 v2.4s, v11.8h, v1.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 41f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 40f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x16, #1, 42f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 43f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x12, #0x68]\n"
+ "smlal v28.4s, v23.4h, v29.4h\n"
+ "smlal2 v9.4s, v23.8h, v29.8h\n"
+ "smlal v0.4s, v23.4h, v17.4h\n"
+ "smlal2 v22.4s, v23.8h, v17.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 45f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 44f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x16, #1, 46f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 47f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v3.4s, v20.4h, v4.4h\n"
+ "smlal2 v30.4s, v20.8h, v4.8h\n"
+ "smlal v6.4s, v20.4h, v31.4h\n"
+ "smlal2 v2.4s, v20.8h, v31.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 49f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 48f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x16, #1, 50f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 51f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "ldr x20, [x12, #0x78]\n"
+ "smlal v0.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "smlal v6.4s, v8.4h, v29.4h\n"
+ "smlal2 v2.4s, v8.8h, v29.8h\n"
+ "add x20, x20, x14\n"
+ "tbz x16, #2, 53f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x16, #1, 52f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x16, #1, 54f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 55f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ushll v8.8h, v8.8b, #0x0\n"
+ "smlal v0.4s, v8.4h, v4.4h\n"
+ "smlal2 v22.4s, v8.8h, v4.8h\n"
+ "smlal v6.4s, v8.4h, v16.4h\n"
+ "smlal2 v2.4s, v8.8h, v16.8h\n"
+ "tbz x16, #2, 57f\n"
+ "ld1 { v7.4s }, [x10], #0x10\n"
+ "ld1 { v23.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 56f\n"
+ "ld1 { v11.d }[0], [x10], #0x8\n"
+ "ld1 { v27.d }[0], [x9], #0x8\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v11.s }[2], [x10]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x16, #0, 59f\n"
+ "ld1 { v11.s }[0], [x10]\n"
+ "ld1 { v27.s }[0], [x9]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x16, #1, 58f\n"
+ "ld1 { v7.d }[0], [x10], #0x8\n"
+ "ld1 { v23.d }[0], [x9], #0x8\n"
+ "tbz x16, #0, 59f\n"
+ "ld1 { v7.s }[2], [x10]\n"
+ "ld1 { v23.s }[2], [x9]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 59f\n"
+ "ld1 { v7.s }[0], [x10]\n"
+ "ld1 { v23.s }[0], [x9]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v28.4s, v28.4s, v7.4s\n"
+ "and v20.16b, v28.16b, v23.16b\n"
+ "add x28, x28, x13\n"
+ "add x27, x27, x13\n"
+ "sqrdmulh v9.4s, v9.4s, v11.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "add x26, x26, x13\n"
+ "add x25, x25, x13\n"
+ "and v4.16b, v9.16b, v27.16b\n"
+ "sqrdmulh v3.4s, v3.4s, v7.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v7.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v7.4s\n"
+ "sqadd v28.4s, v28.4s, v20.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v19.16b, v3.16b, v23.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v11.4s\n"
+ "and v29.16b, v0.16b, v23.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v11.4s\n"
+ "and v26.16b, v6.16b, v23.16b\n"
+ "sqrdmulh v2.4s, v2.4s, v11.4s\n"
+ "sqadd v9.4s, v9.4s, v4.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v27.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v8.16b, v22.16b, v27.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v13.16b, v2.16b, v27.16b\n"
+ "sqadd v3.4s, v3.4s, v19.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v0.4s, v0.4s, v29.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
+ "sshr v13.4s, v13.4s, #0x1f\n"
+ "srshl v28.4s, v28.4s, v23.4s\n"
+ "srshl v3.4s, v3.4s, v23.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v0.4s, v0.4s, v23.4s\n"
+ "sqadd v22.4s, v22.4s, v8.4s\n"
+ "srshl v6.4s, v6.4s, v23.4s\n"
+ "sqadd v2.4s, v2.4s, v13.4s\n"
+ "srshl v9.4s, v9.4s, v27.4s\n"
+ "sqxtn v28.4h, v28.4s\n"
+ "srshl v30.4s, v30.4s, v27.4s\n"
+ "sqxtn v3.4h, v3.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v0.4h, v0.4s\n"
+ "srshl v2.4s, v2.4s, v27.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v28.8h, v9.4s\n"
+ "sqxtn2 v3.8h, v30.4s\n"
+ "sqxtn2 v0.8h, v22.4s\n"
+ "sqxtn2 v6.8h, v2.4s\n"
+ "sqadd v28.8h, v28.8h, v5.8h\n"
+ "sqadd v3.8h, v3.8h, v5.8h\n"
+ "sqadd v0.8h, v0.8h, v5.8h\n"
+ "sqadd v6.8h, v6.8h, v5.8h\n"
+ "smax v28.8h, v28.8h, v14.8h\n"
+ "smax v3.8h, v3.8h, v14.8h\n"
+ "smax v0.8h, v0.8h, v14.8h\n"
+ "smax v6.8h, v6.8h, v14.8h\n"
+ "smin v28.8h, v28.8h, v12.8h\n"
+ "smin v3.8h, v3.8h, v12.8h\n"
+ "smin v0.8h, v0.8h, v12.8h\n"
+ "smin v6.8h, v6.8h, v12.8h\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v3.16b, v3.16b, v3.16b\n"
+ "uzp1 v0.16b, v0.16b, v0.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "tbz x16, #2, 61f\n"
+ "st1 { v28.s }[0], [x28], #0x4\n"
+ "st1 { v3.s }[0], [x27], #0x4\n"
+ "st1 { v0.s }[0], [x26], #0x4\n"
+ "st1 { v6.s }[0], [x25], #0x4\n"
+ "tbz x16, #1, 60f\n"
+ "st1 { v28.h }[2], [x28], #0x2\n"
+ "st1 { v3.h }[2], [x27], #0x2\n"
+ "st1 { v0.h }[2], [x26], #0x2\n"
+ "st1 { v6.h }[2], [x25], #0x2\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[6], [x28], #0x1\n"
+ "st1 { v3.b }[6], [x27], #0x1\n"
+ "st1 { v0.b }[6], [x26], #0x1\n"
+ "st1 { v6.b }[6], [x25], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[4], [x28], #0x1\n"
+ "st1 { v3.b }[4], [x27], #0x1\n"
+ "st1 { v0.b }[4], [x26], #0x1\n"
+ "st1 { v6.b }[4], [x25], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x16, #1, 62f\n"
+ "st1 { v28.h }[0], [x28], #0x2\n"
+ "st1 { v3.h }[0], [x27], #0x2\n"
+ "st1 { v0.h }[0], [x26], #0x2\n"
+ "st1 { v6.h }[0], [x25], #0x2\n"
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[2], [x28], #0x1\n"
+ "st1 { v3.b }[2], [x27], #0x1\n"
+ "st1 { v0.b }[2], [x26], #0x1\n"
+ "st1 { v6.b }[2], [x25], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x16, #0, 63f\n"
+ "st1 { v28.b }[0], [x28], #0x1\n"
+ "st1 { v3.b }[0], [x27], #0x1\n"
+ "st1 { v0.b }[0], [x26], #0x1\n"
+ "st1 { v6.b }[0], [x25], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..9b646bc4f6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( // kernel entry point; parameter names below match the definition in the sibling generic.cpp
+  const unsigned int, // n_channels
+  const uint8_t *const *const, // inptrs: array of input row pointers
+  const uint8_t *const, // weights
+  const int32_t *const, // bias
+  const arm_gemm::Requantize32 &, // qp: requantization parameters (offsets, min/max)
+  const int32_t *const, // requant_muls: per-channel requantize multipliers
+  const int32_t *const, // requant_shifts: per-channel requantize shifts
+  uint8_t *const *const // outptrs: array of output row pointers
+);
+
+class a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t> // strategy wrapper: u8 in/weights/out, s32 accumulators
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3; // 3x3 filter window
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2; // stride-2 in both dimensions
+  constexpr static unsigned int stride_cols = 2;
+
+  a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {} // presumably (out_rows, out_cols, kern_rows, kern_cols, stride_rows, stride_cols) — confirm against Parent ctor
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } // fixed-length (non-SVE) vectors
+
+  Parent::KernelType kernel = a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; // bound to the hand-written asm kernel
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; } // accumulates two vector-lengths of channels per pass
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..6cb10a7bb2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1395 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ uint64_t n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v22.8h }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x17, #0x0\n"
+ "ld1r { v5.8h }, [x20]\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "ldr d30, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q2, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d26, [x27, x17]\n"
+ "ldr d18, [x26, x17]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d10, [x25, x17]\n"
+ "ldr d27, [x24, x17]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr d17, [x23, x17]\n"
+ "ldr d19, [x22, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d15, [x21, x17]\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q31, [x13, #0x0]\n"
+ "ldr q0, [x12, #0x0]\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr q29, [x13, #0x10]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "ldr x25, [x15, #0x60]\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal v20.4s, v18.4h, v24.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v16.4s, v26.4h, v23.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ldr d10, [x25, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x24, x17]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v24.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal2 v14.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal v20.4s, v10.4h, v12.4h\n"
+ "ldr x23, [x15, #0x50]\n"
+ "smlal v16.4s, v27.4h, v11.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ldr d19, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "ldr d17, [x20, x17]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal2 v1.4s, v10.8h, v12.8h\n"
+ "smlal2 v14.4s, v27.8h, v11.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal v20.4s, v18.4h, v23.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v16.4s, v26.4h, v7.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "ldr d12, [x23, x17]\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v23.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "smlal2 v14.4s, v26.8h, v7.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v21.4s, v19.4h, v23.4h\n"
+ "smlal v20.4s, v17.4h, v11.4h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v16.4s, v15.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v12.4h, v7.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v4.4s, v19.8h, v23.8h\n"
+ "ldr d23, [x22, x17]\n"
+ "ldr d19, [x21, x17]\n"
+ "smlal2 v1.4s, v17.8h, v11.8h\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal2 v14.4s, v15.8h, v25.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal v21.4s, v18.4h, v7.4h\n"
+ "smlal v20.4s, v26.4h, v3.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal v16.4s, v28.4h, v24.4h\n"
+ "smlal2 v2.4s, v12.8h, v7.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal v8.4s, v10.4h, v3.4h\n"
+ "smlal2 v4.4s, v18.8h, v7.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v1.4s, v26.8h, v3.8h\n"
+ "smlal2 v14.4s, v28.8h, v24.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "add x14, x14, #0x48\n"
+ "smlal v21.4s, v12.4h, v24.4h\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "add x17, x17, #0x8\n"
+ "subs x8, x8, #0x1\n"
+ "smlal v16.4s, v19.4h, v9.4h\n"
+ "smlal2 v2.4s, v10.8h, v3.8h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v4.4s, v12.8h, v24.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "smlal2 v14.4s, v19.8h, v9.8h\n"
+ "and v10.16b, v8.16b, v0.16b\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "smlal v16.4s, v11.4h, v3.4h\n"
+ "smlal2 v2.4s, v17.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v29.4s\n"
+ "smlal2 v4.4s, v27.8h, v9.8h\n"
+ "smlal2 v1.4s, v28.8h, v7.8h\n"
+ "and v12.16b, v2.16b, v25.16b\n"
+ "smlal2 v14.4s, v11.8h, v3.8h\n"
+ "smlal v21.4s, v15.4h, v30.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v31.4s\n"
+ "smlal v20.4s, v11.4h, v30.4h\n"
+ "smlal v16.4s, v18.4h, v30.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v31.4s\n"
+ "smlal2 v4.4s, v15.8h, v30.8h\n"
+ "smlal2 v1.4s, v11.8h, v30.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v31.4s\n"
+ "smlal2 v14.4s, v18.8h, v30.8h\n"
+ "sqadd v8.4s, v8.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v27.16b, v21.16b, v0.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v29.4s\n"
+ "and v24.16b, v20.16b, v0.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v19.16b, v16.16b, v0.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v29.4s\n"
+ "sqadd v2.4s, v2.4s, v12.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v25.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v17.16b, v1.16b, v25.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v15.16b, v14.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v24.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "sqadd v14.4s, v14.4s, v15.4s\n"
+ "srshl v2.4s, v2.4s, v25.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v25.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d8, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d16, [x28, x16]\n"
+ "ldr q8, [x20, #0x0]\n"
+ "ldr q2, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d30, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d26, [x27, x17]\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "ldr d18, [x26, x17]\n"
+ "ldr d10, [x25, x17]\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "ldr d27, [x24, x17]\n"
+ "ldr d17, [x23, x17]\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr d19, [x22, x17]\n"
+ "ldr d15, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q0, [x13, #0x0]\n"
+ "ldr q31, [x12, #0x0]\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr q29, [x13, #0x10]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "ldr x25, [x15, #0x60]\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ldr d18, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
+ "ldr d26, [x20, x17]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal v20.4s, v18.4h, v24.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v16.4s, v26.4h, v23.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ldr d10, [x25, x17]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x24, x17]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v24.8h\n"
+ "ldr d18, [x23, x17]\n"
+ "smlal2 v14.4s, v26.8h, v23.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "ldr x24, [x15, #0x98]\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal v20.4s, v10.4h, v12.4h\n"
+ "ldr x23, [x15, #0x50]\n"
+ "smlal v16.4s, v27.4h, v11.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ldr d19, [x21, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "ldr d17, [x20, x17]\n"
+ "ldr x22, [x15, #0x48]\n"
+ "smlal2 v1.4s, v10.8h, v12.8h\n"
+ "smlal2 v14.4s, v27.8h, v11.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal v20.4s, v18.4h, v23.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v16.4s, v26.4h, v7.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "ldr d12, [x23, x17]\n"
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "smlal2 v1.4s, v18.8h, v23.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "smlal2 v14.4s, v26.8h, v7.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v21.4s, v19.4h, v23.4h\n"
+ "smlal v20.4s, v17.4h, v11.4h\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v16.4s, v15.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "ldr d28, [x20, x17]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v12.4h, v7.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v4.4s, v19.8h, v23.8h\n"
+ "ldr d23, [x22, x17]\n"
+ "ldr d19, [x21, x17]\n"
+ "smlal2 v1.4s, v17.8h, v11.8h\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal2 v14.4s, v15.8h, v25.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal v21.4s, v18.4h, v7.4h\n"
+ "smlal v20.4s, v26.4h, v3.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v16.4s, v28.4h, v24.4h\n"
+ "smlal2 v2.4s, v12.8h, v7.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "tst x7, #0x7\n"
+ "smlal v8.4s, v10.4h, v3.4h\n"
+ "smlal2 v4.4s, v18.8h, v7.8h\n"
+ "ldr d18, [x20, x17]\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v1.4s, v26.8h, v3.8h\n"
+ "smlal2 v14.4s, v28.8h, v24.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "add x17, x17, #0x8\n"
+ "smlal v21.4s, v12.4h, v24.4h\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal v16.4s, v19.4h, v9.4h\n"
+ "smlal2 v2.4s, v10.8h, v3.8h\n"
+ "smlal v8.4s, v17.4h, v9.4h\n"
+ "smlal2 v4.4s, v12.8h, v24.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v0.4s\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "smlal2 v14.4s, v19.8h, v9.8h\n"
+ "and v23.16b, v8.16b, v31.16b\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v28.4h, v7.4h\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "smlal v16.4s, v11.4h, v3.4h\n"
+ "smlal2 v2.4s, v17.8h, v9.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v29.4s\n"
+ "smlal2 v4.4s, v27.8h, v9.8h\n"
+ "smlal2 v1.4s, v28.8h, v7.8h\n"
+ "and v7.16b, v2.16b, v25.16b\n"
+ "smlal2 v14.4s, v11.8h, v3.8h\n"
+ "smlal v21.4s, v15.4h, v30.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v0.4s\n"
+ "smlal v20.4s, v11.4h, v30.4h\n"
+ "smlal v16.4s, v18.4h, v30.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v0.4s\n"
+ "smlal2 v4.4s, v15.8h, v30.8h\n"
+ "smlal2 v1.4s, v11.8h, v30.8h\n"
+ "sqrdmulh v16.4s, v16.4s, v0.4s\n"
+ "smlal2 v14.4s, v18.8h, v30.8h\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v31.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v29.4s\n"
+ "and v24.16b, v20.16b, v31.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v29.4s\n"
+ "and v19.16b, v16.16b, v31.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v29.4s\n"
+ "sqadd v2.4s, v2.4s, v7.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v25.16b\n"
+ "sshr v24.4s, v24.4s, #0x1f\n"
+ "and v17.16b, v1.16b, v25.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v15.16b, v14.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v24.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v31.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v31.4s\n"
+ "sqadd v1.4s, v1.4s, v17.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqadd v14.4s, v14.4s, v15.4s\n"
+ "srshl v2.4s, v2.4s, v25.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v25.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d8, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d16, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 88f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v8.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v2.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v2.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v2.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v8.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v8.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v8.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d12, [x14, #0x0]\n"
+ "ldr d11, [x14, #0x8]\n"
+ "mov v21.16b, v8.16b\n"
+ "mov v4.16b, v2.16b\n"
+ "ldr d25, [x14, #0x10]\n"
+ "ldr d24, [x14, #0x18]\n"
+ "mov v20.16b, v8.16b\n"
+ "mov v1.16b, v2.16b\n"
+ "ldr d23, [x14, #0x20]\n"
+ "ldr d7, [x14, #0x28]\n"
+ "mov v16.16b, v8.16b\n"
+ "mov v14.16b, v2.16b\n"
+ "ldr d3, [x14, #0x30]\n"
+ "ldr d9, [x14, #0x38]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "ldr d30, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v24.8h, v24.8b, v6.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v7.8h, v7.8b, v6.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "usubl v3.8h, v3.8b, v6.8b\n"
+ "usubl v9.8h, v9.8b, v6.8b\n"
+ "usubl v30.8h, v30.8b, v6.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v26.s }[0], [x27], #0x4\n"
+ "ld1 { v18.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v27.s }[0], [x24], #0x4\n"
+ "ld1 { v17.s }[0], [x23], #0x4\n"
+ "ld1 { v19.s }[0], [x22], #0x4\n"
+ "ld1 { v15.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v26.h }[2], [x27], #0x2\n"
+ "ld1 { v18.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v17.h }[2], [x23], #0x2\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "ld1 { v15.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v26.b }[6], [x27]\n"
+ "ld1 { v18.b }[6], [x26]\n"
+ "ld1 { v10.b }[6], [x25]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v17.b }[6], [x23]\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "ld1 { v15.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v26.b }[4], [x27]\n"
+ "ld1 { v18.b }[4], [x26]\n"
+ "ld1 { v10.b }[4], [x25]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v17.b }[4], [x23]\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "ld1 { v15.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v26.h }[0], [x27], #0x2\n"
+ "ld1 { v18.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "ld1 { v27.h }[0], [x24], #0x2\n"
+ "ld1 { v17.h }[0], [x23], #0x2\n"
+ "ld1 { v19.h }[0], [x22], #0x2\n"
+ "ld1 { v15.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v26.b }[2], [x27]\n"
+ "ld1 { v18.b }[2], [x26]\n"
+ "ld1 { v10.b }[2], [x25]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v17.b }[2], [x23]\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "ld1 { v15.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v26.b }[0], [x27]\n"
+ "ld1 { v18.b }[0], [x26]\n"
+ "ld1 { v10.b }[0], [x25]\n"
+ "ld1 { v27.b }[0], [x24]\n"
+ "ld1 { v17.b }[0], [x23]\n"
+ "ld1 { v19.b }[0], [x22]\n"
+ "ld1 { v15.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v8.4s, v26.4h, v30.4h\n"
+ "smlal2 v2.4s, v26.8h, v30.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal v8.4s, v18.4h, v12.4h\n"
+ "smlal2 v2.4s, v18.8h, v12.8h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v21.4s, v26.4h, v3.4h\n"
+ "smlal2 v4.4s, v26.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v8.4s, v10.4h, v11.4h\n"
+ "smlal2 v2.4s, v10.8h, v11.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v21.4s, v27.4h, v11.4h\n"
+ "smlal2 v4.4s, v27.8h, v11.8h\n"
+ "smlal v8.4s, v19.4h, v24.4h\n"
+ "smlal2 v2.4s, v19.8h, v24.8h\n"
+ "ushll v17.8h, v17.8b, #0x0\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v21.4s, v17.4h, v25.4h\n"
+ "smlal2 v4.4s, v17.8h, v25.8h\n"
+ "smlal v8.4s, v15.4h, v23.4h\n"
+ "smlal2 v2.4s, v15.8h, v23.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v20.4s, v26.4h, v25.4h\n"
+ "smlal2 v1.4s, v26.8h, v25.8h\n"
+ "smlal v16.4s, v26.4h, v12.4h\n"
+ "smlal2 v14.4s, v26.8h, v12.8h\n"
+ "smlal v8.4s, v28.4h, v25.4h\n"
+ "smlal2 v2.4s, v28.8h, v25.8h\n"
+ "smlal v21.4s, v28.4h, v12.4h\n"
+ "smlal2 v4.4s, v28.8h, v12.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v31.4h, v23.4h\n"
+ "smlal2 v4.4s, v31.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v28.4h, v7.4h\n"
+ "smlal2 v4.4s, v28.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v8.4s, v27.4h, v7.4h\n"
+ "smlal2 v2.4s, v27.8h, v7.8h\n"
+ "smlal v21.4s, v27.4h, v24.4h\n"
+ "smlal2 v4.4s, v27.8h, v24.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v0.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v0.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v0.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v0.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v0.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v0.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v0.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v20.4s, v0.4h, v24.4h\n"
+ "smlal2 v1.4s, v0.8h, v24.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v8.4s, v15.4h, v3.4h\n"
+ "smlal2 v2.4s, v15.8h, v3.8h\n"
+ "smlal v20.4s, v15.4h, v12.4h\n"
+ "smlal2 v1.4s, v15.8h, v12.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v0.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v0.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v0.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v0.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v0.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v0.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v0.b }[0], [x20]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v0.4h, v23.4h\n"
+ "smlal2 v1.4s, v0.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v8.4s, v6.4h, v9.4h\n"
+ "smlal2 v2.4s, v6.8h, v9.8h\n"
+ "smlal v20.4s, v6.4h, v11.4h\n"
+ "smlal2 v1.4s, v6.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v16.4s, v27.4h, v23.4h\n"
+ "smlal2 v14.4s, v27.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v10.4h, v9.4h\n"
+ "smlal2 v4.4s, v10.8h, v9.8h\n"
+ "smlal v16.4s, v10.4h, v11.4h\n"
+ "smlal2 v14.4s, v10.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v16.4s, v28.4h, v7.4h\n"
+ "smlal2 v14.4s, v28.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v15.4h, v3.4h\n"
+ "smlal2 v1.4s, v15.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v21.4s, v6.4h, v30.4h\n"
+ "smlal2 v4.4s, v6.8h, v30.8h\n"
+ "smlal v16.4s, v6.4h, v25.4h\n"
+ "smlal2 v14.4s, v6.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v23.4h, v9.4h\n"
+ "smlal2 v1.4s, v23.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 66f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "ushll v12.8h, v12.8b, #0x0\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v12.4h, v7.4h\n"
+ "smlal2 v1.4s, v12.8h, v7.8h\n"
+ "smlal v16.4s, v12.4h, v24.4h\n"
+ "smlal2 v14.4s, v12.8h, v24.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x7, #1, 70f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v16.4s, v10.4h, v9.4h\n"
+ "smlal2 v14.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x7, #1, 74f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v20.4s, v15.4h, v30.4h\n"
+ "smlal2 v1.4s, v15.8h, v30.8h\n"
+ "smlal v16.4s, v15.4h, v3.4h\n"
+ "smlal2 v14.4s, v15.8h, v3.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x7, #1, 78f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v16.4s, v28.4h, v30.4h\n"
+ "smlal2 v14.4s, v28.8h, v30.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v19.4s }, [x13], #0x10\n"
+ "ld1 { v23.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v24.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v24.s }[2], [x12]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v24.s }[0], [x12]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 82f\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v23.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v23.s }[2], [x12]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v23.s }[0], [x12]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v8.4s, v8.4s, v19.4s\n"
+ "and v17.16b, v8.16b, v23.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v2.4s, v2.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v11.16b, v2.16b, v24.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v17.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v28.16b, v21.16b, v23.16b\n"
+ "sqrdmulh v4.4s, v4.4s, v18.4s\n"
+ "and v17.16b, v20.16b, v23.16b\n"
+ "sqrdmulh v1.4s, v1.4s, v18.4s\n"
+ "and v19.16b, v16.16b, v23.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sqadd v2.4s, v2.4s, v11.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v18.16b, v4.16b, v24.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v12.16b, v1.16b, v24.16b\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "and v25.16b, v14.16b, v24.16b\n"
+ "sqadd v21.4s, v21.4s, v28.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v17.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v19.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "srshl v8.4s, v8.4s, v23.4s\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "sqadd v4.4s, v4.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "sqadd v1.4s, v1.4s, v12.4s\n"
+ "srshl v16.4s, v16.4s, v23.4s\n"
+ "sqadd v14.4s, v14.4s, v25.4s\n"
+ "srshl v2.4s, v2.4s, v24.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "srshl v4.4s, v4.4s, v24.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v1.4s, v1.4s, v24.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v14.4s, v14.4s, v24.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "sqxtn2 v8.8h, v2.4s\n"
+ "sqxtn2 v21.8h, v4.4s\n"
+ "sqxtn2 v20.8h, v1.4s\n"
+ "sqxtn2 v16.8h, v14.4s\n"
+ "sqadd v8.8h, v8.8h, v22.8h\n"
+ "sqadd v21.8h, v21.8h, v22.8h\n"
+ "sqadd v20.8h, v20.8h, v22.8h\n"
+ "sqadd v16.8h, v16.8h, v22.8h\n"
+ "smax v8.8h, v8.8h, v13.8h\n"
+ "smax v21.8h, v21.8h, v13.8h\n"
+ "smax v20.8h, v20.8h, v13.8h\n"
+ "smax v16.8h, v16.8h, v13.8h\n"
+ "smin v8.8h, v8.8h, v5.8h\n"
+ "smin v21.8h, v21.8h, v5.8h\n"
+ "smin v20.8h, v20.8h, v5.8h\n"
+ "smin v16.8h, v16.8h, v5.8h\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v8.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v16.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v8.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v8.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v16.b }[6], [x28], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v8.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v16.b }[4], [x28], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 86f\n"
+ "st1 { v8.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v16.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v8.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v16.b }[2], [x28], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v8.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v16.b }[0], [x28], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..39601fd8e4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int,
+ const uint8_t *const *const,
+ const uint8_t *const,
+ const int32_t *const,
+ const arm_gemm::Requantize32 &,
+ const int32_t *const,
+ const int32_t *const,
+ uint8_t *const *const
+);
+
+// Strategy wrapper that plugs the hand-written AArch64 assembly kernel
+// (a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl) into the generic
+// depthfirst depthwise-convolution framework.  Template arguments: uint8
+// input, uint8 weights, uint8 output, int32 accumulators.
+class a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  // 5x5 convolution window.
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  // Unit stride in both spatial dimensions.
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Parent(output_rows, output_cols, kernel_rows, kernel_cols,
+  //        stride_rows, stride_cols): each kernel call produces a 2x2
+  // output tile.  The CPUInfo argument is accepted but unused here.
+  a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+  // Fixed-width NEON implementation — no scalable (SVE) vector length.
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  Parent::KernelType kernel = a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  // Accumulator depth reported to the framework, in vector lengths;
+  // presumably matches the implementation's per-iteration channel count —
+  // TODO(review): confirm against the impl's main loop.
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..9316732632
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2185 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+  // Parameter block marshalled for the inline-assembly kernel below, which
+  // reads every field at a fixed byte offset (see the offsetof_Params_*
+  // operands passed to the asm).  Do not reorder or resize these members.
+  struct Params
+  {
+    uint64_t n_channels;                    // number of channels to process
+    const void *weights;                    // packed kernel weights
+    const int32_t *bias;                    // per-channel bias values
+    const arm_gemm::Requantize32 *requant;  // offsets and clamp bounds
+    const int32_t *const requant_muls;      // per-channel requantise multipliers
+    const int32_t *const requant_shifts;    // per-channel requantise shifts
+    uint8_t *const *const outptrs;          // output row pointers
+    const uint8_t *inptrs[36];              // input pointers, permuted (see ctor)
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const void *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      // Permute the first 14 input pointers into the order in which the
+      // assembly kernel consumes them (it indexes this array at fixed
+      // offsets from x6); entries 14..35 are copied through unchanged.
+      inptrs[0] = inptrs_raw[0];
+      inptrs[1] = inptrs_raw[1];
+      inptrs[2] = inptrs_raw[6];
+      inptrs[3] = inptrs_raw[7];
+      inptrs[4] = inptrs_raw[2];
+      inptrs[5] = inptrs_raw[8];
+      inptrs[6] = inptrs_raw[3];
+      inptrs[7] = inptrs_raw[4];
+      inptrs[8] = inptrs_raw[11];
+      inptrs[9] = inptrs_raw[12];
+      inptrs[10] = inptrs_raw[9];
+      inptrs[11] = inptrs_raw[10];
+      inptrs[12] = inptrs_raw[5];
+      inptrs[13] = inptrs_raw[13];
+      inptrs[14] = inptrs_raw[14];
+      inptrs[15] = inptrs_raw[15];
+      inptrs[16] = inptrs_raw[16];
+      inptrs[17] = inptrs_raw[17];
+      inptrs[18] = inptrs_raw[18];
+      inptrs[19] = inptrs_raw[19];
+      inptrs[20] = inptrs_raw[20];
+      inptrs[21] = inptrs_raw[21];
+      inptrs[22] = inptrs_raw[22];
+      inptrs[23] = inptrs_raw[23];
+      inptrs[24] = inptrs_raw[24];
+      inptrs[25] = inptrs_raw[25];
+      inptrs[26] = inptrs_raw[26];
+      inptrs[27] = inptrs_raw[27];
+      inptrs[28] = inptrs_raw[28];
+      inptrs[29] = inptrs_raw[29];
+      inptrs[30] = inptrs_raw[30];
+      inptrs[31] = inptrs_raw[31];
+      inptrs[32] = inptrs_raw[32];
+      inptrs[33] = inptrs_raw[33];
+      inptrs[34] = inptrs_raw[34];
+      inptrs[35] = inptrs_raw[35];
+
+    }
+  };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x2, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x3, x2, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v2.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1r { v25.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "mov x4, #0x0\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "mov x5, #0x0\n"
+ "add x6, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x16, x15, [x22, #0x0]\n"
+ "ldp x14, x13, [x22, #0x10]\n"
+ "cbz x3, 3f\n"
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "subs x3, x3, #0x1\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ldr q13, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "mov v7.16b, v13.16b\n"
+ "mov v14.16b, v24.16b\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "ldr d10, [x9, x4]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldr d16, [x28, x4]\n"
+ "ldr d23, [x27, x4]\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ldr d30, [x26, x4]\n"
+ "ldr d4, [x25, x4]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr d28, [x24, x4]\n"
+ "ldr d31, [x23, x4]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr d1, [x22, x4]\n"
+ "ldr d9, [x21, x4]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "ldr d11, [x20, x4]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr d5, [x7, #0x28]\n"
+ "ldr d6, [x7, #0x30]\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "ldr d19, [x7, #0x38]\n"
+ "ldr d0, [x7, #0x40]\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "ldr d10, [x7, #0x48]\n"
+ "ldr d20, [x7, #0x50]\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "ldr x21, [x6, #0x50]\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "ldr d16, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "ldr d21, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ldr x22, [x6, #0x60]\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "ldr x21, [x6, #0x70]\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "ldr d4, [x22, x4]\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "ldr d15, [x20, x4]\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal v8.4s, v16.4h, v29.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "ldr x22, [x6, #0x80]\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "ldr d31, [x21, x4]\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v17.4s, v16.8h, v29.8h\n"
+ "ldr d29, [x20, x4]\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "smlal v27.4s, v16.4h, v18.4h\n"
+ "smlal v8.4s, v21.4h, v18.4h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v5.4h\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x21, [x6, #0x90]\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "ldr d1, [x22, x4]\n"
+ "smlal2 v22.4s, v16.8h, v18.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v17.4s, v21.8h, v18.8h\n"
+ "ldr d18, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v3.4h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "ldr x20, [x6, #0x98]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v24.4s, v23.8h, v5.8h\n"
+ "ldr d23, [x7, #0x58]\n"
+ "smlal v13.4s, v30.4h, v6.4h\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "smlal2 v14.4s, v4.8h, v3.8h\n"
+ "ldr d4, [x21, x4]\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "ldr x23, [x6, #0xa0]\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x4]\n"
+ "smlal v7.4s, v30.4h, v5.4h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v27.4s, v11.4h, v5.4h\n"
+ "smlal v8.4s, v15.4h, v5.4h\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ldr x22, [x6, #0xa8]\n"
+ "smlal2 v24.4s, v30.8h, v6.8h\n"
+ "smlal v13.4s, v28.4h, v19.4h\n"
+ "ldr x21, [x6, #0xb0]\n"
+ "ldr x20, [x6, #0xb8]\n"
+ "smlal2 v14.4s, v30.8h, v5.8h\n"
+ "ldr d30, [x7, #0x60]\n"
+ "smlal2 v22.4s, v11.8h, v5.8h\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "smlal2 v17.4s, v15.8h, v5.8h\n"
+ "ldr d5, [x23, x4]\n"
+ "smlal v7.4s, v28.4h, v6.4h\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "smlal v27.4s, v15.4h, v6.4h\n"
+ "smlal v8.4s, v31.4h, v6.4h\n"
+ "ldr x12, [x6, #0xc0]\n"
+ "ldr x11, [x6, #0xc8]\n"
+ "smlal2 v24.4s, v28.8h, v19.8h\n"
+ "smlal v13.4s, v16.4h, v0.4h\n"
+ "ldr x10, [x6, #0xd0]\n"
+ "ldr x9, [x6, #0xd8]\n"
+ "smlal2 v14.4s, v28.8h, v6.8h\n"
+ "ldr d28, [x7, #0x68]\n"
+ "smlal2 v22.4s, v15.8h, v6.8h\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "ldr d6, [x22, x4]\n"
+ "smlal v7.4s, v16.4h, v19.4h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal v27.4s, v31.4h, v19.4h\n"
+ "smlal v8.4s, v29.4h, v19.4h\n"
+ "ldr x28, [x6, #0xe0]\n"
+ "ldr x27, [x6, #0xe8]\n"
+ "smlal2 v24.4s, v16.8h, v0.8h\n"
+ "smlal v13.4s, v21.4h, v10.4h\n"
+ "ldr x26, [x6, #0xf0]\n"
+ "ldr x25, [x6, #0xf8]\n"
+ "smlal2 v14.4s, v16.8h, v19.8h\n"
+ "ldr d16, [x7, #0x70]\n"
+ "smlal2 v22.4s, v31.8h, v19.8h\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "smlal2 v17.4s, v29.8h, v19.8h\n"
+ "ldr d19, [x21, x4]\n"
+ "smlal v7.4s, v21.4h, v0.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v27.4s, v29.4h, v0.4h\n"
+ "smlal v8.4s, v1.4h, v0.4h\n"
+ "ldr x24, [x6, #0x100]\n"
+ "ldr x23, [x6, #0x108]\n"
+ "smlal2 v24.4s, v21.8h, v10.8h\n"
+ "smlal v13.4s, v11.4h, v20.4h\n"
+ "ldr x22, [x6, #0x110]\n"
+ "ldr x21, [x6, #0x118]\n"
+ "smlal2 v14.4s, v21.8h, v0.8h\n"
+ "ldr d21, [x7, #0x78]\n"
+ "smlal2 v22.4s, v29.8h, v0.8h\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "smlal2 v17.4s, v1.8h, v0.8h\n"
+ "ldr d0, [x20, x4]\n"
+ "smlal v7.4s, v9.4h, v10.4h\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal v27.4s, v1.4h, v10.4h\n"
+ "smlal v8.4s, v18.4h, v10.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "subs x3, x3, #0x1\n"
+ "smlal2 v24.4s, v11.8h, v20.8h\n"
+ "ldr d11, [x7, #0x80]\n"
+ "smlal v13.4s, v15.4h, v23.4h\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "smlal2 v14.4s, v9.8h, v10.8h\n"
+ "ldr d9, [x12, x4]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal2 v17.4s, v18.8h, v10.8h\n"
+ "ldr d10, [x11, x4]\n"
+ "smlal v7.4s, v15.4h, v20.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v27.4s, v4.4h, v20.4h\n"
+ "smlal v8.4s, v3.4h, v20.4h\n"
+ "smlal2 v24.4s, v15.8h, v23.8h\n"
+ "smlal v13.4s, v31.4h, v30.4h\n"
+ "smlal2 v14.4s, v15.8h, v20.8h\n"
+ "ldr d15, [x7, #0x88]\n"
+ "smlal2 v22.4s, v4.8h, v20.8h\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "smlal2 v17.4s, v3.8h, v20.8h\n"
+ "ldr d20, [x10, x4]\n"
+ "smlal v7.4s, v31.4h, v23.4h\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal v27.4s, v3.4h, v23.4h\n"
+ "smlal v8.4s, v5.4h, v23.4h\n"
+ "smlal2 v24.4s, v31.8h, v30.8h\n"
+ "smlal v13.4s, v29.4h, v28.4h\n"
+ "smlal2 v14.4s, v31.8h, v23.8h\n"
+ "ldr d31, [x7, #0x90]\n"
+ "smlal2 v22.4s, v3.8h, v23.8h\n"
+ "usubl v31.8h, v31.8b, v2.8b\n"
+ "smlal2 v17.4s, v5.8h, v23.8h\n"
+ "ldr d23, [x9, x4]\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v27.4s, v5.4h, v30.4h\n"
+ "smlal v8.4s, v6.4h, v30.4h\n"
+ "smlal2 v24.4s, v29.8h, v28.8h\n"
+ "smlal v13.4s, v1.4h, v16.4h\n"
+ "smlal2 v14.4s, v29.8h, v30.8h\n"
+ "ldr d29, [x7, #0x98]\n"
+ "smlal2 v22.4s, v5.8h, v30.8h\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "smlal2 v17.4s, v6.8h, v30.8h\n"
+ "ldr d30, [x28, x4]\n"
+ "smlal v7.4s, v1.4h, v28.4h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v27.4s, v6.4h, v28.4h\n"
+ "smlal v8.4s, v19.4h, v28.4h\n"
+ "smlal2 v24.4s, v1.8h, v16.8h\n"
+ "smlal v13.4s, v4.4h, v21.4h\n"
+ "smlal2 v14.4s, v1.8h, v28.8h\n"
+ "ldr d1, [x7, #0xa0]\n"
+ "smlal2 v22.4s, v6.8h, v28.8h\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "smlal2 v17.4s, v19.8h, v28.8h\n"
+ "ldr d28, [x27, x4]\n"
+ "smlal v7.4s, v18.4h, v16.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v27.4s, v19.4h, v16.4h\n"
+ "smlal v8.4s, v0.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v21.8h\n"
+ "ldr d4, [x7, #0xa8]\n"
+ "smlal v13.4s, v3.4h, v11.4h\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "smlal2 v14.4s, v18.8h, v16.8h\n"
+ "ldr d18, [x26, x4]\n"
+ "smlal2 v22.4s, v19.8h, v16.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v17.4s, v0.8h, v16.8h\n"
+ "ldr d16, [x25, x4]\n"
+ "smlal v7.4s, v3.4h, v21.4h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v27.4s, v9.4h, v21.4h\n"
+ "smlal v8.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v3.8h, v11.8h\n"
+ "smlal v13.4s, v5.4h, v15.4h\n"
+ "smlal2 v14.4s, v3.8h, v21.8h\n"
+ "ldr d3, [x7, #0xb0]\n"
+ "smlal2 v22.4s, v9.8h, v21.8h\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "smlal2 v17.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x24, x4]\n"
+ "smlal v7.4s, v5.4h, v11.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v27.4s, v10.4h, v11.4h\n"
+ "smlal v8.4s, v20.4h, v11.4h\n"
+ "smlal2 v24.4s, v5.8h, v15.8h\n"
+ "smlal v13.4s, v6.4h, v31.4h\n"
+ "smlal2 v14.4s, v5.8h, v11.8h\n"
+ "ldr d5, [x7, #0xb8]\n"
+ "smlal2 v22.4s, v10.8h, v11.8h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "smlal2 v17.4s, v20.8h, v11.8h\n"
+ "ldr d11, [x23, x4]\n"
+ "smlal v7.4s, v6.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v27.4s, v20.4h, v15.4h\n"
+ "smlal v8.4s, v23.4h, v15.4h\n"
+ "smlal2 v24.4s, v6.8h, v31.8h\n"
+ "smlal v13.4s, v19.4h, v29.4h\n"
+ "smlal2 v14.4s, v6.8h, v15.8h\n"
+ "ldr d6, [x7, #0xc0]\n"
+ "smlal2 v22.4s, v20.8h, v15.8h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "smlal2 v17.4s, v23.8h, v15.8h\n"
+ "ldr d15, [x22, x4]\n"
+ "smlal v7.4s, v19.4h, v31.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v27.4s, v23.4h, v31.4h\n"
+ "smlal v8.4s, v30.4h, v31.4h\n"
+ "add x7, x7, #0xc8\n"
+ "smlal2 v24.4s, v19.8h, v29.8h\n"
+ "smlal v13.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v19.8h, v31.8h\n"
+ "ldr d19, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v31.8h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v31.8h\n"
+ "ldr q31, [x8, #0x0]\n"
+ "smlal v7.4s, v0.4h, v29.4h\n"
+ "add x4, x4, #0x8\n"
+ "smlal v27.4s, v30.4h, v29.4h\n"
+ "smlal v8.4s, v28.4h, v29.4h\n"
+ "smlal2 v24.4s, v9.8h, v1.8h\n"
+ "ldr q9, [x17, #0x0]\n"
+ "smlal v13.4s, v10.4h, v4.4h\n"
+ "smlal2 v14.4s, v0.8h, v29.8h\n"
+ "ldr q0, [x8, #0x10]\n"
+ "smlal2 v22.4s, v30.8h, v29.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v17.4s, v28.8h, v29.8h\n"
+ "ldr q29, [x17, #0x10]\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "add x17, x17, #0x20\n"
+ "smlal v27.4s, v18.4h, v1.4h\n"
+ "smlal v8.4s, v16.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v4.8h\n"
+ "smlal v13.4s, v20.4h, v3.4h\n"
+ "smlal2 v14.4s, v10.8h, v1.8h\n"
+ "smlal2 v22.4s, v18.8h, v1.8h\n"
+ "smlal2 v17.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v20.4h, v4.4h\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v8.4s, v21.4h, v4.4h\n"
+ "smlal2 v24.4s, v20.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v5.4h\n"
+ "smlal2 v14.4s, v20.8h, v4.8h\n"
+ "smlal2 v22.4s, v16.8h, v4.8h\n"
+ "smlal2 v17.4s, v21.8h, v4.8h\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v24.4s, v23.8h, v5.8h\n"
+ "smlal v13.4s, v30.4h, v6.4h\n"
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "smlal2 v14.4s, v23.8h, v3.8h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "and v23.16b, v13.16b, v9.16b\n"
+ "smlal2 v17.4s, v11.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v5.4h\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "smlal v27.4s, v11.4h, v5.4h\n"
+ "smlal v8.4s, v15.4h, v5.4h\n"
+ "sqadd v13.4s, v13.4s, v23.4s\n"
+ "smlal2 v24.4s, v30.8h, v6.8h\n"
+ "smlal2 v14.4s, v30.8h, v5.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v0.4s\n"
+ "smlal2 v22.4s, v11.8h, v5.8h\n"
+ "smlal2 v17.4s, v15.8h, v5.8h\n"
+ "and v10.16b, v24.16b, v29.16b\n"
+ "smlal v7.4s, v28.4h, v6.4h\n"
+ "smlal v27.4s, v15.4h, v6.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v31.4s\n"
+ "smlal v8.4s, v19.4h, v6.4h\n"
+ "smlal2 v14.4s, v28.8h, v6.8h\n"
+ "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+ "smlal2 v22.4s, v15.8h, v6.8h\n"
+ "smlal2 v17.4s, v19.8h, v6.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v28.16b, v7.16b, v9.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v0.4s\n"
+ "and v20.16b, v27.16b, v9.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v0.4s\n"
+ "and v23.16b, v8.16b, v9.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v0.4s\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v22.16b, v29.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v29.16b\n"
+ "sqadd v7.4s, v7.4s, v28.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v23.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v9.4s\n"
+ "srshl v7.4s, v7.4s, v9.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v30.4s\n"
+ "srshl v8.4s, v8.4s, v9.4s\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "srshl v24.4s, v24.4s, v29.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v29.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v29.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x16, x5]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str d7, [x15, x5]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d27, [x14, x5]\n"
+ "str d8, [x13, x5]\n"
+ "ldr q13, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "add x5, x5, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "mov v7.16b, v13.16b\n"
+ "mov v14.16b, v24.16b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "ldr d10, [x9, x4]\n"
+ "ldr d16, [x28, x4]\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "ldr d23, [x27, x4]\n"
+ "ldr d30, [x26, x4]\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr d4, [x25, x4]\n"
+ "ldr d28, [x24, x4]\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr d31, [x23, x4]\n"
+ "ldr d1, [x22, x4]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "ldr d9, [x21, x4]\n"
+ "ldr d11, [x20, x4]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr d0, [x7, #0x28]\n"
+ "ldr d20, [x7, #0x30]\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "ldr d6, [x7, #0x38]\n"
+ "ldr d19, [x7, #0x40]\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "ldr d10, [x7, #0x48]\n"
+ "ldr d5, [x7, #0x50]\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "ldr x21, [x6, #0x50]\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "ldr d16, [x21, x4]\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "ldr d21, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ldr x22, [x6, #0x60]\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "ldr x21, [x6, #0x70]\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "ldr d4, [x22, x4]\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "ldr d15, [x20, x4]\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal v8.4s, v16.4h, v29.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "ldr x22, [x6, #0x80]\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "ldr d31, [x21, x4]\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v17.4s, v16.8h, v29.8h\n"
+ "ldr d29, [x20, x4]\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "smlal v27.4s, v16.4h, v18.4h\n"
+ "smlal v8.4s, v21.4h, v18.4h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v0.4h\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x21, [x6, #0x90]\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "ldr d1, [x22, x4]\n"
+ "smlal2 v22.4s, v16.8h, v18.8h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v17.4s, v21.8h, v18.8h\n"
+ "ldr d18, [x20, x4]\n"
+ "smlal v7.4s, v4.4h, v3.4h\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "ldr x20, [x6, #0x98]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v24.4s, v23.8h, v0.8h\n"
+ "ldr d23, [x7, #0x58]\n"
+ "smlal v13.4s, v30.4h, v20.4h\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "smlal2 v14.4s, v4.8h, v3.8h\n"
+ "ldr d4, [x21, x4]\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "ldr x22, [x6, #0xa0]\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x4]\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v27.4s, v11.4h, v0.4h\n"
+ "smlal v8.4s, v15.4h, v0.4h\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "ldr x21, [x6, #0xa8]\n"
+ "smlal2 v24.4s, v30.8h, v20.8h\n"
+ "smlal v13.4s, v28.4h, v6.4h\n"
+ "ldr x20, [x6, #0xb0]\n"
+ "ldr x12, [x6, #0xb8]\n"
+ "smlal2 v14.4s, v30.8h, v0.8h\n"
+ "ldr d30, [x7, #0x60]\n"
+ "smlal2 v22.4s, v11.8h, v0.8h\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "smlal2 v17.4s, v15.8h, v0.8h\n"
+ "ldr d0, [x22, x4]\n"
+ "smlal v7.4s, v28.4h, v20.4h\n"
+ "ushll v0.8h, v0.8b, #0x0\n"
+ "smlal v27.4s, v15.4h, v20.4h\n"
+ "smlal v8.4s, v31.4h, v20.4h\n"
+ "ldr x11, [x6, #0xc0]\n"
+ "ldr x10, [x6, #0xc8]\n"
+ "smlal2 v24.4s, v28.8h, v6.8h\n"
+ "smlal v13.4s, v16.4h, v19.4h\n"
+ "ldr x9, [x6, #0xd0]\n"
+ "ldr x28, [x6, #0xd8]\n"
+ "smlal2 v14.4s, v28.8h, v20.8h\n"
+ "ldr d28, [x7, #0x68]\n"
+ "smlal2 v22.4s, v15.8h, v20.8h\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "smlal2 v17.4s, v31.8h, v20.8h\n"
+ "ldr d20, [x21, x4]\n"
+ "smlal v7.4s, v16.4h, v6.4h\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "smlal v27.4s, v31.4h, v6.4h\n"
+ "smlal v8.4s, v29.4h, v6.4h\n"
+ "ldr x27, [x6, #0xe0]\n"
+ "ldr x26, [x6, #0xe8]\n"
+ "smlal2 v24.4s, v16.8h, v19.8h\n"
+ "smlal v13.4s, v21.4h, v10.4h\n"
+ "ldr x25, [x6, #0xf0]\n"
+ "ldr x24, [x6, #0xf8]\n"
+ "smlal2 v14.4s, v16.8h, v6.8h\n"
+ "ldr d16, [x7, #0x70]\n"
+ "smlal2 v22.4s, v31.8h, v6.8h\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "smlal2 v17.4s, v29.8h, v6.8h\n"
+ "ldr d6, [x20, x4]\n"
+ "smlal v7.4s, v21.4h, v19.4h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal v27.4s, v29.4h, v19.4h\n"
+ "smlal v8.4s, v1.4h, v19.4h\n"
+ "ldr x23, [x6, #0x100]\n"
+ "ldr x22, [x6, #0x108]\n"
+ "smlal2 v24.4s, v21.8h, v10.8h\n"
+ "smlal v13.4s, v11.4h, v5.4h\n"
+ "ldr x21, [x6, #0x110]\n"
+ "ldr x20, [x6, #0x118]\n"
+ "smlal2 v14.4s, v21.8h, v19.8h\n"
+ "ldr d21, [x7, #0x78]\n"
+ "smlal2 v22.4s, v29.8h, v19.8h\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "smlal2 v17.4s, v1.8h, v19.8h\n"
+ "ldr d19, [x12, x4]\n"
+ "smlal v7.4s, v9.4h, v10.4h\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "smlal v27.4s, v1.4h, v10.4h\n"
+ "smlal v8.4s, v18.4h, v10.4h\n"
+ "tst x2, #0x7\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "ldr d11, [x7, #0x80]\n"
+ "smlal v13.4s, v15.4h, v23.4h\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "smlal2 v14.4s, v9.8h, v10.8h\n"
+ "ldr d9, [x11, x4]\n"
+ "smlal2 v22.4s, v1.8h, v10.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal2 v17.4s, v18.8h, v10.8h\n"
+ "ldr d10, [x10, x4]\n"
+ "smlal v7.4s, v15.4h, v5.4h\n"
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "smlal v27.4s, v4.4h, v5.4h\n"
+ "smlal v8.4s, v3.4h, v5.4h\n"
+ "smlal2 v24.4s, v15.8h, v23.8h\n"
+ "smlal v13.4s, v31.4h, v30.4h\n"
+ "smlal2 v14.4s, v15.8h, v5.8h\n"
+ "ldr d15, [x7, #0x88]\n"
+ "smlal2 v22.4s, v4.8h, v5.8h\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "smlal2 v17.4s, v3.8h, v5.8h\n"
+ "ldr d5, [x9, x4]\n"
+ "smlal v7.4s, v31.4h, v23.4h\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "smlal v27.4s, v3.4h, v23.4h\n"
+ "smlal v8.4s, v0.4h, v23.4h\n"
+ "smlal2 v24.4s, v31.8h, v30.8h\n"
+ "smlal v13.4s, v29.4h, v28.4h\n"
+ "smlal2 v14.4s, v31.8h, v23.8h\n"
+ "ldr d31, [x7, #0x90]\n"
+ "smlal2 v22.4s, v3.8h, v23.8h\n"
+ "usubl v31.8h, v31.8b, v2.8b\n"
+ "smlal2 v17.4s, v0.8h, v23.8h\n"
+ "ldr d23, [x28, x4]\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v27.4s, v0.4h, v30.4h\n"
+ "smlal v8.4s, v20.4h, v30.4h\n"
+ "smlal2 v24.4s, v29.8h, v28.8h\n"
+ "smlal v13.4s, v1.4h, v16.4h\n"
+ "smlal2 v14.4s, v29.8h, v30.8h\n"
+ "ldr d29, [x7, #0x98]\n"
+ "smlal2 v22.4s, v0.8h, v30.8h\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "smlal2 v17.4s, v20.8h, v30.8h\n"
+ "ldr d30, [x27, x4]\n"
+ "smlal v7.4s, v1.4h, v28.4h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v27.4s, v20.4h, v28.4h\n"
+ "smlal v8.4s, v6.4h, v28.4h\n"
+ "smlal2 v24.4s, v1.8h, v16.8h\n"
+ "smlal v13.4s, v4.4h, v21.4h\n"
+ "smlal2 v14.4s, v1.8h, v28.8h\n"
+ "ldr d1, [x7, #0xa0]\n"
+ "smlal2 v22.4s, v20.8h, v28.8h\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "smlal2 v17.4s, v6.8h, v28.8h\n"
+ "ldr d28, [x26, x4]\n"
+ "smlal v7.4s, v18.4h, v16.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v27.4s, v6.4h, v16.4h\n"
+ "smlal v8.4s, v19.4h, v16.4h\n"
+ "smlal2 v24.4s, v4.8h, v21.8h\n"
+ "ldr d4, [x7, #0xa8]\n"
+ "smlal v13.4s, v3.4h, v11.4h\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "smlal2 v14.4s, v18.8h, v16.8h\n"
+ "ldr d18, [x25, x4]\n"
+ "smlal2 v22.4s, v6.8h, v16.8h\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "smlal2 v17.4s, v19.8h, v16.8h\n"
+ "ldr d16, [x24, x4]\n"
+ "smlal v7.4s, v3.4h, v21.4h\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v27.4s, v9.4h, v21.4h\n"
+ "smlal v8.4s, v10.4h, v21.4h\n"
+ "smlal2 v24.4s, v3.8h, v11.8h\n"
+ "smlal v13.4s, v0.4h, v15.4h\n"
+ "smlal2 v14.4s, v3.8h, v21.8h\n"
+ "ldr d3, [x7, #0xb0]\n"
+ "smlal2 v22.4s, v9.8h, v21.8h\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "smlal2 v17.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x23, x4]\n"
+ "smlal v7.4s, v0.4h, v11.4h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v27.4s, v10.4h, v11.4h\n"
+ "smlal v8.4s, v5.4h, v11.4h\n"
+ "smlal2 v24.4s, v0.8h, v15.8h\n"
+ "smlal v13.4s, v20.4h, v31.4h\n"
+ "smlal2 v14.4s, v0.8h, v11.8h\n"
+ "ldr d0, [x7, #0xb8]\n"
+ "smlal2 v22.4s, v10.8h, v11.8h\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "smlal2 v17.4s, v5.8h, v11.8h\n"
+ "ldr d11, [x22, x4]\n"
+ "smlal v7.4s, v20.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal v27.4s, v5.4h, v15.4h\n"
+ "smlal v8.4s, v23.4h, v15.4h\n"
+ "smlal2 v24.4s, v20.8h, v31.8h\n"
+ "smlal v13.4s, v6.4h, v29.4h\n"
+ "smlal2 v14.4s, v20.8h, v15.8h\n"
+ "ldr d20, [x7, #0xc0]\n"
+ "smlal2 v22.4s, v5.8h, v15.8h\n"
+ "usubl v20.8h, v20.8b, v2.8b\n"
+ "smlal2 v17.4s, v23.8h, v15.8h\n"
+ "ldr d15, [x21, x4]\n"
+ "smlal v7.4s, v6.4h, v31.4h\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v27.4s, v23.4h, v31.4h\n"
+ "smlal v8.4s, v30.4h, v31.4h\n"
+ "smlal2 v24.4s, v6.8h, v29.8h\n"
+ "smlal v13.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x4]\n"
+ "smlal2 v22.4s, v23.8h, v31.8h\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "smlal2 v17.4s, v30.8h, v31.8h\n"
+ "ldr q31, [x8, #0x0]\n"
+ "smlal v7.4s, v19.4h, v29.4h\n"
+ "add x4, x4, #0x8\n"
+ "smlal v27.4s, v30.4h, v29.4h\n"
+ "smlal v8.4s, v28.4h, v29.4h\n"
+ "smlal2 v24.4s, v9.8h, v1.8h\n"
+ "ldr q9, [x17, #0x0]\n"
+ "smlal v13.4s, v10.4h, v4.4h\n"
+ "smlal2 v14.4s, v19.8h, v29.8h\n"
+ "ldr q19, [x8, #0x10]\n"
+ "smlal2 v22.4s, v30.8h, v29.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v17.4s, v28.8h, v29.8h\n"
+ "ldr q29, [x17, #0x10]\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "add x17, x17, #0x20\n"
+ "smlal v27.4s, v18.4h, v1.4h\n"
+ "smlal v8.4s, v16.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v4.8h\n"
+ "smlal v13.4s, v5.4h, v3.4h\n"
+ "smlal2 v14.4s, v10.8h, v1.8h\n"
+ "smlal2 v22.4s, v18.8h, v1.8h\n"
+ "smlal2 v17.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v5.4h, v4.4h\n"
+ "smlal v27.4s, v16.4h, v4.4h\n"
+ "smlal v8.4s, v21.4h, v4.4h\n"
+ "smlal2 v24.4s, v5.8h, v3.8h\n"
+ "smlal v13.4s, v23.4h, v0.4h\n"
+ "smlal2 v14.4s, v5.8h, v4.8h\n"
+ "smlal2 v22.4s, v16.8h, v4.8h\n"
+ "smlal2 v17.4s, v21.8h, v4.8h\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal v8.4s, v11.4h, v3.4h\n"
+ "smlal2 v24.4s, v23.8h, v0.8h\n"
+ "smlal v13.4s, v30.4h, v20.4h\n"
+ "sqrdmulh v13.4s, v13.4s, v31.4s\n"
+ "smlal2 v14.4s, v23.8h, v3.8h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "and v21.16b, v13.16b, v9.16b\n"
+ "smlal2 v17.4s, v11.8h, v3.8h\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "smlal v27.4s, v11.4h, v0.4h\n"
+ "smlal v8.4s, v15.4h, v0.4h\n"
+ "sqadd v13.4s, v13.4s, v21.4s\n"
+ "smlal2 v24.4s, v30.8h, v20.8h\n"
+ "smlal2 v14.4s, v30.8h, v0.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v19.4s\n"
+ "smlal2 v22.4s, v11.8h, v0.8h\n"
+ "smlal2 v17.4s, v15.8h, v0.8h\n"
+ "and v16.16b, v24.16b, v29.16b\n"
+ "smlal v7.4s, v28.4h, v20.4h\n"
+ "smlal v27.4s, v15.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v31.4s\n"
+ "smlal v8.4s, v6.4h, v20.4h\n"
+ "smlal2 v14.4s, v28.8h, v20.8h\n"
+ "sqrdmulh v27.4s, v27.4s, v31.4s\n"
+ "smlal2 v22.4s, v15.8h, v20.8h\n"
+ "smlal2 v17.4s, v6.8h, v20.8h\n"
+ "sqrdmulh v8.4s, v8.4s, v31.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v9.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v19.4s\n"
+ "and v20.16b, v27.16b, v9.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
+ "and v3.16b, v8.16b, v9.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v19.4s\n"
+ "sqadd v24.4s, v24.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v19.16b, v22.16b, v29.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v30.16b, v17.16b, v29.16b\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v3.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v9.4s\n"
+ "srshl v7.4s, v7.4s, v9.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v9.4s\n"
+ "sqadd v17.4s, v17.4s, v30.4s\n"
+ "srshl v24.4s, v24.4s, v29.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v29.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v29.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "str d13, [x16, x5]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "str d7, [x15, x5]\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "str d27, [x14, x5]\n"
+ "str d8, [x13, x5]\n"
+ "add x5, x5, #0x8\n"
+ "beq 124f\n"
+ "add x7, x7, #0xc8\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x2, #2, 5f\n"
+ "ld1 { v13.4s }, [x20], #0x10\n"
+ "tbz x2, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x2, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x2, #1, 6f\n"
+ "ld1 { v13.d }[0], [x20], #0x8\n"
+ "tbz x2, #0, 7f\n"
+ "ld1 { v13.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 7f\n"
+ "ld1 { v13.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d21, [x7, #0x0]\n"
+ "ldr d15, [x7, #0x8]\n"
+ "mov v7.16b, v13.16b\n"
+ "mov v14.16b, v24.16b\n"
+ "ldr d29, [x7, #0x10]\n"
+ "ldr d18, [x7, #0x18]\n"
+ "mov v27.16b, v13.16b\n"
+ "mov v22.16b, v24.16b\n"
+ "ldr d3, [x7, #0x20]\n"
+ "ldp x9, x28, [x6, #0x0]\n"
+ "mov v8.16b, v13.16b\n"
+ "mov v17.16b, v24.16b\n"
+ "ldp x27, x26, [x6, #0x10]\n"
+ "ldp x25, x24, [x6, #0x20]\n"
+ "usubl v21.8h, v21.8b, v2.8b\n"
+ "usubl v15.8h, v15.8b, v2.8b\n"
+ "ldp x23, x22, [x6, #0x30]\n"
+ "ldp x21, x20, [x6, #0x40]\n"
+ "usubl v29.8h, v29.8b, v2.8b\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "add x9, x9, x4\n"
+ "add x28, x28, x4\n"
+ "add x27, x27, x4\n"
+ "add x26, x26, x4\n"
+ "add x25, x25, x4\n"
+ "add x24, x24, x4\n"
+ "add x23, x23, x4\n"
+ "add x22, x22, x4\n"
+ "add x21, x21, x4\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 9f\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v16.s }[0], [x28], #0x4\n"
+ "ld1 { v23.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x26], #0x4\n"
+ "ld1 { v4.s }[0], [x25], #0x4\n"
+ "ld1 { v28.s }[0], [x24], #0x4\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "ld1 { v1.s }[0], [x22], #0x4\n"
+ "ld1 { v9.s }[0], [x21], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 8f\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "ld1 { v16.h }[2], [x28], #0x2\n"
+ "ld1 { v23.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x26], #0x2\n"
+ "ld1 { v4.h }[2], [x25], #0x2\n"
+ "ld1 { v28.h }[2], [x24], #0x2\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "ld1 { v1.h }[2], [x22], #0x2\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[6], [x9]\n"
+ "ld1 { v16.b }[6], [x28]\n"
+ "ld1 { v23.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x26]\n"
+ "ld1 { v4.b }[6], [x25]\n"
+ "ld1 { v28.b }[6], [x24]\n"
+ "ld1 { v31.b }[6], [x23]\n"
+ "ld1 { v1.b }[6], [x22]\n"
+ "ld1 { v9.b }[6], [x21]\n"
+ "ld1 { v11.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[4], [x9]\n"
+ "ld1 { v16.b }[4], [x28]\n"
+ "ld1 { v23.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x26]\n"
+ "ld1 { v4.b }[4], [x25]\n"
+ "ld1 { v28.b }[4], [x24]\n"
+ "ld1 { v31.b }[4], [x23]\n"
+ "ld1 { v1.b }[4], [x22]\n"
+ "ld1 { v9.b }[4], [x21]\n"
+ "ld1 { v11.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x2, #1, 10f\n"
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "ld1 { v16.h }[0], [x28], #0x2\n"
+ "ld1 { v23.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x26], #0x2\n"
+ "ld1 { v4.h }[0], [x25], #0x2\n"
+ "ld1 { v28.h }[0], [x24], #0x2\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "ld1 { v1.h }[0], [x22], #0x2\n"
+ "ld1 { v9.h }[0], [x21], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[2], [x9]\n"
+ "ld1 { v16.b }[2], [x28]\n"
+ "ld1 { v23.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x26]\n"
+ "ld1 { v4.b }[2], [x25]\n"
+ "ld1 { v28.b }[2], [x24]\n"
+ "ld1 { v31.b }[2], [x23]\n"
+ "ld1 { v1.b }[2], [x22]\n"
+ "ld1 { v9.b }[2], [x21]\n"
+ "ld1 { v11.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 11f\n"
+ "ld1 { v10.b }[0], [x9]\n"
+ "ld1 { v16.b }[0], [x28]\n"
+ "ld1 { v23.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x26]\n"
+ "ld1 { v4.b }[0], [x25]\n"
+ "ld1 { v28.b }[0], [x24]\n"
+ "ld1 { v31.b }[0], [x23]\n"
+ "ld1 { v1.b }[0], [x22]\n"
+ "ld1 { v9.b }[0], [x21]\n"
+ "ld1 { v11.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ushll v16.8h, v16.8b, #0x0\n"
+ "smlal v13.4s, v10.4h, v21.4h\n"
+ "ldr x20, [x6, #0x50]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v24.4s, v10.8h, v21.8h\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v14.4s, v16.8h, v21.8h\n"
+ "smlal v27.4s, v23.4h, v21.4h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "add x20, x20, x4\n"
+ "smlal2 v22.4s, v23.8h, v21.8h\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "smlal v8.4s, v30.4h, v21.4h\n"
+ "smlal2 v17.4s, v30.8h, v21.8h\n"
+ "smlal v13.4s, v16.4h, v15.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v24.4s, v16.8h, v15.8h\n"
+ "smlal v7.4s, v4.4h, v15.4h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal2 v14.4s, v4.8h, v15.8h\n"
+ "smlal v27.4s, v30.4h, v15.4h\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "smlal2 v22.4s, v30.8h, v15.8h\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "smlal v8.4s, v28.4h, v15.4h\n"
+ "ushll v11.8h, v11.8b, #0x0\n"
+ "smlal2 v17.4s, v28.8h, v15.8h\n"
+ "smlal v13.4s, v4.4h, v29.4h\n"
+ "smlal2 v24.4s, v4.8h, v29.8h\n"
+ "smlal v7.4s, v31.4h, v29.4h\n"
+ "smlal2 v14.4s, v31.8h, v29.8h\n"
+ "smlal v27.4s, v28.4h, v29.4h\n"
+ "smlal2 v22.4s, v28.8h, v29.8h\n"
+ "tbz x2, #2, 13f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 12f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x2, #1, 14f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 15f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "ldr x20, [x6, #0x58]\n"
+ "smlal v8.4s, v5.4h, v29.4h\n"
+ "smlal2 v17.4s, v5.8h, v29.8h\n"
+ "smlal v13.4s, v31.4h, v18.4h\n"
+ "smlal2 v24.4s, v31.8h, v18.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v7.4s, v1.4h, v18.4h\n"
+ "smlal2 v14.4s, v1.8h, v18.8h\n"
+ "smlal v27.4s, v5.4h, v18.4h\n"
+ "smlal2 v22.4s, v5.8h, v18.8h\n"
+ "tbz x2, #2, 17f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 16f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x2, #1, 18f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 19f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x6, #0x60]\n"
+ "smlal v8.4s, v10.4h, v18.4h\n"
+ "smlal2 v17.4s, v10.8h, v18.8h\n"
+ "smlal v13.4s, v1.4h, v3.4h\n"
+ "smlal2 v24.4s, v1.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 21f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 20f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x2, #1, 22f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 23f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d6, [x7, #0x28]\n"
+ "ushll v15.8h, v15.8b, #0x0\n"
+ "smlal v7.4s, v15.4h, v3.4h\n"
+ "smlal2 v14.4s, v15.8h, v3.8h\n"
+ "smlal v27.4s, v10.4h, v3.4h\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v2.8b\n"
+ "ldr x20, [x6, #0x68]\n"
+ "smlal v8.4s, v9.4h, v3.4h\n"
+ "smlal2 v17.4s, v9.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v23.4h, v6.4h\n"
+ "smlal2 v24.4s, v23.8h, v6.8h\n"
+ "smlal v7.4s, v30.4h, v6.4h\n"
+ "smlal2 v14.4s, v30.8h, v6.8h\n"
+ "smlal v27.4s, v11.4h, v6.4h\n"
+ "smlal2 v22.4s, v11.8h, v6.8h\n"
+ "tbz x2, #2, 25f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 24f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x2, #1, 26f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 27f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d4, [x7, #0x30]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "ldr x20, [x6, #0x70]\n"
+ "smlal v8.4s, v20.4h, v6.4h\n"
+ "smlal2 v17.4s, v20.8h, v6.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v24.4s, v30.8h, v4.8h\n"
+ "smlal v7.4s, v28.4h, v4.4h\n"
+ "smlal2 v14.4s, v28.8h, v4.8h\n"
+ "smlal v27.4s, v20.4h, v4.4h\n"
+ "smlal2 v22.4s, v20.8h, v4.8h\n"
+ "tbz x2, #2, 29f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 28f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x2, #1, 30f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 31f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d30, [x7, #0x38]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "ldr x20, [x6, #0x78]\n"
+ "smlal v8.4s, v23.4h, v4.4h\n"
+ "smlal2 v17.4s, v23.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v28.4h, v30.4h\n"
+ "smlal2 v24.4s, v28.8h, v30.8h\n"
+ "smlal v7.4s, v5.4h, v30.4h\n"
+ "smlal2 v14.4s, v5.8h, v30.8h\n"
+ "smlal v27.4s, v23.4h, v30.4h\n"
+ "smlal2 v22.4s, v23.8h, v30.8h\n"
+ "tbz x2, #2, 33f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 32f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x2, #1, 34f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 35f\n"
+ "ld1 { v3.b }[0], [x20]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d16, [x7, #0x40]\n"
+ "ushll v3.8h, v3.8b, #0x0\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "ldr x20, [x6, #0x80]\n"
+ "smlal v8.4s, v3.4h, v30.4h\n"
+ "smlal2 v17.4s, v3.8h, v30.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v5.4h, v16.4h\n"
+ "smlal2 v24.4s, v5.8h, v16.8h\n"
+ "smlal v7.4s, v10.4h, v16.4h\n"
+ "smlal2 v14.4s, v10.8h, v16.8h\n"
+ "smlal v27.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x2, #2, 37f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 36f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x2, #1, 38f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 39f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d1, [x7, #0x48]\n"
+ "ushll v6.8h, v6.8b, #0x0\n"
+ "usubl v1.8h, v1.8b, v2.8b\n"
+ "ldr x20, [x6, #0x88]\n"
+ "smlal v8.4s, v6.4h, v16.4h\n"
+ "smlal2 v17.4s, v6.8h, v16.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v10.4h, v1.4h\n"
+ "smlal2 v24.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v14.4s, v9.8h, v1.8h\n"
+ "smlal v27.4s, v6.4h, v1.4h\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "tbz x2, #2, 41f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 40f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x2, #1, 42f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 43f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d28, [x7, #0x50]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "ldr x20, [x6, #0x90]\n"
+ "smlal v8.4s, v18.4h, v1.4h\n"
+ "smlal2 v17.4s, v18.8h, v1.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v11.4h, v28.4h\n"
+ "smlal2 v24.4s, v11.8h, v28.8h\n"
+ "smlal v7.4s, v20.4h, v28.4h\n"
+ "smlal2 v14.4s, v20.8h, v28.8h\n"
+ "tbz x2, #2, 45f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 44f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x2, #1, 46f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 47f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x20, [x6, #0x98]\n"
+ "smlal v27.4s, v30.4h, v28.4h\n"
+ "smlal2 v22.4s, v30.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 49f\n"
+ "ld1 { v19.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 48f\n"
+ "ld1 { v19.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x2, #1, 50f\n"
+ "ld1 { v19.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 51f\n"
+ "ld1 { v19.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d0, [x7, #0x58]\n"
+ "ushll v19.8h, v19.8b, #0x0\n"
+ "usubl v0.8h, v0.8b, v2.8b\n"
+ "ldr x20, [x6, #0xa0]\n"
+ "smlal v8.4s, v19.4h, v28.4h\n"
+ "smlal2 v17.4s, v19.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v20.4h, v0.4h\n"
+ "smlal2 v24.4s, v20.8h, v0.8h\n"
+ "smlal v7.4s, v23.4h, v0.4h\n"
+ "smlal2 v14.4s, v23.8h, v0.8h\n"
+ "smlal v27.4s, v19.4h, v0.4h\n"
+ "smlal2 v22.4s, v19.8h, v0.8h\n"
+ "tbz x2, #2, 53f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 52f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x2, #1, 54f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 55f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d10, [x7, #0x60]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "usubl v10.8h, v10.8b, v2.8b\n"
+ "ldr x20, [x6, #0xa8]\n"
+ "smlal v8.4s, v9.4h, v0.4h\n"
+ "smlal2 v17.4s, v9.8h, v0.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v23.4h, v10.4h\n"
+ "smlal2 v24.4s, v23.8h, v10.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v14.4s, v3.8h, v10.8h\n"
+ "smlal v27.4s, v9.4h, v10.4h\n"
+ "smlal2 v22.4s, v9.8h, v10.8h\n"
+ "tbz x2, #2, 57f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 56f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x2, #1, 58f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 59f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d28, [x7, #0x68]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v28.8h, v28.8b, v2.8b\n"
+ "ldr x20, [x6, #0xb0]\n"
+ "smlal v8.4s, v20.4h, v10.4h\n"
+ "smlal2 v17.4s, v20.8h, v10.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v3.4h, v28.4h\n"
+ "smlal2 v24.4s, v3.8h, v28.8h\n"
+ "smlal v7.4s, v6.4h, v28.4h\n"
+ "smlal2 v14.4s, v6.8h, v28.8h\n"
+ "smlal v27.4s, v20.4h, v28.4h\n"
+ "smlal2 v22.4s, v20.8h, v28.8h\n"
+ "tbz x2, #2, 61f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 60f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x2, #1, 62f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 63f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d23, [x7, #0x70]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xb8]\n"
+ "smlal v8.4s, v5.4h, v28.4h\n"
+ "smlal2 v17.4s, v5.8h, v28.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v6.4h, v23.4h\n"
+ "smlal2 v24.4s, v6.8h, v23.8h\n"
+ "smlal v7.4s, v18.4h, v23.4h\n"
+ "smlal2 v14.4s, v18.8h, v23.8h\n"
+ "smlal v27.4s, v5.4h, v23.4h\n"
+ "smlal2 v22.4s, v5.8h, v23.8h\n"
+ "tbz x2, #2, 65f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 64f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x2, #1, 66f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 67f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d4, [x7, #0x78]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v2.8b\n"
+ "ldr x20, [x6, #0xc0]\n"
+ "smlal v8.4s, v29.4h, v23.4h\n"
+ "smlal2 v17.4s, v29.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v4.4h\n"
+ "smlal2 v24.4s, v30.8h, v4.8h\n"
+ "smlal v7.4s, v19.4h, v4.4h\n"
+ "smlal2 v14.4s, v19.8h, v4.8h\n"
+ "tbz x2, #2, 69f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 68f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x2, #1, 70f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 71f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "ldr x20, [x6, #0xc8]\n"
+ "smlal v27.4s, v18.4h, v4.4h\n"
+ "smlal2 v22.4s, v18.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 73f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 72f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x2, #1, 74f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 75f\n"
+ "ld1 { v1.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d23, [x7, #0x80]\n"
+ "ushll v1.8h, v1.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xd0]\n"
+ "smlal v8.4s, v1.4h, v4.4h\n"
+ "smlal2 v17.4s, v1.8h, v4.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v19.4h, v23.4h\n"
+ "smlal2 v24.4s, v19.8h, v23.8h\n"
+ "smlal v7.4s, v9.4h, v23.4h\n"
+ "smlal2 v14.4s, v9.8h, v23.8h\n"
+ "smlal v27.4s, v1.4h, v23.4h\n"
+ "smlal2 v22.4s, v1.8h, v23.8h\n"
+ "tbz x2, #2, 77f\n"
+ "ld1 { v4.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 76f\n"
+ "ld1 { v4.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x2, #1, 78f\n"
+ "ld1 { v4.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 79f\n"
+ "ld1 { v4.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d30, [x7, #0x88]\n"
+ "ushll v4.8h, v4.8b, #0x0\n"
+ "usubl v30.8h, v30.8b, v2.8b\n"
+ "ldr x20, [x6, #0xd8]\n"
+ "smlal v8.4s, v4.4h, v23.4h\n"
+ "smlal2 v17.4s, v4.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v9.4h, v30.4h\n"
+ "smlal2 v24.4s, v9.8h, v30.8h\n"
+ "smlal v7.4s, v20.4h, v30.4h\n"
+ "smlal2 v14.4s, v20.8h, v30.8h\n"
+ "smlal v27.4s, v4.4h, v30.4h\n"
+ "smlal2 v22.4s, v4.8h, v30.8h\n"
+ "tbz x2, #2, 81f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 80f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x2, #1, 82f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 83f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d3, [x7, #0x90]\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "usubl v3.8h, v3.8b, v2.8b\n"
+ "ldr x20, [x6, #0xe0]\n"
+ "smlal v8.4s, v21.4h, v30.4h\n"
+ "smlal2 v17.4s, v21.8h, v30.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v20.4h, v3.4h\n"
+ "smlal2 v24.4s, v20.8h, v3.8h\n"
+ "smlal v7.4s, v5.4h, v3.4h\n"
+ "smlal2 v14.4s, v5.8h, v3.8h\n"
+ "smlal v27.4s, v21.4h, v3.4h\n"
+ "smlal2 v22.4s, v21.8h, v3.8h\n"
+ "tbz x2, #2, 85f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 84f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x2, #1, 86f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 87f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d19, [x7, #0x98]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "usubl v19.8h, v19.8b, v2.8b\n"
+ "ldr x20, [x6, #0xe8]\n"
+ "smlal v8.4s, v30.4h, v3.4h\n"
+ "smlal2 v17.4s, v30.8h, v3.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v5.4h, v19.4h\n"
+ "smlal2 v24.4s, v5.8h, v19.8h\n"
+ "smlal v7.4s, v29.4h, v19.4h\n"
+ "smlal2 v14.4s, v29.8h, v19.8h\n"
+ "smlal v27.4s, v30.4h, v19.4h\n"
+ "smlal2 v22.4s, v30.8h, v19.8h\n"
+ "tbz x2, #2, 89f\n"
+ "ld1 { v20.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 88f\n"
+ "ld1 { v20.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x2, #1, 90f\n"
+ "ld1 { v20.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 91f\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d23, [x7, #0xa0]\n"
+ "ushll v20.8h, v20.8b, #0x0\n"
+ "usubl v23.8h, v23.8b, v2.8b\n"
+ "ldr x20, [x6, #0xf0]\n"
+ "smlal v8.4s, v20.4h, v19.4h\n"
+ "smlal2 v17.4s, v20.8h, v19.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v18.4h, v23.4h\n"
+ "smlal2 v24.4s, v18.8h, v23.8h\n"
+ "smlal v7.4s, v1.4h, v23.4h\n"
+ "smlal2 v14.4s, v1.8h, v23.8h\n"
+ "tbz x2, #2, 93f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 92f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x2, #1, 94f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 95f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "ushll v10.8h, v10.8b, #0x0\n"
+ "ldr x20, [x6, #0xf8]\n"
+ "smlal v27.4s, v10.4h, v23.4h\n"
+ "smlal2 v22.4s, v10.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "tbz x2, #2, 97f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 96f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x2, #1, 98f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 99f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d5, [x7, #0xa8]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v5.8h, v5.8b, v2.8b\n"
+ "ldr x20, [x6, #0x100]\n"
+ "smlal v8.4s, v18.4h, v23.4h\n"
+ "smlal2 v17.4s, v18.8h, v23.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v1.4h, v5.4h\n"
+ "smlal2 v24.4s, v1.8h, v5.8h\n"
+ "smlal v7.4s, v4.4h, v5.4h\n"
+ "smlal2 v14.4s, v4.8h, v5.8h\n"
+ "smlal v27.4s, v18.4h, v5.4h\n"
+ "smlal2 v22.4s, v18.8h, v5.8h\n"
+ "tbz x2, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x2, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d18, [x7, #0xb0]\n"
+ "ushll v9.8h, v9.8b, #0x0\n"
+ "usubl v18.8h, v18.8b, v2.8b\n"
+ "ldr x20, [x6, #0x108]\n"
+ "smlal v8.4s, v9.4h, v5.4h\n"
+ "smlal2 v17.4s, v9.8h, v5.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v21.4h, v18.4h\n"
+ "smlal2 v14.4s, v21.8h, v18.8h\n"
+ "smlal v27.4s, v9.4h, v18.4h\n"
+ "smlal2 v22.4s, v9.8h, v18.8h\n"
+ "tbz x2, #2, 105f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 104f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x2, #1, 106f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 107f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d11, [x7, #0xb8]\n"
+ "ushll v5.8h, v5.8b, #0x0\n"
+ "usubl v11.8h, v11.8b, v2.8b\n"
+ "ldr x20, [x6, #0x110]\n"
+ "smlal v8.4s, v5.4h, v18.4h\n"
+ "smlal2 v17.4s, v5.8h, v18.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v21.4h, v11.4h\n"
+ "smlal2 v24.4s, v21.8h, v11.8h\n"
+ "smlal v7.4s, v30.4h, v11.4h\n"
+ "smlal2 v14.4s, v30.8h, v11.8h\n"
+ "smlal v27.4s, v5.4h, v11.4h\n"
+ "smlal2 v22.4s, v5.8h, v11.8h\n"
+ "tbz x2, #2, 109f\n"
+ "ld1 { v18.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 108f\n"
+ "ld1 { v18.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x2, #1, 110f\n"
+ "ld1 { v18.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 111f\n"
+ "ld1 { v18.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d16, [x7, #0xc0]\n"
+ "ushll v18.8h, v18.8b, #0x0\n"
+ "usubl v16.8h, v16.8b, v2.8b\n"
+ "ldr x20, [x6, #0x118]\n"
+ "smlal v8.4s, v18.4h, v11.4h\n"
+ "smlal2 v17.4s, v18.8h, v11.8h\n"
+ "add x20, x20, x4\n"
+ "smlal v13.4s, v30.4h, v16.4h\n"
+ "smlal2 v24.4s, v30.8h, v16.8h\n"
+ "smlal v7.4s, v20.4h, v16.4h\n"
+ "smlal2 v14.4s, v20.8h, v16.8h\n"
+ "smlal v27.4s, v18.4h, v16.4h\n"
+ "smlal2 v22.4s, v18.8h, v16.8h\n"
+ "tbz x2, #2, 113f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x2, #1, 112f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x2, #1, 114f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 115f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "smlal v8.4s, v21.4h, v16.4h\n"
+ "smlal2 v17.4s, v21.8h, v16.8h\n"
+ "tbz x2, #2, 117f\n"
+ "ld1 { v16.4s }, [x8], #0x10\n"
+ "ld1 { v21.4s }, [x17], #0x10\n"
+ "tbz x2, #1, 116f\n"
+ "ld1 { v18.d }[0], [x8], #0x8\n"
+ "ld1 { v0.d }[0], [x17], #0x8\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v18.s }[2], [x8]\n"
+ "ld1 { v0.s }[2], [x17]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x2, #0, 119f\n"
+ "ld1 { v18.s }[0], [x8]\n"
+ "ld1 { v0.s }[0], [x17]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x2, #1, 118f\n"
+ "ld1 { v16.d }[0], [x8], #0x8\n"
+ "ld1 { v21.d }[0], [x17], #0x8\n"
+ "tbz x2, #0, 119f\n"
+ "ld1 { v16.s }[2], [x8]\n"
+ "ld1 { v21.s }[2], [x17]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 119f\n"
+ "ld1 { v16.s }[0], [x8]\n"
+ "ld1 { v21.s }[0], [x17]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v13.4s, v13.4s, v16.4s\n"
+ "and v5.16b, v13.16b, v21.16b\n"
+ "add x16, x16, x5\n"
+ "add x15, x15, x5\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "add x14, x14, x5\n"
+ "add x13, x13, x5\n"
+ "and v2.16b, v24.16b, v0.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v16.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v16.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v16.4s\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v23.16b, v7.16b, v21.16b\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "and v20.16b, v27.16b, v21.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v18.4s\n"
+ "and v31.16b, v8.16b, v21.16b\n"
+ "sqrdmulh v17.4s, v17.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v2.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v18.16b, v14.16b, v0.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v11.16b, v22.16b, v0.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v10.16b, v17.16b, v0.16b\n"
+ "sqadd v7.4s, v7.4s, v23.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v20.4s\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v31.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v21.4s\n"
+ "srshl v7.4s, v7.4s, v21.4s\n"
+ "sqadd v14.4s, v14.4s, v18.4s\n"
+ "srshl v27.4s, v27.4s, v21.4s\n"
+ "sqadd v22.4s, v22.4s, v11.4s\n"
+ "srshl v8.4s, v8.4s, v21.4s\n"
+ "sqadd v17.4s, v17.4s, v10.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v14.4s, v14.4s, v0.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqxtn v27.4h, v27.4s\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "sqxtn v8.4h, v8.4s\n"
+ "sqxtn2 v13.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v14.4s\n"
+ "sqxtn2 v27.8h, v22.4s\n"
+ "sqxtn2 v8.8h, v17.4s\n"
+ "sqadd v13.8h, v13.8h, v25.8h\n"
+ "sqadd v7.8h, v7.8h, v25.8h\n"
+ "sqadd v27.8h, v27.8h, v25.8h\n"
+ "sqadd v8.8h, v8.8h, v25.8h\n"
+ "smax v13.8h, v13.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smax v27.8h, v27.8h, v12.8h\n"
+ "smax v8.8h, v8.8h, v12.8h\n"
+ "smin v13.8h, v13.8h, v26.8h\n"
+ "smin v7.8h, v7.8h, v26.8h\n"
+ "smin v27.8h, v27.8h, v26.8h\n"
+ "smin v8.8h, v8.8h, v26.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v8.16b, v8.16b, v8.16b\n"
+ "tbz x2, #2, 121f\n"
+ "st1 { v13.s }[0], [x16], #0x4\n"
+ "st1 { v7.s }[0], [x15], #0x4\n"
+ "st1 { v27.s }[0], [x14], #0x4\n"
+ "st1 { v8.s }[0], [x13], #0x4\n"
+ "tbz x2, #1, 120f\n"
+ "st1 { v13.h }[2], [x16], #0x2\n"
+ "st1 { v7.h }[2], [x15], #0x2\n"
+ "st1 { v27.h }[2], [x14], #0x2\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[6], [x16], #0x1\n"
+ "st1 { v7.b }[6], [x15], #0x1\n"
+ "st1 { v27.b }[6], [x14], #0x1\n"
+ "st1 { v8.b }[6], [x13], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[4], [x16], #0x1\n"
+ "st1 { v7.b }[4], [x15], #0x1\n"
+ "st1 { v27.b }[4], [x14], #0x1\n"
+ "st1 { v8.b }[4], [x13], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x2, #1, 122f\n"
+ "st1 { v13.h }[0], [x16], #0x2\n"
+ "st1 { v7.h }[0], [x15], #0x2\n"
+ "st1 { v27.h }[0], [x14], #0x2\n"
+ "st1 { v8.h }[0], [x13], #0x2\n"
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[2], [x16], #0x1\n"
+ "st1 { v7.b }[2], [x15], #0x1\n"
+ "st1 { v27.b }[2], [x14], #0x1\n"
+ "st1 { v8.b }[2], [x13], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x2, #0, 123f\n"
+ "st1 { v13.b }[0], [x16], #0x1\n"
+ "st1 { v7.b }[0], [x15], #0x1\n"
+ "st1 { v27.b }[0], [x14], #0x1\n"
+ "st1 { v8.b }[0], [x13], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1666c17ca0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Entry point of the hand-written AArch64 assembly kernel (defined in
+// a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp).
+// Arguments, in order: channel count, input-row pointer array, packed s8
+// weights, i32 bias, requantisation parameters (offsets / clamp limits),
+// requantisation multipliers, requantisation shifts, output-row pointers.
+void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int,
+  const uint8_t *const *const,
+  const int8_t *const,
+  const int32_t *const,
+  const arm_gemm::Requantize32 &,
+  const int32_t *const,
+  const int32_t *const,
+  uint8_t *const *const
+);
+
+// Strategy descriptor for the depthwise kernel whose name encodes its shape:
+// u8 input x s8 weights -> u8 output with i32 accumulators (template
+// arguments below), NHWC layout, 3x3 kernel, stride 1, 2x2 output tile,
+// plain multiply-accumulate ("mla"), depth-first traversal.
+class a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+  public:
+  // Kernel footprint and strides, matching the "3x3_s1" in the class name.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // The CPUInfo argument is unused here. Parent is constructed with
+  // (2, 2, 3, 3, 1, 1) -- consistent with the 2x2 output tile, 3x3 kernel
+  // and unit strides above; presumably (output_rows, output_cols,
+  // kernel_rows, kernel_cols, stride_rows, stride_cols) -- confirm against
+  // DepthwiseDepthfirstStrategy's constructor.
+  a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+  // Fixed-width implementation: no scalable (SVE) vector length.
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  Parent::KernelType kernel = a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  // NOTE(review): reports an accumulator depth of 2 vector lengths --
+  // presumably consumed by the driver for working-space sizing; confirm.
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f1c1b2315c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1166 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+  // Scalar-argument block read by the inline assembly below via the
+  // offsetof_Params_* asm operands: packing every argument into one struct
+  // lets the asm address each field at a fixed offset from the single
+  // [params] base register.
+  struct Params
+  {
+    uint64_t n_channels;                    // number of channels to process
+    const void *weights;                    // packed s8 weights
+    const int32_t *bias;                    // per-channel i32 bias (accumulator init)
+    const arm_gemm::Requantize32 *requant;  // offsets and clamp limits for requantisation
+    const int32_t *const requant_muls;      // requantisation multipliers
+    const int32_t *const requant_shifts;    // requantisation shifts
+    uint8_t *const *const outptrs;          // output-row pointers
+    const uint8_t *inptrs[16];              // input pointers, permuted (see ctor)
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const void *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      // Permute the caller's 16 input pointers into the order in which the
+      // assembly consumes them: the asm loads entries of this array at
+      // hard-coded offsets, so the mapping is baked in here rather than in
+      // the asm itself.
+      inptrs[0] = inptrs_raw[5];
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[3];
+      inptrs[3] = inptrs_raw[6];
+      inptrs[4] = inptrs_raw[9];
+      inptrs[5] = inptrs_raw[12];
+      inptrs[6] = inptrs_raw[15];
+      inptrs[7] = inptrs_raw[1];
+      inptrs[8] = inptrs_raw[2];
+      inptrs[9] = inptrs_raw[10];
+      inptrs[10] = inptrs_raw[4];
+      inptrs[11] = inptrs_raw[7];
+      inptrs[12] = inptrs_raw[8];
+      inptrs[13] = inptrs_raw[11];
+      inptrs[14] = inptrs_raw[13];
+      inptrs[15] = inptrs_raw[14];
+
+    }
+  };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v14.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v19.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v29.8h }, [x21]\n"
+ "ld1r { v12.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d22, [x23, x17]\n"
+ "ldr d4, [x22, x17]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d8, [x21, x17]\n"
+ "ldr d27, [x20, x17]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ldr d15, [x20, x17]\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q3, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q28, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x27, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x26, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x25, [x15, #0x58]\n"
+ "ldr x24, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x21, [x15, #0x78]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x27, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x26, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "add x13, x13, #0x20\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x25, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x24, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x23, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x22, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d8, [x21, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v3.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v27.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v27.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v28.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v8.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v3.4s\n"
+ "smlal v10.4s, v8.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v3.4s\n"
+ "smlal2 v30.4s, v8.8h, v20.8h\n"
+ "smlal2 v6.4s, v8.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v3.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v20.16b, v0.16b, v28.16b\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v31.16b, v30.16b, v28.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v18.16b, v6.16b, v28.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v3.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v20.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v31.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v18.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v28.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v28.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v28.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "ldr q9, [x20, #0x0]\n"
+ "ldr q24, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x23, x22, [x15, #0x0]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldp x21, x20, [x15, #0x10]\n"
+ "ldr d22, [x23, x17]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldr d4, [x22, x17]\n"
+ "ldr d8, [x21, x17]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ldr d27, [x20, x17]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ldr d15, [x20, x17]\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q28, [x13, #0x0]\n"
+ "ldr q17, [x12, #0x0]\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr q21, [x13, #0x10]\n"
+ "ldr q3, [x12, #0x10]\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "ldr d11, [x20, x17]\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "ldr x20, [x15, #0x38]\n"
+ "ldr d4, [x20, x17]\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x15, #0x48]\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x25, [x15, #0x50]\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "ldr d8, [x20, x17]\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "ldr x24, [x15, #0x58]\n"
+ "ldr x23, [x15, #0x60]\n"
+ "smlal v2.4s, v11.4h, v31.4h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "ldr x22, [x15, #0x68]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "smlal v9.4s, v4.4h, v16.4h\n"
+ "ldr x20, [x15, #0x78]\n"
+ "tst x7, #0x7\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "ldr d27, [x26, x17]\n"
+ "smlal2 v30.4s, v11.8h, v31.8h\n"
+ "ldr d11, [x25, x17]\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal v10.4s, v22.4h, v20.4h\n"
+ "usubl v11.8h, v11.8b, v14.8b\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v24.4s, v4.8h, v16.8h\n"
+ "smlal v9.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "ldr d15, [x24, x17]\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal2 v6.4s, v22.8h, v20.8h\n"
+ "ldr d22, [x23, x17]\n"
+ "smlal v7.4s, v4.4h, v23.4h\n"
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v2.4s, v27.4h, v18.4h\n"
+ "smlal v10.4s, v27.4h, v26.4h\n"
+ "smlal2 v24.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v20.4h\n"
+ "smlal2 v0.4s, v4.8h, v23.8h\n"
+ "ldr d4, [x22, x17]\n"
+ "smlal2 v30.4s, v27.8h, v18.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal2 v6.4s, v27.8h, v26.8h\n"
+ "ldr d26, [x21, x17]\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "usubl v26.8h, v26.8b, v14.8b\n"
+ "smlal v2.4s, v11.4h, v23.4h\n"
+ "smlal v10.4s, v15.4h, v1.4h\n"
+ "smlal2 v24.4s, v27.8h, v20.8h\n"
+ "smlal v9.4s, v11.4h, v5.4h\n"
+ "smlal2 v0.4s, v8.8h, v16.8h\n"
+ "ldr d16, [x20, x17]\n"
+ "smlal2 v30.4s, v11.8h, v23.8h\n"
+ "usubl v16.8h, v16.8b, v14.8b\n"
+ "smlal2 v6.4s, v15.8h, v1.8h\n"
+ "smlal v7.4s, v27.4h, v25.4h\n"
+ "add x17, x17, #0x8\n"
+ "smlal v2.4s, v22.4h, v5.4h\n"
+ "smlal v10.4s, v4.4h, v18.4h\n"
+ "smlal2 v24.4s, v11.8h, v5.8h\n"
+ "smlal v9.4s, v22.4h, v31.4h\n"
+ "sqrdmulh v9.4s, v9.4s, v28.4s\n"
+ "smlal2 v0.4s, v27.8h, v25.8h\n"
+ "smlal2 v30.4s, v22.8h, v5.8h\n"
+ "and v1.16b, v9.16b, v17.16b\n"
+ "smlal2 v6.4s, v4.8h, v18.8h\n"
+ "smlal v7.4s, v15.4h, v18.4h\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "smlal v2.4s, v26.4h, v25.4h\n"
+ "smlal v10.4s, v26.4h, v31.4h\n"
+ "sqadd v9.4s, v9.4s, v1.4s\n"
+ "smlal2 v24.4s, v22.8h, v31.8h\n"
+ "smlal2 v0.4s, v15.8h, v18.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v21.4s\n"
+ "smlal2 v30.4s, v26.8h, v25.8h\n"
+ "smlal2 v6.4s, v26.8h, v31.8h\n"
+ "and v31.16b, v24.16b, v3.16b\n"
+ "smlal v7.4s, v4.4h, v20.4h\n"
+ "smlal v2.4s, v16.4h, v20.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v28.4s\n"
+ "smlal v10.4s, v16.4h, v25.4h\n"
+ "smlal2 v0.4s, v4.8h, v20.8h\n"
+ "sqrdmulh v2.4s, v2.4s, v28.4s\n"
+ "smlal2 v30.4s, v16.8h, v20.8h\n"
+ "smlal2 v6.4s, v16.8h, v25.8h\n"
+ "sqrdmulh v10.4s, v10.4s, v28.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v22.16b, v7.16b, v17.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v21.4s\n"
+ "and v15.16b, v2.16b, v17.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v21.4s\n"
+ "and v11.16b, v10.16b, v17.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v21.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v3.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v23.16b, v30.16b, v3.16b\n"
+ "sshr v11.4s, v11.4s, #0x1f\n"
+ "and v21.16b, v6.16b, v3.16b\n"
+ "sqadd v7.4s, v7.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v11.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v17.4s\n"
+ "srshl v7.4s, v7.4s, v17.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v17.4s\n"
+ "sqadd v30.4s, v30.4s, v23.4s\n"
+ "srshl v10.4s, v10.4s, v17.4s\n"
+ "sqadd v6.4s, v6.4s, v21.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v3.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d9, [x11, x16]\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "str d7, [x10, x16]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "str d2, [x9, x16]\n"
+ "str d10, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 64f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v9.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v24.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v24.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v9.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v9.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d23, [x14, #0x0]\n"
+ "ldr d16, [x14, #0x8]\n"
+ "mov v7.16b, v9.16b\n"
+ "mov v0.16b, v24.16b\n"
+ "ldr d1, [x14, #0x10]\n"
+ "ldr d5, [x14, #0x18]\n"
+ "mov v2.16b, v9.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "ldr d26, [x14, #0x20]\n"
+ "ldr d18, [x14, #0x28]\n"
+ "mov v10.16b, v9.16b\n"
+ "mov v6.16b, v24.16b\n"
+ "ldr d31, [x14, #0x30]\n"
+ "ldr d25, [x14, #0x38]\n"
+ "ssubl v23.8h, v23.8b, v19.8b\n"
+ "ssubl v16.8h, v16.8b, v19.8b\n"
+ "ldr d20, [x14, #0x40]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ssubl v1.8h, v1.8b, v19.8b\n"
+ "ssubl v5.8h, v5.8b, v19.8b\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ssubl v26.8h, v26.8b, v19.8b\n"
+ "ssubl v18.8h, v18.8b, v19.8b\n"
+ "ssubl v31.8h, v31.8b, v19.8b\n"
+ "ssubl v25.8h, v25.8b, v19.8b\n"
+ "ssubl v20.8h, v20.8b, v19.8b\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v22.s }[0], [x24], #0x4\n"
+ "ld1 { v4.s }[0], [x23], #0x4\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v4.b }[6], [x23]\n"
+ "ld1 { v8.b }[6], [x22]\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v4.b }[4], [x23]\n"
+ "ld1 { v8.b }[4], [x22]\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v22.h }[0], [x24], #0x2\n"
+ "ld1 { v4.h }[0], [x23], #0x2\n"
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x23]\n"
+ "ld1 { v8.b }[2], [x22]\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "ld1 { v4.b }[0], [x23]\n"
+ "ld1 { v8.b }[0], [x22]\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "smlal v9.4s, v22.4h, v26.4h\n"
+ "smlal2 v24.4s, v22.8h, v26.8h\n"
+ "ldr x20, [x15, #0x28]\n"
+ "smlal v7.4s, v22.4h, v5.4h\n"
+ "smlal2 v0.4s, v22.8h, v5.8h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "smlal v2.4s, v22.4h, v16.4h\n"
+ "smlal2 v30.4s, v22.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v10.4s, v22.4h, v23.4h\n"
+ "smlal2 v6.4s, v22.8h, v23.8h\n"
+ "usubl v27.8h, v27.8b, v14.8b\n"
+ "smlal v9.4s, v4.4h, v23.4h\n"
+ "smlal2 v24.4s, v4.8h, v23.8h\n"
+ "usubl v15.8h, v15.8b, v14.8b\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v0.4s, v8.8h, v1.8h\n"
+ "smlal v9.4s, v27.4h, v18.4h\n"
+ "smlal2 v24.4s, v27.8h, v18.8h\n"
+ "smlal v7.4s, v27.4h, v26.4h\n"
+ "smlal2 v0.4s, v27.8h, v26.8h\n"
+ "smlal v2.4s, v27.4h, v1.4h\n"
+ "smlal2 v30.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v27.4h, v16.4h\n"
+ "smlal2 v6.4s, v27.8h, v16.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "smlal v2.4s, v21.4h, v31.4h\n"
+ "smlal2 v30.4s, v21.8h, v31.8h\n"
+ "ldr x20, [x15, #0x30]\n"
+ "smlal v9.4s, v15.4h, v25.4h\n"
+ "smlal2 v24.4s, v15.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v7.4s, v15.4h, v31.4h\n"
+ "smlal2 v0.4s, v15.8h, v31.8h\n"
+ "smlal v2.4s, v15.4h, v26.4h\n"
+ "smlal2 v30.4s, v15.8h, v26.8h\n"
+ "smlal v10.4s, v15.4h, v5.4h\n"
+ "smlal2 v6.4s, v15.8h, v5.8h\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x38]\n"
+ "smlal v10.4s, v28.4h, v20.4h\n"
+ "smlal2 v6.4s, v28.8h, v20.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v22.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v22.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v22.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "usubl v22.8h, v22.8b, v14.8b\n"
+ "ldr x20, [x15, #0x40]\n"
+ "smlal v9.4s, v22.4h, v16.4h\n"
+ "smlal2 v24.4s, v22.8h, v16.8h\n"
+ "smlal v7.4s, v22.4h, v23.4h\n"
+ "smlal2 v0.4s, v22.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "usubl v21.8h, v21.8b, v14.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v9.4s, v21.4h, v1.4h\n"
+ "smlal2 v24.4s, v21.8h, v1.8h\n"
+ "smlal v7.4s, v21.4h, v16.4h\n"
+ "smlal2 v0.4s, v21.8h, v16.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "usubl v28.8h, v28.8b, v14.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v9.4s, v28.4h, v20.4h\n"
+ "smlal2 v24.4s, v28.8h, v20.8h\n"
+ "smlal v7.4s, v28.4h, v25.4h\n"
+ "smlal2 v0.4s, v28.8h, v25.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v2.4s, v28.4h, v18.4h\n"
+ "smlal2 v30.4s, v28.8h, v18.8h\n"
+ "smlal v10.4s, v28.4h, v26.4h\n"
+ "smlal2 v6.4s, v28.8h, v26.8h\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v9.4s, v8.4h, v5.4h\n"
+ "smlal2 v24.4s, v8.8h, v5.8h\n"
+ "smlal v2.4s, v8.4h, v23.4h\n"
+ "smlal2 v30.4s, v8.8h, v23.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v8.8h, v8.8b, v14.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v7.4s, v8.4h, v18.4h\n"
+ "smlal2 v0.4s, v8.8h, v18.8h\n"
+ "smlal v10.4s, v8.4h, v1.4h\n"
+ "smlal2 v6.4s, v8.8h, v1.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v17.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v17.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v17.8h, v17.8b, v14.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v9.4s, v17.4h, v31.4h\n"
+ "smlal2 v24.4s, v17.8h, v31.8h\n"
+ "smlal v2.4s, v17.4h, v5.4h\n"
+ "smlal2 v30.4s, v17.8h, v5.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v7.4s, v23.4h, v20.4h\n"
+ "smlal2 v0.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v18.4h\n"
+ "smlal2 v6.4s, v23.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v5.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v5.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v5.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v5.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v5.8h, v5.8b, v14.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v2.4s, v5.4h, v25.4h\n"
+ "smlal2 v30.4s, v5.8h, v25.8h\n"
+ "smlal v10.4s, v5.4h, v31.4h\n"
+ "smlal2 v6.4s, v5.8h, v31.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v23.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v23.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v23.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v23.8h, v23.8b, v14.8b\n"
+ "smlal v2.4s, v23.4h, v20.4h\n"
+ "smlal2 v30.4s, v23.8h, v20.8h\n"
+ "smlal v10.4s, v23.4h, v25.4h\n"
+ "smlal2 v6.4s, v23.8h, v25.8h\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v15.4s }, [x13], #0x10\n"
+ "ld1 { v19.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v22.s }[2], [x12]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v22.s }[0], [x12]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v15.d }[0], [x13], #0x8\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[2], [x13]\n"
+ "ld1 { v19.s }[2], [x12]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v15.s }[0], [x13]\n"
+ "ld1 { v19.s }[0], [x12]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v9.4s, v9.4s, v15.4s\n"
+ "and v17.16b, v9.16b, v19.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v24.4s, v24.4s, v18.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v20.16b, v24.16b, v22.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v15.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v15.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v15.4s\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v21.16b, v7.16b, v19.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v15.16b, v2.16b, v19.16b\n"
+ "sqrdmulh v30.4s, v30.4s, v18.4s\n"
+ "and v23.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqadd v24.4s, v24.4s, v20.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v22.16b\n"
+ "sshr v15.4s, v15.4s, #0x1f\n"
+ "and v17.16b, v30.16b, v22.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v28.16b, v6.16b, v22.16b\n"
+ "sqadd v7.4s, v7.4s, v21.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v2.4s, v2.4s, v15.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v23.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "sqadd v30.4s, v30.4s, v17.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v6.4s, v6.4s, v28.4s\n"
+ "srshl v24.4s, v24.4s, v22.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v0.4s, v0.4s, v22.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v30.4s, v30.4s, v22.4s\n"
+ "sqxtn v2.4h, v2.4s\n"
+ "srshl v6.4s, v6.4s, v22.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "sqxtn2 v9.8h, v24.4s\n"
+ "sqxtn2 v7.8h, v0.4s\n"
+ "sqxtn2 v2.8h, v30.4s\n"
+ "sqxtn2 v10.8h, v6.4s\n"
+ "sqadd v9.8h, v9.8h, v13.8h\n"
+ "sqadd v7.8h, v7.8h, v13.8h\n"
+ "sqadd v2.8h, v2.8h, v13.8h\n"
+ "sqadd v10.8h, v10.8h, v13.8h\n"
+ "smax v9.8h, v9.8h, v29.8h\n"
+ "smax v7.8h, v7.8h, v29.8h\n"
+ "smax v2.8h, v2.8h, v29.8h\n"
+ "smax v10.8h, v10.8h, v29.8h\n"
+ "smin v9.8h, v9.8h, v12.8h\n"
+ "smin v7.8h, v7.8h, v12.8h\n"
+ "smin v2.8h, v2.8h, v12.8h\n"
+ "smin v10.8h, v10.8h, v12.8h\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v2.16b, v2.16b, v2.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "tbz x7, #2, 61f\n"
+ "st1 { v9.s }[0], [x11], #0x4\n"
+ "st1 { v7.s }[0], [x10], #0x4\n"
+ "st1 { v2.s }[0], [x9], #0x4\n"
+ "st1 { v10.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "st1 { v9.h }[2], [x11], #0x2\n"
+ "st1 { v7.h }[2], [x10], #0x2\n"
+ "st1 { v2.h }[2], [x9], #0x2\n"
+ "st1 { v10.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[6], [x11], #0x1\n"
+ "st1 { v7.b }[6], [x10], #0x1\n"
+ "st1 { v2.b }[6], [x9], #0x1\n"
+ "st1 { v10.b }[6], [x28], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[4], [x11], #0x1\n"
+ "st1 { v7.b }[4], [x10], #0x1\n"
+ "st1 { v2.b }[4], [x9], #0x1\n"
+ "st1 { v10.b }[4], [x28], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "st1 { v9.h }[0], [x11], #0x2\n"
+ "st1 { v7.h }[0], [x10], #0x2\n"
+ "st1 { v2.h }[0], [x9], #0x2\n"
+ "st1 { v10.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[2], [x11], #0x1\n"
+ "st1 { v7.b }[2], [x10], #0x1\n"
+ "st1 { v2.b }[2], [x9], #0x1\n"
+ "st1 { v10.b }[2], [x28], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "st1 { v9.b }[0], [x11], #0x1\n"
+ "st1 { v7.b }[0], [x10], #0x1\n"
+ "st1 { v2.b }[0], [x9], #0x1\n"
+ "st1 { v10.b }[0], [x28], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..7c05b36f36
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+
+
+void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(  // defined in the matching generic.cpp
+  const unsigned int,                // n_channels
+  const uint8_t *const *const,       // inptrs: pointers to the input patch rows/cols
+  const int8_t *const,               // weights (signed 8-bit)
+  const int32_t *const,              // bias
+  const arm_gemm::Requantize32 &,    // qp: requantization parameters (offsets, min/max)
+  const int32_t *const,              // requant_muls
+  const int32_t *const,              // requant_shifts
+  uint8_t *const *const              // outptrs: pointers to the 2x2 output tile rows
+);
+
+class a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>  // u8 input / s8 weights / u8 output, i32 accumulators
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;  // shorthand for the strategy base
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 filter
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;  // stride 2 in both dimensions
+  constexpr static unsigned int stride_cols = 2;
+
+  a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}  // presumably (out rows, out cols, kernel rows, kernel cols, stride rows, stride cols) -- confirm against Parent's ctor
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }  // plain NEON, no scalable vectors
+
+  Parent::KernelType kernel = a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;  // the assembly routine declared above
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..e9db8e1322
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+  struct Params  // argument block the inline asm below reads via the offsetof_Params_* constants
+  {
+    uint64_t n_channels;
+    const void *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;  // asm reads a_offset/b_offset/c_offset/minval/maxval from here
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[25];  // input pointers, permuted from the caller's order in the ctor
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const void *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+      requant(&qp), requant_muls(requant_muls),
+      requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      inptrs[0] = inptrs_raw[12];  // kernel-specific permutation: re-order the 25 input pointers into the asm's traversal order
+      inptrs[1] = inptrs_raw[0];
+      inptrs[2] = inptrs_raw[1];
+      inptrs[3] = inptrs_raw[3];
+      inptrs[4] = inptrs_raw[4];
+      inptrs[5] = inptrs_raw[5];
+      inptrs[6] = inptrs_raw[6];
+      inptrs[7] = inptrs_raw[2];
+      inptrs[8] = inptrs_raw[8];
+      inptrs[9] = inptrs_raw[9];
+      inptrs[10] = inptrs_raw[7];
+      inptrs[11] = inptrs_raw[15];
+      inptrs[12] = inptrs_raw[10];
+      inptrs[13] = inptrs_raw[16];
+      inptrs[14] = inptrs_raw[11];
+      inptrs[15] = inptrs_raw[18];
+      inptrs[16] = inptrs_raw[13];
+      inptrs[17] = inptrs_raw[19];
+      inptrs[18] = inptrs_raw[20];
+      inptrs[19] = inptrs_raw[14];
+      inptrs[20] = inptrs_raw[21];
+      inptrs[21] = inptrs_raw[17];
+      inptrs[22] = inptrs_raw[23];
+      inptrs[23] = inptrs_raw[22];
+      inptrs[24] = inptrs_raw[24];
+
+    }
+  };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x8, x7, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v6.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v15.16b }, [x21]\n"
+ "ld1r { v13.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "ld1r { v24.8h }, [x20]\n"
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "add x15, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x11, x10, [x22, #0x0]\n"
+ "ldp x9, x28, [x22, #0x10]\n"
+ "cbz x8, 3f\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "subs x8, x8, #0x1\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d25, [x27, x17]\n"
+ "ldr d27, [x26, x17]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d1, [x25, x17]\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "ldr d12, [x23, x17]\n"
+ "ldr d16, [x22, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "ldr d23, [x21, x17]\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr q30, [x13, #0x0]\n"
+ "ldr q29, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d18, [x22, x17]\n"
+ "ldr d16, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "ldr x21, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x14, x14, #0x48\n"
+ "smlal v20.4s, v18.4h, v7.4h\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v30.4s\n"
+ "subs x8, x8, #0x1\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v18.8h, v7.8h\n"
+ "and v28.16b, v5.16b, v29.16b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x12, x12, #0x20\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v30.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v30.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v21.16b, v29.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v23.16b, v20.16b, v29.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v9.16b, v19.16b, v29.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v25.16b\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v22.16b, v0.16b, v25.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v12.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v23.4s\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v9.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v29.4s\n"
+ "sqadd v0.4s, v0.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v29.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q3, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "add x16, x16, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ldr d25, [x27, x17]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldr d27, [x26, x17]\n"
+ "ldr d1, [x25, x17]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d2, [x24, x17]\n"
+ "ldr d12, [x23, x17]\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d23, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr q29, [x13, #0x0]\n"
+ "ldr q30, [x12, #0x0]\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x21, [x15, #0x58]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "ldr x25, [x15, #0x60]\n"
+ "ldr x24, [x15, #0x80]\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "ldr d27, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "ldr x23, [x15, #0x68]\n"
+ "ldr x22, [x15, #0x88]\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "ldr d25, [x20, x17]\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal v20.4s, v27.4h, v28.4h\n"
+ "smlal v19.4s, v25.4h, v18.4h\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "ldr d1, [x25, x17]\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "ldr d2, [x24, x17]\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v28.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v18.8h\n"
+ "ldr d25, [x22, x17]\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "ldr x25, [x15, #0x98]\n"
+ "smlal v20.4s, v1.4h, v11.4h\n"
+ "smlal v19.4s, v2.4h, v22.4h\n"
+ "ldr x24, [x15, #0x50]\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "ldr d16, [x21, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "ldr d12, [x20, x17]\n"
+ "ldr x23, [x15, #0x48]\n"
+ "smlal2 v0.4s, v1.8h, v11.8h\n"
+ "smlal2 v31.4s, v2.8h, v22.8h\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal v20.4s, v27.4h, v18.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x22, [x15, #0xa0]\n"
+ "smlal v19.4s, v25.4h, v9.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "ldr d23, [x25, x17]\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "ldr d11, [x24, x17]\n"
+ "usubl v11.8h, v11.8b, v6.8b\n"
+ "smlal2 v0.4s, v27.8h, v18.8h\n"
+ "ldr d27, [x23, x17]\n"
+ "smlal2 v31.4s, v25.8h, v9.8h\n"
+ "ldr d25, [x21, x17]\n"
+ "ldr x21, [x15, #0xb0]\n"
+ "smlal v21.4s, v16.4h, v18.4h\n"
+ "smlal v20.4s, v12.4h, v22.4h\n"
+ "smlal v19.4s, v23.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "ldr d10, [x20, x17]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v5.4s, v11.4h, v9.4h\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal2 v8.4s, v16.8h, v18.8h\n"
+ "ldr d16, [x22, x17]\n"
+ "ldr d18, [x21, x17]\n"
+ "smlal2 v0.4s, v12.8h, v22.8h\n"
+ "ldr d22, [x20, x17]\n"
+ "smlal2 v31.4s, v23.8h, v14.8h\n"
+ "ldr q14, [x13, #0x10]\n"
+ "smlal v21.4s, v27.4h, v9.4h\n"
+ "smlal v20.4s, v25.4h, v26.4h\n"
+ "smlal v19.4s, v10.4h, v28.4h\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal2 v3.4s, v11.8h, v9.8h\n"
+ "usubl v18.8h, v18.8b, v6.8b\n"
+ "smlal v5.4s, v1.4h, v26.4h\n"
+ "tst x7, #0x7\n"
+ "smlal2 v8.4s, v27.8h, v9.8h\n"
+ "ldr d27, [x20, x17]\n"
+ "smlal2 v0.4s, v25.8h, v26.8h\n"
+ "ldr q25, [x12, #0x10]\n"
+ "smlal2 v31.4s, v10.8h, v28.8h\n"
+ "smlal v21.4s, v11.4h, v28.4h\n"
+ "usubl v22.8h, v22.8b, v6.8b\n"
+ "add x17, x17, #0x8\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal v19.4s, v18.4h, v7.4h\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v3.4s, v1.8h, v26.8h\n"
+ "smlal v5.4s, v12.4h, v7.4h\n"
+ "sqrdmulh v5.4s, v5.4s, v29.4s\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v8.4s, v11.8h, v28.8h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "and v16.16b, v5.16b, v30.16b\n"
+ "smlal2 v31.4s, v18.8h, v7.8h\n"
+ "smlal v21.4s, v2.4h, v7.4h\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "smlal v20.4s, v10.4h, v9.4h\n"
+ "smlal v19.4s, v22.4h, v26.4h\n"
+ "sqadd v5.4s, v5.4s, v16.4s\n"
+ "smlal2 v3.4s, v12.8h, v7.8h\n"
+ "smlal2 v8.4s, v2.8h, v7.8h\n"
+ "sqrdmulh v3.4s, v3.4s, v14.4s\n"
+ "smlal2 v0.4s, v10.8h, v9.8h\n"
+ "smlal2 v31.4s, v22.8h, v26.8h\n"
+ "and v16.16b, v3.16b, v25.16b\n"
+ "smlal v21.4s, v23.4h, v4.4h\n"
+ "smlal v20.4s, v22.4h, v4.4h\n"
+ "sqrdmulh v21.4s, v21.4s, v29.4s\n"
+ "smlal v19.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "sqrdmulh v20.4s, v20.4s, v29.4s\n"
+ "smlal2 v0.4s, v22.8h, v4.8h\n"
+ "smlal2 v31.4s, v27.8h, v4.8h\n"
+ "sqrdmulh v19.4s, v19.4s, v29.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v23.16b, v21.16b, v30.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v14.4s\n"
+ "and v27.16b, v20.16b, v30.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v14.4s\n"
+ "and v22.16b, v19.16b, v30.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v14.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v14.16b, v8.16b, v25.16b\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "and v18.16b, v0.16b, v25.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v25.16b\n"
+ "sqadd v21.4s, v21.4s, v23.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v27.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v22.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v30.4s\n"
+ "srshl v21.4s, v21.4s, v30.4s\n"
+ "sqadd v8.4s, v8.4s, v14.4s\n"
+ "srshl v20.4s, v20.4s, v30.4s\n"
+ "sqadd v0.4s, v0.4s, v18.4s\n"
+ "srshl v19.4s, v19.4s, v30.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v25.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v25.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v25.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v25.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "str d5, [x11, x16]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str d21, [x10, x16]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d20, [x9, x16]\n"
+ "str d19, [x28, x16]\n"
+ "add x16, x16, #0x8\n"
+ "beq 88f\n"
+ "add x14, x14, #0x48\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x7, #2, 5f\n"
+ "ld1 { v5.4s }, [x20], #0x10\n"
+ "tbz x7, #1, 4f\n"
+ "ld1 { v3.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v3.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x7, #1, 6f\n"
+ "ld1 { v5.d }[0], [x20], #0x8\n"
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 7f\n"
+ "ld1 { v5.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d11, [x14, #0x0]\n"
+ "ldr d22, [x14, #0x8]\n"
+ "mov v21.16b, v5.16b\n"
+ "mov v8.16b, v3.16b\n"
+ "ldr d14, [x14, #0x10]\n"
+ "ldr d28, [x14, #0x18]\n"
+ "mov v20.16b, v5.16b\n"
+ "mov v0.16b, v3.16b\n"
+ "ldr d18, [x14, #0x20]\n"
+ "ldr d9, [x14, #0x28]\n"
+ "mov v19.16b, v5.16b\n"
+ "mov v31.16b, v3.16b\n"
+ "ldr d26, [x14, #0x30]\n"
+ "ldr d7, [x14, #0x38]\n"
+ "ssubl v11.8h, v11.8b, v15.8b\n"
+ "ssubl v22.8h, v22.8b, v15.8b\n"
+ "ldr d4, [x14, #0x40]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ssubl v14.8h, v14.8b, v15.8b\n"
+ "ssubl v28.8h, v28.8b, v15.8b\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ssubl v18.8h, v18.8b, v15.8b\n"
+ "ssubl v9.8h, v9.8b, v15.8b\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ssubl v26.8h, v26.8b, v15.8b\n"
+ "ssubl v7.8h, v7.8b, v15.8b\n"
+ "ssubl v4.8h, v4.8b, v15.8b\n"
+ "add x27, x27, x17\n"
+ "add x26, x26, x17\n"
+ "add x25, x25, x17\n"
+ "add x24, x24, x17\n"
+ "add x23, x23, x17\n"
+ "add x22, x22, x17\n"
+ "add x21, x21, x17\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 9f\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "ld1 { v1.s }[0], [x25], #0x4\n"
+ "ld1 { v2.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v16.s }[0], [x22], #0x4\n"
+ "ld1 { v23.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 8f\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v1.h }[2], [x25], #0x2\n"
+ "ld1 { v2.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v16.h }[2], [x22], #0x2\n"
+ "ld1 { v23.h }[2], [x21], #0x2\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v1.b }[6], [x25]\n"
+ "ld1 { v2.b }[6], [x24]\n"
+ "ld1 { v12.b }[6], [x23]\n"
+ "ld1 { v16.b }[6], [x22]\n"
+ "ld1 { v23.b }[6], [x21]\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v1.b }[4], [x25]\n"
+ "ld1 { v2.b }[4], [x24]\n"
+ "ld1 { v12.b }[4], [x23]\n"
+ "ld1 { v16.b }[4], [x22]\n"
+ "ld1 { v23.b }[4], [x21]\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x7, #1, 10f\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "ld1 { v1.h }[0], [x25], #0x2\n"
+ "ld1 { v2.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v16.h }[0], [x22], #0x2\n"
+ "ld1 { v23.h }[0], [x21], #0x2\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x25]\n"
+ "ld1 { v2.b }[2], [x24]\n"
+ "ld1 { v12.b }[2], [x23]\n"
+ "ld1 { v16.b }[2], [x22]\n"
+ "ld1 { v23.b }[2], [x21]\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 11f\n"
+ "ld1 { v25.b }[0], [x27]\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "ld1 { v1.b }[0], [x25]\n"
+ "ld1 { v2.b }[0], [x24]\n"
+ "ld1 { v12.b }[0], [x23]\n"
+ "ld1 { v16.b }[0], [x22]\n"
+ "ld1 { v23.b }[0], [x21]\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v25.8h, v25.8b, v6.8b\n"
+ "smlal v5.4s, v25.4h, v4.4h\n"
+ "smlal2 v3.4s, v25.8h, v4.8h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "usubl v27.8h, v27.8b, v6.8b\n"
+ "smlal v5.4s, v27.4h, v11.4h\n"
+ "smlal2 v3.4s, v27.8h, v11.8h\n"
+ "usubl v1.8h, v1.8b, v6.8b\n"
+ "smlal v21.4s, v25.4h, v26.4h\n"
+ "smlal2 v8.4s, v25.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "smlal v5.4s, v1.4h, v22.4h\n"
+ "smlal2 v3.4s, v1.8h, v22.8h\n"
+ "usubl v2.8h, v2.8b, v6.8b\n"
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v21.4s, v2.4h, v22.4h\n"
+ "smlal2 v8.4s, v2.8h, v22.8h\n"
+ "smlal v5.4s, v16.4h, v28.4h\n"
+ "smlal2 v3.4s, v16.8h, v28.8h\n"
+ "usubl v12.8h, v12.8b, v6.8b\n"
+ "usubl v23.8h, v23.8b, v6.8b\n"
+ "smlal v21.4s, v12.4h, v14.4h\n"
+ "smlal2 v8.4s, v12.8h, v14.8h\n"
+ "smlal v5.4s, v23.4h, v18.4h\n"
+ "smlal2 v3.4s, v23.8h, v18.8h\n"
+ "usubl v10.8h, v10.8b, v6.8b\n"
+ "smlal v20.4s, v25.4h, v14.4h\n"
+ "smlal2 v0.4s, v25.8h, v14.8h\n"
+ "smlal v19.4s, v25.4h, v11.4h\n"
+ "smlal2 v31.4s, v25.8h, v11.8h\n"
+ "smlal v5.4s, v10.4h, v14.4h\n"
+ "smlal2 v3.4s, v10.8h, v14.8h\n"
+ "smlal v21.4s, v10.4h, v11.4h\n"
+ "smlal2 v8.4s, v10.8h, v11.8h\n"
+ "tbz x7, #2, 13f\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 12f\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x7, #1, 14f\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 15f\n"
+ "ld1 { v15.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v15.8h, v15.8b, v6.8b\n"
+ "ldr x20, [x15, #0x48]\n"
+ "smlal v21.4s, v15.4h, v18.4h\n"
+ "smlal2 v8.4s, v15.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 17f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 16f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x7, #1, 18f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 19f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x50]\n"
+ "smlal v21.4s, v16.4h, v9.4h\n"
+ "smlal2 v8.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 21f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 20f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x7, #1, 22f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 23f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x58]\n"
+ "smlal v5.4s, v16.4h, v9.4h\n"
+ "smlal2 v3.4s, v16.8h, v9.8h\n"
+ "smlal v21.4s, v16.4h, v28.4h\n"
+ "smlal2 v8.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 25f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 24f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x7, #1, 26f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 27f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x60]\n"
+ "smlal v20.4s, v16.4h, v28.4h\n"
+ "smlal2 v0.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 29f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 28f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x7, #1, 30f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 31f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x68]\n"
+ "smlal v5.4s, v16.4h, v26.4h\n"
+ "smlal2 v3.4s, v16.8h, v26.8h\n"
+ "smlal v20.4s, v16.4h, v11.4h\n"
+ "smlal2 v0.4s, v16.8h, v11.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 33f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 32f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x7, #1, 34f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 35f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x70]\n"
+ "smlal v20.4s, v16.4h, v18.4h\n"
+ "smlal2 v0.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 37f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 36f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x7, #1, 38f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 39f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x78]\n"
+ "smlal v5.4s, v16.4h, v7.4h\n"
+ "smlal2 v3.4s, v16.8h, v7.8h\n"
+ "smlal v20.4s, v16.4h, v22.4h\n"
+ "smlal2 v0.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 41f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 40f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x7, #1, 42f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 43f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x80]\n"
+ "smlal v19.4s, v16.4h, v18.4h\n"
+ "smlal2 v31.4s, v16.8h, v18.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 45f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 44f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x7, #1, 46f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 47f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x88]\n"
+ "smlal v21.4s, v16.4h, v7.4h\n"
+ "smlal2 v8.4s, v16.8h, v7.8h\n"
+ "smlal v19.4s, v16.4h, v22.4h\n"
+ "smlal2 v31.4s, v16.8h, v22.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 49f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 48f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x7, #1, 50f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 51f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x90]\n"
+ "smlal v19.4s, v16.4h, v9.4h\n"
+ "smlal2 v31.4s, v16.8h, v9.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 53f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 52f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x7, #1, 54f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 55f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0x98]\n"
+ "smlal v20.4s, v16.4h, v26.4h\n"
+ "smlal2 v0.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 57f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 56f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x7, #1, 58f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 59f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa0]\n"
+ "smlal v21.4s, v16.4h, v4.4h\n"
+ "smlal2 v8.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v14.4h\n"
+ "smlal2 v31.4s, v16.8h, v14.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 61f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 60f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x7, #1, 62f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 63f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "smlal v20.4s, v16.4h, v7.4h\n"
+ "smlal2 v0.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 65f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 64f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x7, #1, 66f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 67f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v0.4s, v16.8h, v9.8h\n"
+ "smlal v19.4s, v16.4h, v28.4h\n"
+ "smlal2 v31.4s, v16.8h, v28.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 69f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 68f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x7, #1, 70f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 71f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "smlal v19.4s, v16.4h, v7.4h\n"
+ "smlal2 v31.4s, v16.8h, v7.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x7, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "smlal v20.4s, v16.4h, v4.4h\n"
+ "smlal2 v0.4s, v16.8h, v4.8h\n"
+ "smlal v19.4s, v16.4h, v26.4h\n"
+ "smlal2 v31.4s, v16.8h, v26.8h\n"
+ "add x20, x20, x17\n"
+ "tbz x7, #2, 77f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x7, #1, 76f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x7, #1, 78f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 79f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "usubl v16.8h, v16.8b, v6.8b\n"
+ "smlal v19.4s, v16.4h, v4.4h\n"
+ "smlal2 v31.4s, v16.8h, v4.8h\n"
+ "tbz x7, #2, 81f\n"
+ "ld1 { v14.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x12], #0x10\n"
+ "tbz x7, #1, 80f\n"
+ "ld1 { v18.d }[0], [x13], #0x8\n"
+ "ld1 { v12.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x12]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v18.s }[0], [x13]\n"
+ "ld1 { v12.s }[0], [x12]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x7, #1, 82f\n"
+ "ld1 { v14.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x12], #0x8\n"
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x12]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 83f\n"
+ "ld1 { v14.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x12]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v28.16b, v5.16b, v25.16b\n"
+ "add x11, x11, x16\n"
+ "add x10, x10, x16\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "add x9, x9, x16\n"
+ "add x28, x28, x16\n"
+ "and v16.16b, v3.16b, v12.16b\n"
+ "sqrdmulh v21.4s, v21.4s, v14.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v14.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v5.4s, v5.4s, v28.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v14.16b, v21.16b, v25.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v6.16b, v20.16b, v25.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqrdmulh v31.4s, v31.4s, v18.4s\n"
+ "sqadd v3.4s, v3.4s, v16.4s\n"
+ "sshr v14.4s, v14.4s, #0x1f\n"
+ "and v18.16b, v8.16b, v12.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v7.16b, v0.16b, v12.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v16.16b, v31.16b, v12.16b\n"
+ "sqadd v21.4s, v21.4s, v14.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v6.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "srshl v5.4s, v5.4s, v25.4s\n"
+ "srshl v21.4s, v21.4s, v25.4s\n"
+ "sqadd v8.4s, v8.4s, v18.4s\n"
+ "srshl v20.4s, v20.4s, v25.4s\n"
+ "sqadd v0.4s, v0.4s, v7.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v31.4s, v31.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v12.4s\n"
+ "sqxtn v5.4h, v5.4s\n"
+ "srshl v8.4s, v8.4s, v12.4s\n"
+ "sqxtn v21.4h, v21.4s\n"
+ "srshl v0.4s, v0.4s, v12.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v31.4s, v31.4s, v12.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "sqxtn2 v5.8h, v3.4s\n"
+ "sqxtn2 v21.8h, v8.4s\n"
+ "sqxtn2 v20.8h, v0.4s\n"
+ "sqxtn2 v19.8h, v31.4s\n"
+ "sqadd v5.8h, v5.8h, v13.8h\n"
+ "sqadd v21.8h, v21.8h, v13.8h\n"
+ "sqadd v20.8h, v20.8h, v13.8h\n"
+ "sqadd v19.8h, v19.8h, v13.8h\n"
+ "smax v5.8h, v5.8h, v17.8h\n"
+ "smax v21.8h, v21.8h, v17.8h\n"
+ "smax v20.8h, v20.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smin v5.8h, v5.8h, v24.8h\n"
+ "smin v21.8h, v21.8h, v24.8h\n"
+ "smin v20.8h, v20.8h, v24.8h\n"
+ "smin v19.8h, v19.8h, v24.8h\n"
+ "uzp1 v5.16b, v5.16b, v5.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "tbz x7, #2, 85f\n"
+ "st1 { v5.s }[0], [x11], #0x4\n"
+ "st1 { v21.s }[0], [x10], #0x4\n"
+ "st1 { v20.s }[0], [x9], #0x4\n"
+ "st1 { v19.s }[0], [x28], #0x4\n"
+ "tbz x7, #1, 84f\n"
+ "st1 { v5.h }[2], [x11], #0x2\n"
+ "st1 { v21.h }[2], [x10], #0x2\n"
+ "st1 { v20.h }[2], [x9], #0x2\n"
+ "st1 { v19.h }[2], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[6], [x11], #0x1\n"
+ "st1 { v21.b }[6], [x10], #0x1\n"
+ "st1 { v20.b }[6], [x9], #0x1\n"
+ "st1 { v19.b }[6], [x28], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[4], [x11], #0x1\n"
+ "st1 { v21.b }[4], [x10], #0x1\n"
+ "st1 { v20.b }[4], [x9], #0x1\n"
+ "st1 { v19.b }[4], [x28], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x7, #1, 86f\n"
+ "st1 { v5.h }[0], [x11], #0x2\n"
+ "st1 { v21.h }[0], [x10], #0x2\n"
+ "st1 { v20.h }[0], [x9], #0x2\n"
+ "st1 { v19.h }[0], [x28], #0x2\n"
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[2], [x11], #0x1\n"
+ "st1 { v21.b }[2], [x10], #0x1\n"
+ "st1 { v20.b }[2], [x9], #0x1\n"
+ "st1 { v19.b }[2], [x28], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x7, #0, 87f\n"
+ "st1 { v5.b }[0], [x11], #0x1\n"
+ "st1 { v21.b }[0], [x10], #0x1\n"
+ "st1 { v20.b }[0], [x9], #0x1\n"
+ "st1 { v19.b }[0], [x28], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..5d53b17e53
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Kernel entry point: uint8 activations x int8 weights -> requantized uint8
+// output; 5x5 window, stride 1, one 2x2 output tile per invocation. The
+// hand-written assembly body lives in the matching generic.cpp.
+// Parameter meanings follow the named definition in generic.cpp:
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int,                // n_channels
+  const uint8_t *const *const,       // inptrs: array of input row pointers
+  const int8_t *const,               // weights (packed)
+  const int32_t *const,              // bias (per channel)
+  const arm_gemm::Requantize32 &,    // qp: requantization parameters
+  const int32_t *const,              // requant_muls (per channel)
+  const int32_t *const,              // requant_shifts (per channel)
+  uint8_t *const *const              // outptrs: array of output row pointers
+);
+
+// Depthfirst depthwise strategy wrapper for the u8/s8->u8 quantized 5x5,
+// stride-1 NHWC kernel above. Template arguments of the parent are
+// <input type, weight type, output type, accumulator type>.
+class a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Constructor arguments appear to be (output rows, output cols, kernel
+  // rows, kernel cols, stride rows, stride cols) — matches the constants
+  // above and the 2x2 output tile in the kernel name; confirm against the
+  // Parent declaration.
+  a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+  // Fixed-width NEON implementation: no scalable (SVE) vector length.
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  Parent::KernelType kernel = a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..df955206e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2187 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ // All kernel arguments marshalled into one contiguous struct so the inline
+ // assembly below can address every field as a fixed offset from a single
+ // base register (%x[params] with the offsetof_Params_* constants).
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ // Input pointers, re-ordered at construction (see ctor body below).
+ const uint8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ // The first 14 raw input pointers are permuted (0,1,6,7,2,8,3,4,11,12,
+ // 9,10,5,13); entries 14..35 are copied straight through. Presumably
+ // this matches the order in which the assembly loads them -- the
+ // permutation is fixed by the generated kernel, do not "simplify" it.
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x1, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "lsr x2, x1, #0x3\n"
+ "add x20, x23, %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v18.16b }, [x20]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x23, %[offsetof_Requantize32_b_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v13.16b }, [x21]\n"
+ "ld1r { v26.8h }, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_minval]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v0.8h }, [x20]\n"
+ "mov x3, #0x0\n"
+ "mov x4, #0x0\n"
+ "add x5, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
+ "ldr x7, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x17, x16, [x22, #0x0]\n"
+ "ldp x15, x14, [x22, #0x10]\n"
+ "cbz x2, 3f\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "subs x2, x2, #0x1\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ldr d31, [x9, x3]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldr d17, [x28, x3]\n"
+ "ldr d30, [x27, x3]\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "ldr d16, [x26, x3]\n"
+ "ldr d3, [x25, x3]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ldr d4, [x24, x3]\n"
+ "ldr d25, [x23, x3]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "ldr d9, [x22, x3]\n"
+ "ldr d29, [x21, x3]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "ldr d2, [x6, #0x28]\n"
+ "ldr d27, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d1, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x21, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x20, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "ldr x20, [x5, #0x90]\n"
+ "ldr x23, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x21, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x22, [x5, #0xa0]\n"
+ "ldr x21, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v27.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x20, x3]\n"
+ "smlal v20.4s, v16.4h, v2.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal v23.4s, v14.4h, v2.4h\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "ldr x13, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v27.8h\n"
+ "smlal v7.4s, v4.4h, v1.4h\n"
+ "ldr x12, [x5, #0xc0]\n"
+ "ldr x11, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v2.8h\n"
+ "ldr d16, [x23, x3]\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v2.8h\n"
+ "ldr d2, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v27.4h\n"
+ "smlal v23.4s, v25.4h, v27.4h\n"
+ "ldr x10, [x5, #0xd0]\n"
+ "ldr x9, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v1.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x28, [x5, #0xe0]\n"
+ "ldr x27, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v27.8h\n"
+ "ldr d4, [x22, x3]\n"
+ "smlal2 v22.4s, v14.8h, v27.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v27.8h\n"
+ "ldr d27, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v1.4h\n"
+ "smlal v23.4s, v10.4h, v1.4h\n"
+ "ldr x26, [x5, #0xf0]\n"
+ "ldr x25, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x24, [x5, #0x100]\n"
+ "ldr x23, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v1.8h\n"
+ "ldr d17, [x21, x3]\n"
+ "smlal2 v22.4s, v25.8h, v1.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v1.8h\n"
+ "ldr d1, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x22, [x5, #0x110]\n"
+ "ldr x21, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "subs x2, x2, #0x1\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x13, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x12, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x11, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v7.4s, v10.4h, v27.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x10, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v2.4h\n"
+ "smlal v23.4s, v17.4h, v2.4h\n"
+ "smlal2 v15.4s, v10.8h, v27.8h\n"
+ "smlal v7.4s, v9.4h, v1.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "ldr d10, [x9, x3]\n"
+ "smlal2 v22.4s, v4.8h, v2.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v2.8h\n"
+ "ldr d2, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v27.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v27.4h\n"
+ "smlal v23.4s, v6.4h, v27.4h\n"
+ "smlal2 v15.4s, v9.8h, v1.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v27.8h\n"
+ "ldr d9, [x28, x3]\n"
+ "smlal2 v22.4s, v17.8h, v27.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v27.8h\n"
+ "ldr d27, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v1.4h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x27, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v1.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v1.8h\n"
+ "ldr d1, [x26, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x25, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x24, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v2.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x23, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "add x6, x6, #0xc8\n"
+ "smlal2 v15.4s, v6.8h, v2.8h\n"
+ "smlal v7.4s, v8.4h, v27.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x22, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal v20.4s, v28.4h, v2.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v2.4h\n"
+ "smlal v23.4s, v12.4h, v2.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v27.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v2.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v2.8h\n"
+ "smlal2 v19.4s, v12.8h, v2.8h\n"
+ "ldr q2, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v27.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v1.4h, v27.4h\n"
+ "smlal v23.4s, v16.4h, v27.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v27.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v1.8h, v27.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v27.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v27.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v27.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v2.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v9.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v25.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v2.4s\n"
+ "and v10.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "and v21.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqadd v15.4s, v15.4s, v9.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v14.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "and v12.16b, v22.16b, v14.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v17.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v25.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v10.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v21.4s\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v12.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v17.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q15, [x20, #0x10]\n"
+ "add x20, x20, #0x20\n"
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "add x4, x4, #0x8\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr d31, [x9, x3]\n"
+ "ldr d17, [x28, x3]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr d30, [x27, x3]\n"
+ "ldr d16, [x26, x3]\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr d3, [x25, x3]\n"
+ "ldr d4, [x24, x3]\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ldr d25, [x23, x3]\n"
+ "ldr d9, [x22, x3]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ldr d29, [x21, x3]\n"
+ "ldr d28, [x20, x3]\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "ldr d27, [x6, #0x28]\n"
+ "ldr d1, [x6, #0x30]\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "ldr d2, [x6, #0x38]\n"
+ "ldr d31, [x6, #0x40]\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "ldr d8, [x6, #0x48]\n"
+ "ldr x22, [x5, #0x50]\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "ldr x20, [x5, #0x58]\n"
+ "ldr x21, [x5, #0x60]\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "ldr d6, [x20, x3]\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x22, [x5, #0x70]\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "ldr d3, [x21, x3]\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "ldr d14, [x20, x3]\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal v23.4s, v17.4h, v10.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x21, [x5, #0x78]\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "ldr d25, [x22, x3]\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v10.8h\n"
+ "ldr d10, [x21, x3]\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal v24.4s, v17.4h, v21.4h\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x24, [x5, #0x88]\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "smlal v7.4s, v30.4h, v27.4h\n"
+ "ldr x23, [x5, #0x90]\n"
+ "ldr x22, [x5, #0x98]\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "ldr d9, [x20, x3]\n"
+ "smlal2 v22.4s, v17.8h, v21.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "ldr d21, [x6, #0x50]\n"
+ "smlal v20.4s, v3.4h, v12.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "ldr x21, [x5, #0xa0]\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal2 v15.4s, v30.8h, v27.8h\n"
+ "ldr d30, [x24, x3]\n"
+ "smlal v7.4s, v16.4h, v1.4h\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v12.8h\n"
+ "ldr d3, [x6, #0x58]\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "ldr d12, [x23, x3]\n"
+ "smlal v20.4s, v16.4h, v27.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal v24.4s, v28.4h, v27.4h\n"
+ "smlal v23.4s, v14.4h, v27.4h\n"
+ "ldr x13, [x5, #0xb0]\n"
+ "ldr x12, [x5, #0xb8]\n"
+ "smlal2 v15.4s, v16.8h, v1.8h\n"
+ "smlal v7.4s, v4.4h, v2.4h\n"
+ "ldr x11, [x5, #0xc0]\n"
+ "ldr x10, [x5, #0xc8]\n"
+ "smlal2 v5.4s, v16.8h, v27.8h\n"
+ "ldr d16, [x22, x3]\n"
+ "smlal2 v22.4s, v28.8h, v27.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v27.8h\n"
+ "ldr d27, [x6, #0x60]\n"
+ "smlal v20.4s, v4.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v1.4h\n"
+ "smlal v23.4s, v25.4h, v1.4h\n"
+ "ldr x9, [x5, #0xd0]\n"
+ "ldr x28, [x5, #0xd8]\n"
+ "smlal2 v15.4s, v4.8h, v2.8h\n"
+ "smlal v7.4s, v17.4h, v31.4h\n"
+ "ldr x27, [x5, #0xe0]\n"
+ "ldr x26, [x5, #0xe8]\n"
+ "smlal2 v5.4s, v4.8h, v1.8h\n"
+ "ldr d4, [x21, x3]\n"
+ "smlal2 v22.4s, v14.8h, v1.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v1.8h\n"
+ "ldr d1, [x6, #0x68]\n"
+ "smlal v20.4s, v17.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v2.4h\n"
+ "smlal v23.4s, v10.4h, v2.4h\n"
+ "ldr x25, [x5, #0xf0]\n"
+ "ldr x24, [x5, #0xf8]\n"
+ "smlal2 v15.4s, v17.8h, v31.8h\n"
+ "smlal v7.4s, v6.4h, v8.4h\n"
+ "ldr x23, [x5, #0x100]\n"
+ "ldr x22, [x5, #0x108]\n"
+ "smlal2 v5.4s, v17.8h, v2.8h\n"
+ "ldr d17, [x20, x3]\n"
+ "smlal2 v22.4s, v25.8h, v2.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v2.8h\n"
+ "ldr d2, [x6, #0x70]\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v31.4h\n"
+ "smlal v23.4s, v9.4h, v31.4h\n"
+ "ldr x21, [x5, #0x110]\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal2 v15.4s, v6.8h, v8.8h\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "tst x1, #0x7\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "ldr d6, [x13, x3]\n"
+ "smlal2 v22.4s, v10.8h, v31.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v31.8h\n"
+ "ldr d31, [x6, #0x78]\n"
+ "smlal v20.4s, v29.4h, v8.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v9.4h, v8.4h\n"
+ "smlal v23.4s, v30.4h, v8.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "ldr d28, [x12, x3]\n"
+ "smlal v7.4s, v14.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v5.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x6, #0x80]\n"
+ "smlal2 v22.4s, v9.8h, v8.8h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal2 v19.4s, v30.8h, v8.8h\n"
+ "ldr d8, [x11, x3]\n"
+ "smlal v20.4s, v14.4h, v21.4h\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "smlal v24.4s, v12.4h, v21.4h\n"
+ "smlal v23.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v14.8h, v3.8h\n"
+ "smlal v7.4s, v25.4h, v27.4h\n"
+ "smlal2 v5.4s, v14.8h, v21.8h\n"
+ "ldr d14, [x10, x3]\n"
+ "smlal2 v22.4s, v12.8h, v21.8h\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "smlal2 v19.4s, v16.8h, v21.8h\n"
+ "ldr d21, [x6, #0x88]\n"
+ "smlal v20.4s, v25.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v16.4h, v3.4h\n"
+ "smlal v23.4s, v4.4h, v3.4h\n"
+ "smlal2 v15.4s, v25.8h, v27.8h\n"
+ "smlal v7.4s, v10.4h, v1.4h\n"
+ "smlal2 v5.4s, v25.8h, v3.8h\n"
+ "ldr d25, [x9, x3]\n"
+ "smlal2 v22.4s, v16.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v3.8h\n"
+ "ldr d3, [x6, #0x90]\n"
+ "smlal v20.4s, v10.4h, v27.4h\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v4.4h, v27.4h\n"
+ "smlal v23.4s, v17.4h, v27.4h\n"
+ "smlal2 v15.4s, v10.8h, v1.8h\n"
+ "smlal v7.4s, v9.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v27.8h\n"
+ "ldr d10, [x28, x3]\n"
+ "smlal2 v22.4s, v4.8h, v27.8h\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "smlal2 v19.4s, v17.8h, v27.8h\n"
+ "ldr d27, [x6, #0x98]\n"
+ "smlal v20.4s, v9.4h, v1.4h\n"
+ "ssubl v27.8h, v27.8b, v13.8b\n"
+ "smlal v24.4s, v17.4h, v1.4h\n"
+ "smlal v23.4s, v6.4h, v1.4h\n"
+ "smlal2 v15.4s, v9.8h, v2.8h\n"
+ "smlal v7.4s, v12.4h, v31.4h\n"
+ "smlal2 v5.4s, v9.8h, v1.8h\n"
+ "ldr d9, [x27, x3]\n"
+ "smlal2 v22.4s, v17.8h, v1.8h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v19.4s, v6.8h, v1.8h\n"
+ "ldr d1, [x6, #0xa0]\n"
+ "smlal v20.4s, v30.4h, v2.4h\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v24.4s, v6.4h, v2.4h\n"
+ "smlal v23.4s, v28.4h, v2.4h\n"
+ "smlal2 v15.4s, v12.8h, v31.8h\n"
+ "ldr d12, [x26, x3]\n"
+ "smlal v7.4s, v16.4h, v29.4h\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d30, [x6, #0xa8]\n"
+ "smlal2 v22.4s, v6.8h, v2.8h\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "smlal2 v19.4s, v28.8h, v2.8h\n"
+ "ldr d2, [x25, x3]\n"
+ "smlal v20.4s, v16.4h, v31.4h\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "smlal v24.4s, v8.4h, v31.4h\n"
+ "smlal v23.4s, v14.4h, v31.4h\n"
+ "smlal2 v15.4s, v16.8h, v29.8h\n"
+ "smlal v7.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v16.8h, v31.8h\n"
+ "ldr d16, [x24, x3]\n"
+ "smlal2 v22.4s, v8.8h, v31.8h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "smlal2 v19.4s, v14.8h, v31.8h\n"
+ "ldr d31, [x6, #0xb0]\n"
+ "smlal v20.4s, v4.4h, v29.4h\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "smlal v24.4s, v14.4h, v29.4h\n"
+ "smlal v23.4s, v25.4h, v29.4h\n"
+ "smlal2 v15.4s, v4.8h, v21.8h\n"
+ "smlal v7.4s, v17.4h, v3.4h\n"
+ "smlal2 v5.4s, v4.8h, v29.8h\n"
+ "ldr d4, [x23, x3]\n"
+ "smlal2 v22.4s, v14.8h, v29.8h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v19.4s, v25.8h, v29.8h\n"
+ "ldr d29, [x6, #0xb8]\n"
+ "smlal v20.4s, v17.4h, v21.4h\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v15.4s, v17.8h, v3.8h\n"
+ "smlal v7.4s, v6.4h, v27.4h\n"
+ "smlal2 v5.4s, v17.8h, v21.8h\n"
+ "ldr d17, [x22, x3]\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "ldr d21, [x6, #0xc0]\n"
+ "smlal v20.4s, v6.4h, v3.4h\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "smlal v24.4s, v10.4h, v3.4h\n"
+ "smlal v23.4s, v9.4h, v3.4h\n"
+ "smlal2 v15.4s, v6.8h, v27.8h\n"
+ "smlal v7.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v6.8h, v3.8h\n"
+ "ldr d6, [x21, x3]\n"
+ "smlal2 v22.4s, v10.8h, v3.8h\n"
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "smlal2 v19.4s, v9.8h, v3.8h\n"
+ "ldr d3, [x20, x3]\n"
+ "smlal v20.4s, v28.4h, v27.4h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v24.4s, v9.4h, v27.4h\n"
+ "smlal v23.4s, v12.4h, v27.4h\n"
+ "add x3, x3, #0x8\n"
+ "smlal2 v15.4s, v8.8h, v1.8h\n"
+ "ldr q8, [x7, #0x0]\n"
+ "smlal v7.4s, v14.4h, v30.4h\n"
+ "smlal2 v5.4s, v28.8h, v27.8h\n"
+ "ldr q28, [x8, #0x0]\n"
+ "smlal2 v22.4s, v9.8h, v27.8h\n"
+ "smlal2 v19.4s, v12.8h, v27.8h\n"
+ "ldr q27, [x7, #0x10]\n"
+ "smlal v20.4s, v14.4h, v1.4h\n"
+ "add x7, x7, #0x20\n"
+ "smlal v24.4s, v2.4h, v1.4h\n"
+ "smlal v23.4s, v16.4h, v1.4h\n"
+ "smlal2 v15.4s, v14.8h, v30.8h\n"
+ "smlal v7.4s, v25.4h, v31.4h\n"
+ "smlal2 v5.4s, v14.8h, v1.8h\n"
+ "ldr q14, [x8, #0x10]\n"
+ "smlal2 v22.4s, v2.8h, v1.8h\n"
+ "add x8, x8, #0x20\n"
+ "smlal2 v19.4s, v16.8h, v1.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal v24.4s, v16.4h, v30.4h\n"
+ "smlal v23.4s, v4.4h, v30.4h\n"
+ "smlal2 v15.4s, v25.8h, v31.8h\n"
+ "smlal v7.4s, v10.4h, v29.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal2 v22.4s, v16.8h, v30.8h\n"
+ "smlal2 v19.4s, v4.8h, v30.8h\n"
+ "smlal v20.4s, v10.4h, v31.4h\n"
+ "smlal v24.4s, v4.4h, v31.4h\n"
+ "smlal v23.4s, v17.4h, v31.4h\n"
+ "smlal2 v15.4s, v10.8h, v29.8h\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "sqrdmulh v7.4s, v7.4s, v8.4s\n"
+ "smlal2 v5.4s, v10.8h, v31.8h\n"
+ "smlal2 v22.4s, v4.8h, v31.8h\n"
+ "and v4.16b, v7.16b, v28.16b\n"
+ "smlal2 v19.4s, v17.8h, v31.8h\n"
+ "smlal v20.4s, v9.4h, v29.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "smlal v24.4s, v17.4h, v29.4h\n"
+ "smlal v23.4s, v6.4h, v29.4h\n"
+ "sqadd v7.4s, v7.4s, v4.4s\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal2 v5.4s, v9.8h, v29.8h\n"
+ "sqrdmulh v15.4s, v15.4s, v27.4s\n"
+ "smlal2 v22.4s, v17.8h, v29.8h\n"
+ "smlal2 v19.4s, v6.8h, v29.8h\n"
+ "and v30.16b, v15.16b, v14.16b\n"
+ "smlal v20.4s, v12.4h, v21.4h\n"
+ "smlal v24.4s, v6.4h, v21.4h\n"
+ "sqrdmulh v20.4s, v20.4s, v8.4s\n"
+ "smlal v23.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v12.8h, v21.8h\n"
+ "sqrdmulh v24.4s, v24.4s, v8.4s\n"
+ "smlal2 v22.4s, v6.8h, v21.8h\n"
+ "smlal2 v19.4s, v3.8h, v21.8h\n"
+ "sqrdmulh v23.4s, v23.4s, v8.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v3.16b, v20.16b, v28.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v27.4s\n"
+ "and v25.16b, v24.16b, v28.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v27.4s\n"
+ "and v16.16b, v23.16b, v28.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v27.4s\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "and v4.16b, v5.16b, v14.16b\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "and v10.16b, v22.16b, v14.16b\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "and v12.16b, v19.16b, v14.16b\n"
+ "sqadd v20.4s, v20.4s, v3.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v25.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v16.4s\n"
+ "sshr v12.4s, v12.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v28.4s\n"
+ "srshl v20.4s, v20.4s, v28.4s\n"
+ "sqadd v5.4s, v5.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v28.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "srshl v23.4s, v23.4s, v28.4s\n"
+ "sqadd v19.4s, v19.4s, v12.4s\n"
+ "srshl v15.4s, v15.4s, v14.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v14.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v14.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v14.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d7, [x17, x4]\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str d20, [x16, x4]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d24, [x15, x4]\n"
+ "str d23, [x14, x4]\n"
+ "add x4, x4, #0x8\n"
+ "beq 124f\n"
+ "add x6, x6, #0xc8\n"
+ "3:" // Oddments
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x1, #2, 5f\n"
+ "ld1 { v7.4s }, [x20], #0x10\n"
+ "tbz x1, #1, 4f\n"
+ "ld1 { v15.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[2], [x20]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v15.s }[0], [x20]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x1, #1, 6f\n"
+ "ld1 { v7.d }[0], [x20], #0x8\n"
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 7f\n"
+ "ld1 { v7.s }[0], [x20]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d6, [x6, #0x0]\n"
+ "ldr d14, [x6, #0x8]\n"
+ "mov v20.16b, v7.16b\n"
+ "mov v5.16b, v15.16b\n"
+ "ldr d10, [x6, #0x10]\n"
+ "ldr d21, [x6, #0x18]\n"
+ "mov v24.16b, v7.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d12, [x6, #0x20]\n"
+ "ldp x9, x28, [x5, #0x0]\n"
+ "mov v23.16b, v7.16b\n"
+ "mov v19.16b, v15.16b\n"
+ "ldp x27, x26, [x5, #0x10]\n"
+ "ldp x25, x24, [x5, #0x20]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldp x23, x22, [x5, #0x30]\n"
+ "ldp x21, x20, [x5, #0x40]\n"
+ "ssubl v10.8h, v10.8b, v13.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "add x9, x9, x3\n"
+ "add x28, x28, x3\n"
+ "add x27, x27, x3\n"
+ "add x26, x26, x3\n"
+ "add x25, x25, x3\n"
+ "add x24, x24, x3\n"
+ "add x23, x23, x3\n"
+ "add x22, x22, x3\n"
+ "add x21, x21, x3\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 9f\n"
+ "ld1 { v31.s }[0], [x9], #0x4\n"
+ "ld1 { v17.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v16.s }[0], [x26], #0x4\n"
+ "ld1 { v3.s }[0], [x25], #0x4\n"
+ "ld1 { v4.s }[0], [x24], #0x4\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 8f\n"
+ "ld1 { v31.h }[2], [x9], #0x2\n"
+ "ld1 { v17.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v16.h }[2], [x26], #0x2\n"
+ "ld1 { v3.h }[2], [x25], #0x2\n"
+ "ld1 { v4.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[6], [x9]\n"
+ "ld1 { v17.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v16.b }[6], [x26]\n"
+ "ld1 { v3.b }[6], [x25]\n"
+ "ld1 { v4.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v9.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[4], [x9]\n"
+ "ld1 { v17.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v16.b }[4], [x26]\n"
+ "ld1 { v3.b }[4], [x25]\n"
+ "ld1 { v4.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v9.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x1, #1, 10f\n"
+ "ld1 { v31.h }[0], [x9], #0x2\n"
+ "ld1 { v17.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v16.h }[0], [x26], #0x2\n"
+ "ld1 { v3.h }[0], [x25], #0x2\n"
+ "ld1 { v4.h }[0], [x24], #0x2\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[2], [x9]\n"
+ "ld1 { v17.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v16.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x25]\n"
+ "ld1 { v4.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v9.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 11f\n"
+ "ld1 { v31.b }[0], [x9]\n"
+ "ld1 { v17.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v16.b }[0], [x26]\n"
+ "ld1 { v3.b }[0], [x25]\n"
+ "ld1 { v4.b }[0], [x24]\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "ld1 { v9.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "usubl v17.8h, v17.8b, v18.8b\n"
+ "smlal v7.4s, v31.4h, v6.4h\n"
+ "ldr x20, [x5, #0x50]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "smlal2 v15.4s, v31.8h, v6.8h\n"
+ "smlal v20.4s, v17.4h, v6.4h\n"
+ "smlal2 v5.4s, v17.8h, v6.8h\n"
+ "smlal v24.4s, v30.4h, v6.4h\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "add x20, x20, x3\n"
+ "smlal2 v22.4s, v30.8h, v6.8h\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "smlal v23.4s, v16.4h, v6.4h\n"
+ "smlal2 v19.4s, v16.8h, v6.8h\n"
+ "smlal v7.4s, v17.4h, v14.4h\n"
+ "usubl v4.8h, v4.8b, v18.8b\n"
+ "smlal2 v15.4s, v17.8h, v14.8h\n"
+ "smlal v20.4s, v3.4h, v14.4h\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "smlal2 v5.4s, v3.8h, v14.8h\n"
+ "smlal v24.4s, v16.4h, v14.4h\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal2 v22.4s, v16.8h, v14.8h\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "smlal v23.4s, v4.4h, v14.4h\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "smlal2 v19.4s, v4.8h, v14.8h\n"
+ "smlal v7.4s, v3.4h, v10.4h\n"
+ "smlal2 v15.4s, v3.8h, v10.8h\n"
+ "smlal v20.4s, v25.4h, v10.4h\n"
+ "smlal2 v5.4s, v25.8h, v10.8h\n"
+ "smlal v24.4s, v4.4h, v10.4h\n"
+ "smlal2 v22.4s, v4.8h, v10.8h\n"
+ "tbz x1, #2, 13f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 12f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x1, #1, 14f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 15f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ldr x20, [x5, #0x58]\n"
+ "smlal v23.4s, v27.4h, v10.4h\n"
+ "smlal2 v19.4s, v27.8h, v10.8h\n"
+ "smlal v7.4s, v25.4h, v21.4h\n"
+ "smlal2 v15.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v20.4s, v9.4h, v21.4h\n"
+ "smlal2 v5.4s, v9.8h, v21.8h\n"
+ "smlal v24.4s, v27.4h, v21.4h\n"
+ "smlal2 v22.4s, v27.8h, v21.8h\n"
+ "tbz x1, #2, 17f\n"
+ "ld1 { v6.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 16f\n"
+ "ld1 { v6.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x1, #1, 18f\n"
+ "ld1 { v6.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 19f\n"
+ "ld1 { v6.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v6.8h, v6.8b, v18.8b\n"
+ "ldr x20, [x5, #0x60]\n"
+ "smlal v23.4s, v6.4h, v21.4h\n"
+ "smlal2 v19.4s, v6.8h, v21.8h\n"
+ "smlal v7.4s, v9.4h, v12.4h\n"
+ "smlal2 v15.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 21f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 20f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x1, #1, 22f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 23f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ldr d14, [x6, #0x28]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v6.4h, v12.4h\n"
+ "smlal2 v22.4s, v6.8h, v12.8h\n"
+ "ssubl v14.8h, v14.8b, v13.8b\n"
+ "ldr x20, [x5, #0x68]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v14.4h\n"
+ "smlal2 v15.4s, v30.8h, v14.8h\n"
+ "smlal v20.4s, v16.4h, v14.4h\n"
+ "smlal2 v5.4s, v16.8h, v14.8h\n"
+ "smlal v24.4s, v28.4h, v14.4h\n"
+ "smlal2 v22.4s, v28.8h, v14.8h\n"
+ "tbz x1, #2, 25f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 24f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x1, #1, 26f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 27f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d21, [x6, #0x30]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x70]\n"
+ "smlal v23.4s, v25.4h, v14.4h\n"
+ "smlal2 v19.4s, v25.8h, v14.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v21.4h\n"
+ "smlal2 v15.4s, v16.8h, v21.8h\n"
+ "smlal v20.4s, v4.4h, v21.4h\n"
+ "smlal2 v5.4s, v4.8h, v21.8h\n"
+ "smlal v24.4s, v25.4h, v21.4h\n"
+ "smlal2 v22.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 29f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 28f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x1, #1, 30f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 31f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d9, [x6, #0x38]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0x78]\n"
+ "smlal v23.4s, v10.4h, v21.4h\n"
+ "smlal2 v19.4s, v10.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v4.4h, v9.4h\n"
+ "smlal2 v15.4s, v4.8h, v9.8h\n"
+ "smlal v20.4s, v27.4h, v9.4h\n"
+ "smlal2 v5.4s, v27.8h, v9.8h\n"
+ "smlal v24.4s, v10.4h, v9.4h\n"
+ "smlal2 v22.4s, v10.8h, v9.8h\n"
+ "tbz x1, #2, 33f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 32f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x1, #1, 34f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 35f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d31, [x6, #0x40]\n"
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ssubl v31.8h, v31.8b, v13.8b\n"
+ "ldr x20, [x5, #0x80]\n"
+ "smlal v23.4s, v12.4h, v9.4h\n"
+ "smlal2 v19.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v27.4h, v31.4h\n"
+ "smlal2 v15.4s, v27.8h, v31.8h\n"
+ "smlal v20.4s, v6.4h, v31.4h\n"
+ "smlal2 v5.4s, v6.8h, v31.8h\n"
+ "smlal v24.4s, v12.4h, v31.4h\n"
+ "smlal2 v22.4s, v12.8h, v31.8h\n"
+ "tbz x1, #2, 37f\n"
+ "ld1 { v8.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 36f\n"
+ "ld1 { v8.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[6], [x20]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[4], [x20]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x1, #1, 38f\n"
+ "ld1 { v8.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[2], [x20]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 39f\n"
+ "ld1 { v8.b }[0], [x20]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d16, [x6, #0x48]\n"
+ "usubl v8.8h, v8.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0x88]\n"
+ "smlal v23.4s, v8.4h, v31.4h\n"
+ "smlal2 v19.4s, v8.8h, v31.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v6.4h, v16.4h\n"
+ "smlal2 v15.4s, v6.8h, v16.8h\n"
+ "smlal v20.4s, v29.4h, v16.4h\n"
+ "smlal2 v5.4s, v29.8h, v16.8h\n"
+ "smlal v24.4s, v8.4h, v16.4h\n"
+ "smlal2 v22.4s, v8.8h, v16.8h\n"
+ "tbz x1, #2, 41f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 40f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x1, #1, 42f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 43f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d21, [x6, #0x50]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0x90]\n"
+ "smlal v23.4s, v27.4h, v16.4h\n"
+ "smlal2 v19.4s, v27.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v21.4h\n"
+ "smlal2 v15.4s, v28.8h, v21.8h\n"
+ "smlal v20.4s, v25.4h, v21.4h\n"
+ "smlal2 v5.4s, v25.8h, v21.8h\n"
+ "tbz x1, #2, 45f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 44f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x1, #1, 46f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 47f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v31.8h, v31.8b, v18.8b\n"
+ "ldr x20, [x5, #0x98]\n"
+ "smlal v24.4s, v31.4h, v21.4h\n"
+ "smlal2 v22.4s, v31.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 49f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 48f\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x1, #1, 50f\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 51f\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d2, [x6, #0x58]\n"
+ "usubl v28.8h, v28.8b, v18.8b\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa0]\n"
+ "smlal v23.4s, v28.4h, v21.4h\n"
+ "smlal2 v19.4s, v28.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v15.4s, v25.8h, v2.8h\n"
+ "smlal v20.4s, v10.4h, v2.4h\n"
+ "smlal2 v5.4s, v10.8h, v2.8h\n"
+ "smlal v24.4s, v28.4h, v2.4h\n"
+ "smlal2 v22.4s, v28.8h, v2.8h\n"
+ "tbz x1, #2, 53f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 52f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x1, #1, 54f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 55f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d25, [x6, #0x60]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v25.8h, v25.8b, v13.8b\n"
+ "ldr x20, [x5, #0xa8]\n"
+ "smlal v23.4s, v21.4h, v2.4h\n"
+ "smlal2 v19.4s, v21.8h, v2.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v10.4h, v25.4h\n"
+ "smlal2 v15.4s, v10.8h, v25.8h\n"
+ "smlal v20.4s, v12.4h, v25.4h\n"
+ "smlal2 v5.4s, v12.8h, v25.8h\n"
+ "smlal v24.4s, v21.4h, v25.4h\n"
+ "smlal2 v22.4s, v21.8h, v25.8h\n"
+ "tbz x1, #2, 57f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 56f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x1, #1, 58f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 59f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d1, [x6, #0x68]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb0]\n"
+ "smlal v23.4s, v9.4h, v25.4h\n"
+ "smlal2 v19.4s, v9.8h, v25.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v12.4h, v1.4h\n"
+ "smlal2 v15.4s, v12.8h, v1.8h\n"
+ "smlal v20.4s, v8.4h, v1.4h\n"
+ "smlal2 v5.4s, v8.8h, v1.8h\n"
+ "smlal v24.4s, v9.4h, v1.4h\n"
+ "smlal2 v22.4s, v9.8h, v1.8h\n"
+ "tbz x1, #2, 61f\n"
+ "ld1 { v3.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 60f\n"
+ "ld1 { v3.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[6], [x20]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[4], [x20]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x1, #1, 62f\n"
+ "ld1 { v3.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 63f\n"
+ "ld1 { v3.b }[0], [x20]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d16, [x6, #0x70]\n"
+ "usubl v3.8h, v3.8b, v18.8b\n"
+ "ssubl v16.8h, v16.8b, v13.8b\n"
+ "ldr x20, [x5, #0xb8]\n"
+ "smlal v23.4s, v3.4h, v1.4h\n"
+ "smlal2 v19.4s, v3.8h, v1.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v8.4h, v16.4h\n"
+ "smlal2 v15.4s, v8.8h, v16.8h\n"
+ "smlal v20.4s, v27.4h, v16.4h\n"
+ "smlal2 v5.4s, v27.8h, v16.8h\n"
+ "smlal v24.4s, v3.4h, v16.4h\n"
+ "smlal2 v22.4s, v3.8h, v16.8h\n"
+ "tbz x1, #2, 65f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 64f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[6], [x20]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[4], [x20]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x1, #1, 66f\n"
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[2], [x20]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 67f\n"
+ "ld1 { v14.b }[0], [x20]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d17, [x6, #0x78]\n"
+ "usubl v14.8h, v14.8b, v18.8b\n"
+ "ssubl v17.8h, v17.8b, v13.8b\n"
+ "ldr x20, [x5, #0xc0]\n"
+ "smlal v23.4s, v14.4h, v16.4h\n"
+ "smlal2 v19.4s, v14.8h, v16.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v31.4h, v17.4h\n"
+ "smlal2 v15.4s, v31.8h, v17.8h\n"
+ "smlal v20.4s, v28.4h, v17.4h\n"
+ "smlal2 v5.4s, v28.8h, v17.8h\n"
+ "tbz x1, #2, 69f\n"
+ "ld1 { v1.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 68f\n"
+ "ld1 { v1.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[6], [x20]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[4], [x20]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x1, #1, 70f\n"
+ "ld1 { v1.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[2], [x20]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 71f\n"
+ "ld1 { v1.b }[0], [x20]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "usubl v1.8h, v1.8b, v18.8b\n"
+ "ldr x20, [x5, #0xc8]\n"
+ "smlal v24.4s, v1.4h, v17.4h\n"
+ "smlal2 v22.4s, v1.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 73f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 72f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x1, #1, 74f\n"
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 75f\n"
+ "ld1 { v16.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d29, [x6, #0x80]\n"
+ "usubl v16.8h, v16.8b, v18.8b\n"
+ "ssubl v29.8h, v29.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd0]\n"
+ "smlal v23.4s, v16.4h, v17.4h\n"
+ "smlal2 v19.4s, v16.8h, v17.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v28.4h, v29.4h\n"
+ "smlal2 v15.4s, v28.8h, v29.8h\n"
+ "smlal v20.4s, v21.4h, v29.4h\n"
+ "smlal2 v5.4s, v21.8h, v29.8h\n"
+ "smlal v24.4s, v16.4h, v29.4h\n"
+ "smlal2 v22.4s, v16.8h, v29.8h\n"
+ "tbz x1, #2, 77f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 76f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x1, #1, 78f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 79f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d12, [x6, #0x88]\n"
+ "usubl v30.8h, v30.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0xd8]\n"
+ "smlal v23.4s, v30.4h, v29.4h\n"
+ "smlal2 v19.4s, v30.8h, v29.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v21.4h, v12.4h\n"
+ "smlal2 v15.4s, v21.8h, v12.8h\n"
+ "smlal v20.4s, v9.4h, v12.4h\n"
+ "smlal2 v5.4s, v9.8h, v12.8h\n"
+ "smlal v24.4s, v30.4h, v12.4h\n"
+ "smlal2 v22.4s, v30.8h, v12.8h\n"
+ "tbz x1, #2, 81f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 80f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x1, #1, 82f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 83f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d21, [x6, #0x90]\n"
+ "usubl v29.8h, v29.8b, v18.8b\n"
+ "ssubl v21.8h, v21.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe0]\n"
+ "smlal v23.4s, v29.4h, v12.4h\n"
+ "smlal2 v19.4s, v29.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v9.4h, v21.4h\n"
+ "smlal2 v15.4s, v9.8h, v21.8h\n"
+ "smlal v20.4s, v3.4h, v21.4h\n"
+ "smlal2 v5.4s, v3.8h, v21.8h\n"
+ "smlal v24.4s, v29.4h, v21.4h\n"
+ "smlal2 v22.4s, v29.8h, v21.8h\n"
+ "tbz x1, #2, 85f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 84f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x1, #1, 86f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 87f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d8, [x6, #0x98]\n"
+ "usubl v25.8h, v25.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0xe8]\n"
+ "smlal v23.4s, v25.4h, v21.4h\n"
+ "smlal2 v19.4s, v25.8h, v21.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v3.4h, v8.4h\n"
+ "smlal2 v15.4s, v3.8h, v8.8h\n"
+ "smlal v20.4s, v14.4h, v8.4h\n"
+ "smlal2 v5.4s, v14.8h, v8.8h\n"
+ "smlal v24.4s, v25.4h, v8.4h\n"
+ "smlal2 v22.4s, v25.8h, v8.8h\n"
+ "tbz x1, #2, 89f\n"
+ "ld1 { v21.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 88f\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x1, #1, 90f\n"
+ "ld1 { v21.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 91f\n"
+ "ld1 { v21.b }[0], [x20]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d9, [x6, #0xa0]\n"
+ "usubl v21.8h, v21.8b, v18.8b\n"
+ "ssubl v9.8h, v9.8b, v13.8b\n"
+ "ldr x20, [x5, #0xf0]\n"
+ "smlal v23.4s, v21.4h, v8.4h\n"
+ "smlal2 v19.4s, v21.8h, v8.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v1.4h, v9.4h\n"
+ "smlal2 v15.4s, v1.8h, v9.8h\n"
+ "smlal v20.4s, v16.4h, v9.4h\n"
+ "smlal2 v5.4s, v16.8h, v9.8h\n"
+ "tbz x1, #2, 93f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 92f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[6], [x20]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[4], [x20]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x1, #1, 94f\n"
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[2], [x20]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 95f\n"
+ "ld1 { v12.b }[0], [x20]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "usubl v12.8h, v12.8b, v18.8b\n"
+ "ldr x20, [x5, #0xf8]\n"
+ "smlal v24.4s, v12.4h, v9.4h\n"
+ "smlal2 v22.4s, v12.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "tbz x1, #2, 97f\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 96f\n"
+ "ld1 { v10.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[6], [x20]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[4], [x20]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x1, #1, 98f\n"
+ "ld1 { v10.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[2], [x20]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 99f\n"
+ "ld1 { v10.b }[0], [x20]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d12, [x6, #0xa8]\n"
+ "usubl v10.8h, v10.8b, v18.8b\n"
+ "ssubl v12.8h, v12.8b, v13.8b\n"
+ "ldr x20, [x5, #0x100]\n"
+ "smlal v23.4s, v10.4h, v9.4h\n"
+ "smlal2 v19.4s, v10.8h, v9.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v16.4h, v12.4h\n"
+ "smlal2 v15.4s, v16.8h, v12.8h\n"
+ "smlal v20.4s, v30.4h, v12.4h\n"
+ "smlal2 v5.4s, v30.8h, v12.8h\n"
+ "smlal v24.4s, v10.4h, v12.4h\n"
+ "smlal2 v22.4s, v10.8h, v12.8h\n"
+ "tbz x1, #2, 101f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 100f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x1, #1, 102f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 103f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d28, [x6, #0xb0]\n"
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "ssubl v28.8h, v28.8b, v13.8b\n"
+ "ldr x20, [x5, #0x108]\n"
+ "smlal v23.4s, v9.4h, v12.4h\n"
+ "smlal2 v19.4s, v9.8h, v12.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v30.4h, v28.4h\n"
+ "smlal2 v15.4s, v30.8h, v28.8h\n"
+ "smlal v20.4s, v29.4h, v28.4h\n"
+ "smlal2 v5.4s, v29.8h, v28.8h\n"
+ "smlal v24.4s, v9.4h, v28.4h\n"
+ "smlal2 v22.4s, v9.8h, v28.8h\n"
+ "tbz x1, #2, 105f\n"
+ "ld1 { v2.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 104f\n"
+ "ld1 { v2.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[6], [x20]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[4], [x20]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x1, #1, 106f\n"
+ "ld1 { v2.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[2], [x20]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 107f\n"
+ "ld1 { v2.b }[0], [x20]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d30, [x6, #0xb8]\n"
+ "usubl v2.8h, v2.8b, v18.8b\n"
+ "ssubl v30.8h, v30.8b, v13.8b\n"
+ "ldr x20, [x5, #0x110]\n"
+ "smlal v23.4s, v2.4h, v28.4h\n"
+ "smlal2 v19.4s, v2.8h, v28.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v29.4h, v30.4h\n"
+ "smlal2 v15.4s, v29.8h, v30.8h\n"
+ "smlal v20.4s, v25.4h, v30.4h\n"
+ "smlal2 v5.4s, v25.8h, v30.8h\n"
+ "smlal v24.4s, v2.4h, v30.4h\n"
+ "smlal2 v22.4s, v2.8h, v30.8h\n"
+ "tbz x1, #2, 109f\n"
+ "ld1 { v27.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 108f\n"
+ "ld1 { v27.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x1, #1, 110f\n"
+ "ld1 { v27.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 111f\n"
+ "ld1 { v27.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d8, [x6, #0xc0]\n"
+ "usubl v27.8h, v27.8b, v18.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr x20, [x5, #0x118]\n"
+ "smlal v23.4s, v27.4h, v30.4h\n"
+ "smlal2 v19.4s, v27.8h, v30.8h\n"
+ "add x20, x20, x3\n"
+ "smlal v7.4s, v25.4h, v8.4h\n"
+ "smlal2 v15.4s, v25.8h, v8.8h\n"
+ "smlal v20.4s, v21.4h, v8.4h\n"
+ "smlal2 v5.4s, v21.8h, v8.8h\n"
+ "smlal v24.4s, v27.4h, v8.4h\n"
+ "smlal2 v22.4s, v27.8h, v8.8h\n"
+ "tbz x1, #2, 113f\n"
+ "ld1 { v9.s }[0], [x20], #0x4\n"
+ "tbz x1, #1, 112f\n"
+ "ld1 { v9.h }[2], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[6], [x20]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[4], [x20]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x1, #1, 114f\n"
+ "ld1 { v9.h }[0], [x20], #0x2\n"
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[2], [x20]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 115f\n"
+ "ld1 { v9.b }[0], [x20]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "usubl v9.8h, v9.8b, v18.8b\n"
+ "smlal v23.4s, v9.4h, v8.4h\n"
+ "smlal2 v19.4s, v9.8h, v8.8h\n"
+ "tbz x1, #2, 117f\n"
+ "ld1 { v30.4s }, [x7], #0x10\n"
+ "ld1 { v12.4s }, [x8], #0x10\n"
+ "tbz x1, #1, 116f\n"
+ "ld1 { v14.d }[0], [x7], #0x8\n"
+ "ld1 { v27.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[2], [x7]\n"
+ "ld1 { v27.s }[2], [x8]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v14.s }[0], [x7]\n"
+ "ld1 { v27.s }[0], [x8]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x1, #1, 118f\n"
+ "ld1 { v30.d }[0], [x7], #0x8\n"
+ "ld1 { v12.d }[0], [x8], #0x8\n"
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[2], [x7]\n"
+ "ld1 { v12.s }[2], [x8]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 119f\n"
+ "ld1 { v30.s }[0], [x7]\n"
+ "ld1 { v12.s }[0], [x8]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqrdmulh v7.4s, v7.4s, v30.4s\n"
+ "and v16.16b, v7.16b, v12.16b\n"
+ "add x17, x17, x4\n"
+ "add x16, x16, x4\n"
+ "sqrdmulh v15.4s, v15.4s, v14.4s\n"
+ "sshr v16.4s, v16.4s, #0x1f\n"
+ "add x15, x15, x4\n"
+ "add x14, x14, x4\n"
+ "and v2.16b, v15.16b, v27.16b\n"
+ "sqrdmulh v20.4s, v20.4s, v30.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v30.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v30.4s\n"
+ "sqadd v7.4s, v7.4s, v16.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "and v21.16b, v20.16b, v12.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v14.4s\n"
+ "and v18.16b, v24.16b, v12.16b\n"
+ "sqrdmulh v22.4s, v22.4s, v14.4s\n"
+ "and v31.16b, v23.16b, v12.16b\n"
+ "sqrdmulh v19.4s, v19.4s, v14.4s\n"
+ "sqadd v15.4s, v15.4s, v2.4s\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v9.16b, v5.16b, v27.16b\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "and v4.16b, v22.16b, v27.16b\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "and v28.16b, v19.16b, v27.16b\n"
+ "sqadd v20.4s, v20.4s, v21.4s\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v18.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v31.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v7.4s, v7.4s, v12.4s\n"
+ "srshl v20.4s, v20.4s, v12.4s\n"
+ "sqadd v5.4s, v5.4s, v9.4s\n"
+ "srshl v24.4s, v24.4s, v12.4s\n"
+ "sqadd v22.4s, v22.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v12.4s\n"
+ "sqadd v19.4s, v19.4s, v28.4s\n"
+ "srshl v15.4s, v15.4s, v27.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "srshl v5.4s, v5.4s, v27.4s\n"
+ "sqxtn v20.4h, v20.4s\n"
+ "srshl v22.4s, v22.4s, v27.4s\n"
+ "sqxtn v24.4h, v24.4s\n"
+ "srshl v19.4s, v19.4s, v27.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v7.8h, v15.4s\n"
+ "sqxtn2 v20.8h, v5.4s\n"
+ "sqxtn2 v24.8h, v22.4s\n"
+ "sqxtn2 v23.8h, v19.4s\n"
+ "sqadd v7.8h, v7.8h, v26.8h\n"
+ "sqadd v20.8h, v20.8h, v26.8h\n"
+ "sqadd v24.8h, v24.8h, v26.8h\n"
+ "sqadd v23.8h, v23.8h, v26.8h\n"
+ "smax v7.8h, v7.8h, v11.8h\n"
+ "smax v20.8h, v20.8h, v11.8h\n"
+ "smax v24.8h, v24.8h, v11.8h\n"
+ "smax v23.8h, v23.8h, v11.8h\n"
+ "smin v7.8h, v7.8h, v0.8h\n"
+ "smin v20.8h, v20.8h, v0.8h\n"
+ "smin v24.8h, v24.8h, v0.8h\n"
+ "smin v23.8h, v23.8h, v0.8h\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x1, #2, 121f\n"
+ "st1 { v7.s }[0], [x17], #0x4\n"
+ "st1 { v20.s }[0], [x16], #0x4\n"
+ "st1 { v24.s }[0], [x15], #0x4\n"
+ "st1 { v23.s }[0], [x14], #0x4\n"
+ "tbz x1, #1, 120f\n"
+ "st1 { v7.h }[2], [x17], #0x2\n"
+ "st1 { v20.h }[2], [x16], #0x2\n"
+ "st1 { v24.h }[2], [x15], #0x2\n"
+ "st1 { v23.h }[2], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[6], [x17], #0x1\n"
+ "st1 { v20.b }[6], [x16], #0x1\n"
+ "st1 { v24.b }[6], [x15], #0x1\n"
+ "st1 { v23.b }[6], [x14], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[4], [x17], #0x1\n"
+ "st1 { v20.b }[4], [x16], #0x1\n"
+ "st1 { v24.b }[4], [x15], #0x1\n"
+ "st1 { v23.b }[4], [x14], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x1, #1, 122f\n"
+ "st1 { v7.h }[0], [x17], #0x2\n"
+ "st1 { v20.h }[0], [x16], #0x2\n"
+ "st1 { v24.h }[0], [x15], #0x2\n"
+ "st1 { v23.h }[0], [x14], #0x2\n"
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[2], [x17], #0x1\n"
+ "st1 { v20.b }[2], [x16], #0x1\n"
+ "st1 { v24.b }[2], [x15], #0x1\n"
+ "st1 { v23.b }[2], [x14], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x1, #0, 123f\n"
+ "st1 { v7.b }[0], [x17], #0x1\n"
+ "st1 { v20.b }[0], [x16], #0x1\n"
+ "st1 { v24.b }[0], [x15], #0x1\n"
+ "st1 { v23.b }[0], [x14], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..2c677d2f62
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);  // (inptrs, outptrs, params, qp, n_points, n_channels) -- assembly kernel defined in generic.cpp
+
+class a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>  // u8 input, s8 weights, u8 output, s32 accumulator
+{
+  KernelType kernel = a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl;  // entry point handed back via get_kernel()
+
+  public:
+  a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>(9, arm_gemm::VLType::None) {}  // 9 output points per call; no vector-length specialisation
+
+  KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c2bec4cdab
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(
+  const uint8_t *const *const inptrs,  // input pointers; nine are consumed per kernel point
+  uint8_t *const *const outptrs,  // nine output pointers (one per output point)
+  const void *params,  // packed s8 weights, read 4 bytes (4 channels) per kernel point
+  const arm_gemm::Requantize32& qp,  // quantisation offsets, muls, shifts, bias and clamp bounds
+  const unsigned int n_points,  // number of kernel points to accumulate per output
+  const unsigned int n_channels  // channels; processed 4 at a time, 1-3 leftovers in "Oddments"
+)
+{  // u8(input) x s8(weight) -> u8 quantised generic depthwise kernel, 9 output points per call
+  __asm__ __volatile__(
+    "lsr x9, %x[n_channels], #0x2\n"  // x9 = number of full 4-channel iterations
+    "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+    "ld1r { v8.4s }, [x20]\n"  // v8 = broadcast output clamp minimum
+    "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+    "ld1r { v7.4s }, [x20]\n"  // v7 = broadcast output clamp maximum
+    "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+    "ld1r { v6.16b }, [x20]\n"  // v6 = input (a) offset, subtracted from every input byte
+    "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+    "ld1r { v5.16b }, [x20]\n"  // v5 = weight (b) offset, subtracted from every weight byte
+    "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+    "ld1r { v4.4s }, [x20]\n"  // v4 = output (c) offset, added after requantisation
+    "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+    "ld1r { v3.4s }, [x20]\n"  // v3 = per-layer left shift (may be overridden per-channel below)
+    "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+    "ld1r { v2.4s }, [x20]\n"  // v2 = per-layer fixed-point multiplier
+    "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+    "ld1r { v1.4s }, [x20]\n"  // v1 = per-layer right shift (negative shift for srshl)
+    "mov x11, #0x0\n"  // x11 = channel offset in bytes into every in/out pointer
+    "cbz x9, 6f\n"  // fewer than 4 channels -> straight to oddments
+    "1:" // Channel loop
+    "movi v23.4s, #0x0\n"  // zero accumulator (copied to v24..v31 for the other 8 outputs)
+    "cbz %x[bias], 2f\n"
+    "ldr q23, [%x[bias], x20]\n"  // seed accumulators with 4 channels of s32 bias
+    "lsl x20, x11, #0x2\n"
+    "2:" // Channel loop: Load bias: Done
+    "ldr s0, [%x[params]], #0x4\n"  // first kernel-point weights (4 x s8), params advances
+    "mov x25, %x[inptrs]\n"  // restart input-pointer list for this channel block
+    "ldp x21, x20, [x25], #0x10\n"
+    "subs x24, %x[n_points], #0x1\n"
+    "ldr s14, [x21, x11]\n"
+    "ldr s15, [x20, x11]\n"
+    "mov v24.16b, v23.16b\n"
+    "mov v25.16b, v23.16b\n"
+    "ldp x21, x20, [x25], #0x10\n"
+    "ldr s16, [x21, x11]\n"
+    "mov v26.16b, v23.16b\n"
+    "mov v27.16b, v23.16b\n"
+    "ldr s17, [x20, x11]\n"
+    "ldp x21, x20, [x25], #0x10\n"
+    "mov v28.16b, v23.16b\n"
+    "mov v29.16b, v23.16b\n"
+    "ldr s18, [x21, x11]\n"
+    "ldr s19, [x20, x11]\n"
+    "mov v30.16b, v23.16b\n"
+    "mov v31.16b, v23.16b\n"
+    "ldp x21, x20, [x25], #0x10\n"
+    "ldr s20, [x21, x11]\n"
+    "ssubl v0.8h, v0.8b, v5.8b\n"  // widen weights to s16 minus b_offset
+    "usubl v14.8h, v14.8b, v6.8b\n"  // widen inputs to s16 minus a_offset (likewise below)
+    "ldr s21, [x20, x11]\n"
+    "ldr x20, [x25], #0x8\n"
+    "usubl v15.8h, v15.8b, v6.8b\n"
+    "usubl v16.8h, v16.8b, v6.8b\n"
+    "ldr s22, [x20, x11]\n"
+    "usubl v17.8h, v17.8b, v6.8b\n"
+    "usubl v18.8h, v18.8b, v6.8b\n"
+    "usubl v19.8h, v19.8b, v6.8b\n"
+    "usubl v20.8h, v20.8b, v6.8b\n"
+    "usubl v21.8h, v21.8b, v6.8b\n"
+    "usubl v22.8h, v22.8b, v6.8b\n"
+    "ble 4f\n"  // only one kernel point -> skip loop, go to tail
+    "3:" // Channel loop: Planar loop
+    "ldp x23, x22, [x25], #0x10\n"
+    "ldp x21, x20, [x25], #0x10\n"
+    "smlal v23.4s, v14.4h, v0.4h\n"  // accumulate point k for all 9 outputs while loading point k+1
+    "smlal v24.4s, v15.4h, v0.4h\n"
+    "ldr s14, [x23, x11]\n"
+    "ldr s15, [x22, x11]\n"
+    "smlal v25.4s, v16.4h, v0.4h\n"
+    "smlal v26.4s, v17.4h, v0.4h\n"
+    "ldr s16, [x21, x11]\n"
+    "ldr s17, [x20, x11]\n"
+    "smlal v27.4s, v18.4h, v0.4h\n"
+    "smlal v28.4s, v19.4h, v0.4h\n"
+    "ldp x21, x20, [x25], #0x10\n"
+    "ldr s18, [x21, x11]\n"
+    "smlal v29.4s, v20.4h, v0.4h\n"
+    "smlal v30.4s, v21.4h, v0.4h\n"
+    "ldr s19, [x20, x11]\n"
+    "ldp x21, x20, [x25], #0x10\n"
+    "smlal v31.4s, v22.4h, v0.4h\n"
+    "subs x24, x24, #0x1\n"
+    "ldr s0, [%x[params]], #0x4\n"  // next kernel point's 4 weights
+    "ldr s20, [x21, x11]\n"
+    "ssubl v0.8h, v0.8b, v5.8b\n"
+    "usubl v14.8h, v14.8b, v6.8b\n"
+    "ldr s21, [x20, x11]\n"
+    "ldr x20, [x25], #0x8\n"
+    "usubl v15.8h, v15.8b, v6.8b\n"
+    "usubl v16.8h, v16.8b, v6.8b\n"
+    "ldr s22, [x20, x11]\n"
+    "usubl v17.8h, v17.8b, v6.8b\n"
+    "usubl v18.8h, v18.8b, v6.8b\n"
+    "usubl v19.8h, v19.8b, v6.8b\n"
+    "usubl v20.8h, v20.8b, v6.8b\n"
+    "usubl v21.8h, v21.8b, v6.8b\n"
+    "usubl v22.8h, v22.8b, v6.8b\n"
+    "bgt 3b\n"
+    "4:" // Channel loop: Planar tail
+    "smlal v23.4s, v14.4h, v0.4h\n"  // accumulate the final kernel point
+    "smlal v24.4s, v15.4h, v0.4h\n"
+    "smlal v25.4s, v16.4h, v0.4h\n"
+    "smlal v26.4s, v17.4h, v0.4h\n"
+    "smlal v27.4s, v18.4h, v0.4h\n"
+    "smlal v28.4s, v19.4h, v0.4h\n"
+    "smlal v29.4s, v20.4h, v0.4h\n"
+    "smlal v30.4s, v21.4h, v0.4h\n"
+    "smlal v31.4s, v22.4h, v0.4h\n"
+    "cbz %x[rq_mul_ptr], 5f\n"  // no per-channel params -> keep per-layer v2/v1/v3
+    "lsl x20, x11, #0x2\n"
+    "ldr q2, [%x[rq_mul_ptr], x20]\n"  // override with per-channel mul / shifts
+    "ldr q1, [%x[rq_right_shift_ptr], x20]\n"
+    "cbz %x[rq_left_shift_ptr], 5f\n"
+    "ldr q3, [%x[rq_left_shift_ptr], x20]\n"
+    "5:" // Channel loop: Load quantisation parameters: Done
+    "sshl v23.4s, v23.4s, v3.4s\n"  // apply left shift before the fixed-point multiply
+    "sshl v24.4s, v24.4s, v3.4s\n"
+    "ldp x28, x27, [%x[outptrs], #0x0]\n"
+    "ldp x26, x25, [%x[outptrs], #0x10]\n"
+    "sshl v25.4s, v25.4s, v3.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v2.4s\n"  // saturating rounding doubling multiply-high
+    "ldp x24, x23, [%x[outptrs], #0x20]\n"
+    "ldp x22, x21, [%x[outptrs], #0x30]\n"
+    "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+    "ldr x20, [%x[outptrs], #0x40]\n"
+    "and v18.16b, v23.16b, v1.16b\n"  // and/sshr/sqadd: rounding correction for the shift
+    "and v17.16b, v24.16b, v1.16b\n"
+    "and v16.16b, v25.16b, v1.16b\n"
+    "sshl v26.4s, v26.4s, v3.4s\n"
+    "sshl v27.4s, v27.4s, v3.4s\n"
+    "sshl v28.4s, v28.4s, v3.4s\n"
+    "sshl v29.4s, v29.4s, v3.4s\n"
+    "sshl v30.4s, v30.4s, v3.4s\n"
+    "sshl v31.4s, v31.4s, v3.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+    "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+    "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+    "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+    "sqadd v23.4s, v23.4s, v18.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v21.16b, v26.16b, v1.16b\n"
+    "and v20.16b, v27.16b, v1.16b\n"
+    "and v19.16b, v28.16b, v1.16b\n"
+    "and v18.16b, v29.16b, v1.16b\n"
+    "and v17.16b, v30.16b, v1.16b\n"
+    "and v16.16b, v31.16b, v1.16b\n"
+    "sshr v21.4s, v21.4s, #0x1f\n"
+    "sshr v20.4s, v20.4s, #0x1f\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v26.4s, v26.4s, v21.4s\n"
+    "sqadd v27.4s, v27.4s, v20.4s\n"
+    "sqadd v28.4s, v28.4s, v19.4s\n"
+    "sqadd v29.4s, v29.4s, v18.4s\n"
+    "sqadd v30.4s, v30.4s, v17.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "srshl v23.4s, v23.4s, v1.4s\n"  // rounding shift by the (negative) right-shift amounts
+    "srshl v24.4s, v24.4s, v1.4s\n"
+    "srshl v25.4s, v25.4s, v1.4s\n"
+    "srshl v26.4s, v26.4s, v1.4s\n"
+    "srshl v27.4s, v27.4s, v1.4s\n"
+    "srshl v28.4s, v28.4s, v1.4s\n"
+    "srshl v29.4s, v29.4s, v1.4s\n"
+    "srshl v30.4s, v30.4s, v1.4s\n"
+    "srshl v31.4s, v31.4s, v1.4s\n"
+    "add v23.4s, v23.4s, v4.4s\n"  // add output offset
+    "add v24.4s, v24.4s, v4.4s\n"
+    "add v25.4s, v25.4s, v4.4s\n"
+    "add v26.4s, v26.4s, v4.4s\n"
+    "add v27.4s, v27.4s, v4.4s\n"
+    "add v28.4s, v28.4s, v4.4s\n"
+    "add v29.4s, v29.4s, v4.4s\n"
+    "add v30.4s, v30.4s, v4.4s\n"
+    "add v31.4s, v31.4s, v4.4s\n"
+    "smax v23.4s, v23.4s, v8.4s\n"  // clamp to [minval, maxval]
+    "smax v24.4s, v24.4s, v8.4s\n"
+    "smax v25.4s, v25.4s, v8.4s\n"
+    "smax v26.4s, v26.4s, v8.4s\n"
+    "smax v27.4s, v27.4s, v8.4s\n"
+    "smax v28.4s, v28.4s, v8.4s\n"
+    "smax v29.4s, v29.4s, v8.4s\n"
+    "smax v30.4s, v30.4s, v8.4s\n"
+    "smax v31.4s, v31.4s, v8.4s\n"
+    "smin v23.4s, v23.4s, v7.4s\n"
+    "smin v24.4s, v24.4s, v7.4s\n"
+    "smin v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v7.4s\n"
+    "smin v27.4s, v27.4s, v7.4s\n"
+    "smin v28.4s, v28.4s, v7.4s\n"
+    "smin v29.4s, v29.4s, v7.4s\n"
+    "smin v30.4s, v30.4s, v7.4s\n"
+    "smin v31.4s, v31.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"  // two uzp1 passes narrow 32-bit lanes to bytes
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "str s23, [x28, x11]\n"  // store 4 u8 results per output pointer
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "str s24, [x27, x11]\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "str s25, [x26, x11]\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "str s26, [x25, x11]\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "str s27, [x24, x11]\n"
+    "str s28, [x23, x11]\n"
+    "str s29, [x22, x11]\n"
+    "str s30, [x21, x11]\n"
+    "str s31, [x20, x11]\n"
+    "add x11, x11, #0x4\n"  // advance channel offset by the 4 channels just written
+    "cmp x11, x9, LSL #2\n"
+    "blt 1b\n"
+    "6:" // Oddments
+    "tst %x[n_channels], #0x3\n"  // 1-3 channels left over?
+    "beq 24f\n"
+    "movi v23.4s, #0x0\n"
+    "cbz %x[bias], 9f\n"
+    "add x20, %x[bias], x11, LSL #2\n"
+    "tbz %x[n_channels], #1, 7f\n"  // element-wise bias load, steered by low bits of n_channels
+    "ld1 { v23.d }[0], [x20], #0x8\n"
+    "tbz %x[n_channels], #0, 8f\n"
+    "ld1 { v23.s }[2], [x20], #0x4\n"
+    "b 8f\n"
+    "7:" // Oddments: Load bias: Bit 1: Unset
+    "ld1 { v23.s }[0], [x20], #0x4\n"
+    "8:" // Oddments: Load bias: Bit 1: End
+    "9:" // Oddments: Load bias: Done
+    "ldr s0, [%x[params]], #0x4\n"
+    "mov x10, %x[inptrs]\n"
+    "ldp x9, x28, [x10], #0x10\n"
+    "mov v24.16b, v23.16b\n"
+    "ldp x27, x26, [x10], #0x10\n"
+    "ldp x25, x24, [x10], #0x10\n"
+    "mov v25.16b, v23.16b\n"
+    "mov v26.16b, v23.16b\n"
+    "ldp x23, x22, [x10], #0x10\n"
+    "ldr x21, [x10], #0x8\n"
+    "mov v27.16b, v23.16b\n"
+    "mov v28.16b, v23.16b\n"
+    "mov v29.16b, v23.16b\n"
+    "mov v30.16b, v23.16b\n"
+    "add x9, x9, x11\n"
+    "add x28, x28, x11\n"
+    "mov v31.16b, v23.16b\n"
+    "ssubl v0.8h, v0.8b, v5.8b\n"
+    "add x27, x27, x11\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "tbz %x[n_channels], #1, 10f\n"
+    "ldr h14, [x9], #0x2\n"
+    "ldr h15, [x28], #0x2\n"
+    "ldr h16, [x27], #0x2\n"
+    "ldr h17, [x26], #0x2\n"
+    "ldr h18, [x25], #0x2\n"
+    "ldr h19, [x24], #0x2\n"
+    "ldr h20, [x23], #0x2\n"
+    "ldr h21, [x22], #0x2\n"
+    "ldr h22, [x21], #0x2\n"
+    "tbz %x[n_channels], #0, 11f\n"
+    "ld1 { v14.b }[2], [x9], #0x1\n"
+    "ld1 { v15.b }[2], [x28], #0x1\n"
+    "ld1 { v16.b }[2], [x27], #0x1\n"
+    "ld1 { v17.b }[2], [x26], #0x1\n"
+    "ld1 { v18.b }[2], [x25], #0x1\n"
+    "ld1 { v19.b }[2], [x24], #0x1\n"
+    "ld1 { v20.b }[2], [x23], #0x1\n"
+    "ld1 { v21.b }[2], [x22], #0x1\n"
+    "ld1 { v22.b }[2], [x21], #0x1\n"
+    "b 11f\n"
+    "10:" // Oddments: Load: Bit 1: Unset
+    "ldr b14, [x9], #0x1\n"
+    "ldr b15, [x28], #0x1\n"
+    "ldr b16, [x27], #0x1\n"
+    "ldr b17, [x26], #0x1\n"
+    "ldr b18, [x25], #0x1\n"
+    "ldr b19, [x24], #0x1\n"
+    "ldr b20, [x23], #0x1\n"
+    "ldr b21, [x22], #0x1\n"
+    "ldr b22, [x21], #0x1\n"
+    "11:" // Oddments: Load: Bit 1: End
+    "subs x20, %x[n_points], #0x1\n"
+    "usubl v14.8h, v14.8b, v6.8b\n"
+    "usubl v15.8h, v15.8b, v6.8b\n"
+    "usubl v16.8h, v16.8b, v6.8b\n"
+    "usubl v17.8h, v17.8b, v6.8b\n"
+    "usubl v18.8h, v18.8b, v6.8b\n"
+    "usubl v19.8h, v19.8b, v6.8b\n"
+    "usubl v20.8h, v20.8b, v6.8b\n"
+    "usubl v21.8h, v21.8b, v6.8b\n"
+    "usubl v22.8h, v22.8b, v6.8b\n"
+    "ble 15f\n"
+    "12:" // Oddments: Planar loop
+    "ldp x9, x28, [x10], #0x10\n"
+    "ldp x27, x26, [x10], #0x10\n"
+    "smlal v23.4s, v14.4h, v0.4h\n"
+    "smlal v24.4s, v15.4h, v0.4h\n"
+    "ldp x25, x24, [x10], #0x10\n"
+    "ldp x23, x22, [x10], #0x10\n"
+    "smlal v25.4s, v16.4h, v0.4h\n"
+    "smlal v26.4s, v17.4h, v0.4h\n"
+    "smlal v27.4s, v18.4h, v0.4h\n"
+    "smlal v28.4s, v19.4h, v0.4h\n"
+    "ldr x21, [x10], #0x8\n"
+    "add x9, x9, x11\n"
+    "smlal v29.4s, v20.4h, v0.4h\n"
+    "smlal v30.4s, v21.4h, v0.4h\n"
+    "add x28, x28, x11\n"
+    "add x27, x27, x11\n"
+    "smlal v31.4s, v22.4h, v0.4h\n"
+    "ldr s0, [%x[params]], #0x4\n"
+    "ssubl v0.8h, v0.8b, v5.8b\n"
+    "add x26, x26, x11\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "add x21, x21, x11\n"
+    "tbz %x[n_channels], #1, 13f\n"
+    "ldr h14, [x9], #0x2\n"
+    "ldr h15, [x28], #0x2\n"
+    "ldr h16, [x27], #0x2\n"
+    "ldr h17, [x26], #0x2\n"
+    "ldr h18, [x25], #0x2\n"
+    "ldr h19, [x24], #0x2\n"
+    "ldr h20, [x23], #0x2\n"
+    "ldr h21, [x22], #0x2\n"
+    "ldr h22, [x21], #0x2\n"
+    "tbz %x[n_channels], #0, 14f\n"
+    "ld1 { v14.b }[2], [x9], #0x1\n"
+    "ld1 { v15.b }[2], [x28], #0x1\n"
+    "ld1 { v16.b }[2], [x27], #0x1\n"
+    "ld1 { v17.b }[2], [x26], #0x1\n"
+    "ld1 { v18.b }[2], [x25], #0x1\n"
+    "ld1 { v19.b }[2], [x24], #0x1\n"
+    "ld1 { v20.b }[2], [x23], #0x1\n"
+    "ld1 { v21.b }[2], [x22], #0x1\n"
+    "ld1 { v22.b }[2], [x21], #0x1\n"
+    "b 14f\n"
+    "13:" // Oddments: Planar loop: Load: Bit 1: Unset
+    "ldr b14, [x9], #0x1\n"
+    "ldr b15, [x28], #0x1\n"
+    "ldr b16, [x27], #0x1\n"
+    "ldr b17, [x26], #0x1\n"
+    "ldr b18, [x25], #0x1\n"
+    "ldr b19, [x24], #0x1\n"
+    "ldr b20, [x23], #0x1\n"
+    "ldr b21, [x22], #0x1\n"
+    "ldr b22, [x21], #0x1\n"
+    "14:" // Oddments: Planar loop: Load: Bit 1: End
+    "subs x20, x20, #0x1\n"
+    "usubl v14.8h, v14.8b, v6.8b\n"
+    "usubl v15.8h, v15.8b, v6.8b\n"
+    "usubl v16.8h, v16.8b, v6.8b\n"
+    "usubl v17.8h, v17.8b, v6.8b\n"
+    "usubl v18.8h, v18.8b, v6.8b\n"
+    "usubl v19.8h, v19.8b, v6.8b\n"
+    "usubl v20.8h, v20.8b, v6.8b\n"
+    "usubl v21.8h, v21.8b, v6.8b\n"
+    "usubl v22.8h, v22.8b, v6.8b\n"
+    "bgt 12b\n"
+    "15:" // Oddments: Planar tail
+    "smlal v23.4s, v14.4h, v0.4h\n"
+    "smlal v24.4s, v15.4h, v0.4h\n"
+    "smlal v25.4s, v16.4h, v0.4h\n"
+    "smlal v26.4s, v17.4h, v0.4h\n"
+    "smlal v27.4s, v18.4h, v0.4h\n"
+    "smlal v28.4s, v19.4h, v0.4h\n"
+    "smlal v29.4s, v20.4h, v0.4h\n"
+    "smlal v30.4s, v21.4h, v0.4h\n"
+    "smlal v31.4s, v22.4h, v0.4h\n"
+    "cbz %x[rq_mul_ptr], 21f\n"  // element-wise per-channel requant param loads for the tail
+    "add x22, %x[rq_mul_ptr], x11, LSL #2\n"
+    "add x21, %x[rq_right_shift_ptr], x11, LSL #2\n"
+    "add x20, %x[rq_left_shift_ptr], x11, LSL #2\n"
+    "tbz %x[n_channels], #1, 18f\n"
+    "ld1 { v2.d }[0], [x22], #0x8\n"
+    "ld1 { v1.d }[0], [x21], #0x8\n"
+    "cbz %x[rq_left_shift_ptr], 16f\n"
+    "ld1 { v3.d }[0], [x20], #0x8\n"
+    "16:" // Oddments: Load quantisation parameters: Bit 1: Load left shift: Done
+    "tbz %x[n_channels], #0, 20f\n"
+    "ld1 { v2.s }[2], [x22], #0x4\n"
+    "ld1 { v1.s }[2], [x21], #0x4\n"
+    "cbz %x[rq_left_shift_ptr], 17f\n"
+    "ld1 { v3.s }[2], [x20], #0x4\n"
+    "17:" // Oddments: Load quantisation parameters: Bit 1: Bit 0: Load left shift: Done
+    "b 20f\n"
+    "18:" // Oddments: Load quantisation parameters: Bit 1: Unset
+    "ld1 { v2.s }[0], [x22], #0x4\n"
+    "ld1 { v1.s }[0], [x21], #0x4\n"
+    "cbz %x[rq_left_shift_ptr], 19f\n"
+    "ld1 { v3.s }[0], [x20], #0x4\n"
+    "19:" // Oddments: Load quantisation parameters: Bit 1: Unset: Bit 0: Load left shift: Done
+    "20:" // Oddments: Load quantisation parameters: Bit 1: End
+    "21:" // Oddments: Load quantisation parameters: Done
+    "sshl v23.4s, v23.4s, v3.4s\n"  // same requantisation sequence as the main loop
+    "sshl v24.4s, v24.4s, v3.4s\n"
+    "ldp x28, x27, [%x[outptrs], #0x0]\n"
+    "ldp x26, x25, [%x[outptrs], #0x10]\n"
+    "sshl v25.4s, v25.4s, v3.4s\n"
+    "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+    "ldp x24, x23, [%x[outptrs], #0x20]\n"
+    "ldp x22, x21, [%x[outptrs], #0x30]\n"
+    "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+    "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+    "ldr x20, [%x[outptrs], #0x40]\n"
+    "add x28, x28, x11\n"
+    "and v18.16b, v23.16b, v1.16b\n"
+    "and v17.16b, v24.16b, v1.16b\n"
+    "add x27, x27, x11\n"
+    "add x26, x26, x11\n"
+    "and v16.16b, v25.16b, v1.16b\n"
+    "sshl v26.4s, v26.4s, v3.4s\n"
+    "add x25, x25, x11\n"
+    "add x24, x24, x11\n"
+    "sshl v27.4s, v27.4s, v3.4s\n"
+    "sshl v28.4s, v28.4s, v3.4s\n"
+    "add x23, x23, x11\n"
+    "add x22, x22, x11\n"
+    "sshl v29.4s, v29.4s, v3.4s\n"
+    "sshl v30.4s, v30.4s, v3.4s\n"
+    "add x21, x21, x11\n"
+    "add x20, x20, x11\n"
+    "sshl v31.4s, v31.4s, v3.4s\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+    "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+    "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+    "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+    "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+    "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+    "sqadd v23.4s, v23.4s, v18.4s\n"
+    "sqadd v24.4s, v24.4s, v17.4s\n"
+    "sqadd v25.4s, v25.4s, v16.4s\n"
+    "and v21.16b, v26.16b, v1.16b\n"
+    "and v20.16b, v27.16b, v1.16b\n"
+    "and v19.16b, v28.16b, v1.16b\n"
+    "and v18.16b, v29.16b, v1.16b\n"
+    "and v17.16b, v30.16b, v1.16b\n"
+    "and v16.16b, v31.16b, v1.16b\n"
+    "sshr v21.4s, v21.4s, #0x1f\n"
+    "sshr v20.4s, v20.4s, #0x1f\n"
+    "sshr v19.4s, v19.4s, #0x1f\n"
+    "sshr v18.4s, v18.4s, #0x1f\n"
+    "sshr v17.4s, v17.4s, #0x1f\n"
+    "sshr v16.4s, v16.4s, #0x1f\n"
+    "sqadd v26.4s, v26.4s, v21.4s\n"
+    "sqadd v27.4s, v27.4s, v20.4s\n"
+    "sqadd v28.4s, v28.4s, v19.4s\n"
+    "sqadd v29.4s, v29.4s, v18.4s\n"
+    "sqadd v30.4s, v30.4s, v17.4s\n"
+    "sqadd v31.4s, v31.4s, v16.4s\n"
+    "srshl v23.4s, v23.4s, v1.4s\n"
+    "srshl v24.4s, v24.4s, v1.4s\n"
+    "srshl v25.4s, v25.4s, v1.4s\n"
+    "srshl v26.4s, v26.4s, v1.4s\n"
+    "srshl v27.4s, v27.4s, v1.4s\n"
+    "srshl v28.4s, v28.4s, v1.4s\n"
+    "srshl v29.4s, v29.4s, v1.4s\n"
+    "srshl v30.4s, v30.4s, v1.4s\n"
+    "srshl v31.4s, v31.4s, v1.4s\n"
+    "add v23.4s, v23.4s, v4.4s\n"
+    "add v24.4s, v24.4s, v4.4s\n"
+    "add v25.4s, v25.4s, v4.4s\n"
+    "add v26.4s, v26.4s, v4.4s\n"
+    "add v27.4s, v27.4s, v4.4s\n"
+    "add v28.4s, v28.4s, v4.4s\n"
+    "add v29.4s, v29.4s, v4.4s\n"
+    "add v30.4s, v30.4s, v4.4s\n"
+    "add v31.4s, v31.4s, v4.4s\n"
+    "smax v23.4s, v23.4s, v8.4s\n"
+    "smax v24.4s, v24.4s, v8.4s\n"
+    "smax v25.4s, v25.4s, v8.4s\n"
+    "smax v26.4s, v26.4s, v8.4s\n"
+    "smax v27.4s, v27.4s, v8.4s\n"
+    "smax v28.4s, v28.4s, v8.4s\n"
+    "smax v29.4s, v29.4s, v8.4s\n"
+    "smax v30.4s, v30.4s, v8.4s\n"
+    "smax v31.4s, v31.4s, v8.4s\n"
+    "smin v23.4s, v23.4s, v7.4s\n"
+    "smin v24.4s, v24.4s, v7.4s\n"
+    "smin v25.4s, v25.4s, v7.4s\n"
+    "smin v26.4s, v26.4s, v7.4s\n"
+    "smin v27.4s, v27.4s, v7.4s\n"
+    "smin v28.4s, v28.4s, v7.4s\n"
+    "smin v29.4s, v29.4s, v7.4s\n"
+    "smin v30.4s, v30.4s, v7.4s\n"
+    "smin v31.4s, v31.4s, v7.4s\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "uzp1 v23.16b, v23.16b, v23.16b\n"
+    "uzp1 v24.16b, v24.16b, v24.16b\n"
+    "uzp1 v25.16b, v25.16b, v25.16b\n"
+    "uzp1 v26.16b, v26.16b, v26.16b\n"
+    "uzp1 v27.16b, v27.16b, v27.16b\n"
+    "uzp1 v28.16b, v28.16b, v28.16b\n"
+    "uzp1 v29.16b, v29.16b, v29.16b\n"
+    "uzp1 v30.16b, v30.16b, v30.16b\n"
+    "uzp1 v31.16b, v31.16b, v31.16b\n"
+    "tbz %x[n_channels], #1, 22f\n"  // store the 1-3 remaining bytes per output
+    "st1 { v23.h }[0], [x28], #0x2\n"
+    "st1 { v24.h }[0], [x27], #0x2\n"
+    "st1 { v25.h }[0], [x26], #0x2\n"
+    "st1 { v26.h }[0], [x25], #0x2\n"
+    "st1 { v27.h }[0], [x24], #0x2\n"
+    "st1 { v28.h }[0], [x23], #0x2\n"
+    "st1 { v29.h }[0], [x22], #0x2\n"
+    "st1 { v30.h }[0], [x21], #0x2\n"
+    "st1 { v31.h }[0], [x20], #0x2\n"
+    "tbz %x[n_channels], #0, 23f\n"
+    "st1 { v23.b }[2], [x28], #0x1\n"
+    "st1 { v24.b }[2], [x27], #0x1\n"
+    "st1 { v25.b }[2], [x26], #0x1\n"
+    "st1 { v26.b }[2], [x25], #0x1\n"
+    "st1 { v27.b }[2], [x24], #0x1\n"
+    "st1 { v28.b }[2], [x23], #0x1\n"
+    "st1 { v29.b }[2], [x22], #0x1\n"
+    "st1 { v30.b }[2], [x21], #0x1\n"
+    "st1 { v31.b }[2], [x20], #0x1\n"
+    "b 23f\n"
+    "22:" // Oddments: Store: Bit 1: Unset
+    "st1 { v23.b }[0], [x28], #0x1\n"
+    "st1 { v24.b }[0], [x27], #0x1\n"
+    "st1 { v25.b }[0], [x26], #0x1\n"
+    "st1 { v26.b }[0], [x25], #0x1\n"
+    "st1 { v27.b }[0], [x24], #0x1\n"
+    "st1 { v28.b }[0], [x23], #0x1\n"
+    "st1 { v29.b }[0], [x22], #0x1\n"
+    "st1 { v30.b }[0], [x21], #0x1\n"
+    "st1 { v31.b }[0], [x20], #0x1\n"
+    "23:" // Oddments: Store: Bit 1: End
+    "24:" // End
+    : [params] "+&r" (params)
+    : [bias] "r" (qp.bias), [inptrs] "r" (inptrs), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (qp.per_channel_left_shifts), [rq_mul_ptr] "r" (qp.per_channel_muls), [rq_right_shift_ptr] "r" (qp.per_channel_right_shifts)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..b7ba363b43
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
+
+struct a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+ a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..ed99f1f642
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,1480 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const int8_t *weights,
+ const int32_t *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const int32_t *per_channel_left_shifts,
+ const int32_t *per_channel_muls,
+ const int32_t *per_channel_right_shifts,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "lsr x10, %x[n_output_channels], #0x2\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_minval]\n"
+ "ld1r { v15.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v14.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_a_offset]\n"
+ "ld1r { v13.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_b_offset]\n"
+ "ld1r { v12.16b }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_c_offset]\n"
+ "ld1r { v11.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_left_shift]\n"
+ "ld1r { v10.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_mul]\n"
+ "ld1r { v9.4s }, [x20]\n"
+ "add x20, %x[qp], %[offsetof_Requantize32_per_layer_right_shift]\n"
+ "ld1r { v8.4s }, [x20]\n"
+ "mov x9, #0x0\n"
+ "cbz x10, 9f\n"
+ "1:" // Output channel loop
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q31, [%x[bias], x20]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 3f\n"
+ "lsl x20, x9, #0x2\n"
+ "ldr q9, [%x[rq_mul_ptr], x20]\n"
+ "ldr q8, [%x[rq_right_shift_ptr], x20]\n"
+ "cbz %x[rq_left_shift_ptr], 3f\n"
+ "ldr q10, [%x[rq_left_shift_ptr], x20]\n"
+ "3:" // Output channel loop: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 7f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "beq 5f\n"
+ "4:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 4b\n"
+ "5:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 6f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "6:" // Output channel loop: Odd tail
+ "ldp x20, x28, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldr d4, [x28, #0x0]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "b 8f\n"
+ "7:" // Output channel loop: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "str s16, [x27, x9]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str s17, [x26, x9]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "str s18, [x25, x9]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str s19, [x24, x9]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s20, [x23, x9]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s21, [x22, x9]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s22, [x21, x9]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s23, [x20, x9]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "str s24, [x27, x9]\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "str s25, [x26, x9]\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "str s26, [x25, x9]\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "str s27, [x24, x9]\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "str s28, [x23, x9]\n"
+ "str s29, [x22, x9]\n"
+ "str s30, [x21, x9]\n"
+ "str s31, [x20, x9]\n"
+ "8:" // Output channel loop: Done
+ "add x9, x9, #0x4\n"
+ "cmp x9, x10, LSL #2\n"
+ "blt 1b\n"
+ "tst %x[n_output_channels], #0x3\n"
+ "beq 26f\n"
+ "9:" // Output channel oddments
+ "movi v31.4s, #0x0\n"
+ "cbz %x[bias], 12f\n"
+ "add x20, %x[bias], x9, LSL #2\n"
+ "tbz %x[n_output_channels], #1, 10f\n"
+ "ld1 { v31.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 11f\n"
+ "ld1 { v31.s }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // Output channel oddments: Load bias: Bit 1: Unset
+ "ld1 { v31.s }[0], [x20]\n"
+ "11:" // Output channel oddments: Load bias: Bit 1: End
+ "12:" // Output channel oddments: Load bias: Done
+ "mov v16.16b, v31.16b\n"
+ "mov v17.16b, v31.16b\n"
+ "mov v18.16b, v31.16b\n"
+ "mov v19.16b, v31.16b\n"
+ "mov v20.16b, v31.16b\n"
+ "mov v21.16b, v31.16b\n"
+ "mov v22.16b, v31.16b\n"
+ "mov v23.16b, v31.16b\n"
+ "mov v24.16b, v31.16b\n"
+ "mov v25.16b, v31.16b\n"
+ "mov v26.16b, v31.16b\n"
+ "mov v27.16b, v31.16b\n"
+ "mov v28.16b, v31.16b\n"
+ "mov v29.16b, v31.16b\n"
+ "mov v30.16b, v31.16b\n"
+ "mov v31.16b, v31.16b\n"
+ "cbz %x[rq_mul_ptr], 18f\n"
+ "add x22, %x[rq_mul_ptr], x9, LSL #2\n"
+ "add x21, %x[rq_right_shift_ptr], x9, LSL #2\n"
+ "add x20, %x[rq_left_shift_ptr], x9, LSL #2\n"
+ "cbz %x[rq_left_shift_ptr], 15f\n"
+ "tbz %x[n_output_channels], #1, 13f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "ld1 { v10.d }[0], [x20], #0x8\n"
+ "tbz %x[n_output_channels], #0, 14f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "ld1 { v10.s }[2], [x20], #0x4\n"
+ "b 14f\n"
+ "13:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "ld1 { v10.s }[0], [x20], #0x4\n"
+ "14:" // Output channel oddments: Load quantization parameters: With left shift: Bit 1: End
+ "b 18f\n"
+ "15:" // Output channel oddments: Load quantization parameters: No left shift
+ "tbz %x[n_output_channels], #1, 16f\n"
+ "ld1 { v9.d }[0], [x22], #0x8\n"
+ "ld1 { v8.d }[0], [x21], #0x8\n"
+ "tbz %x[n_output_channels], #0, 17f\n"
+ "ld1 { v9.s }[2], [x22], #0x4\n"
+ "ld1 { v8.s }[2], [x21], #0x4\n"
+ "b 17f\n"
+ "16:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: Unset
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v8.s }[0], [x21], #0x4\n"
+ "17:" // Output channel oddments: Load quantization parameters: No left shift: Bit 1: End
+ "18:" // Output channel oddments: Load quantization parameters: Done
+ "ldr s5, [%x[weights]], #0x4\n"
+ "mov x22, %x[inptrs]\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "lsr x23, %x[kernel_points], #0x1\n"
+ "ldr d0, [x21, #0x0]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "cbz x23, 22f\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "subs x23, x23, #0x1\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d3, [x21, #0x0]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "beq 20f\n"
+ "19:" // Output channel oddments: Kernel loop
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "subs x23, x23, #0x1\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d0, [x21, #0x0]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d4, [x20, #0x0]\n"
+ "ldr s5, [%x[weights]], #0x4\n"
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "ldr d3, [x21, #0x0]\n"
+ "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "ldr d6, [x20, #0x0]\n"
+ "ldr s7, [%x[weights]], #0x4\n"
+ "usubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "bgt 19b\n"
+ "20:" // Output channel oddments: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 21f\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "b 23f\n"
+ "21:" // Output channel oddments: Odd tail
+ "ldp x21, x20, [x22], #0x10\n"
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "ldr d2, [x21, #0x0]\n"
+ "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "ldr d1, [x20, #0x0]\n"
+ "ldr s0, [%x[weights]], #0x4\n"
+ "smlal v16.4s, v7.4h, v3.h[0]\n"
+ "smlal v17.4s, v7.4h, v3.h[1]\n"
+ "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal v18.4s, v7.4h, v3.h[2]\n"
+ "smlal v19.4s, v7.4h, v3.h[3]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "smlal v20.4s, v7.4h, v3.h[4]\n"
+ "smlal v21.4s, v7.4h, v3.h[5]\n"
+ "smlal v22.4s, v7.4h, v3.h[6]\n"
+ "smlal v23.4s, v7.4h, v3.h[7]\n"
+ "smlal v24.4s, v7.4h, v6.h[0]\n"
+ "smlal v25.4s, v7.4h, v6.h[1]\n"
+ "smlal v26.4s, v7.4h, v6.h[2]\n"
+ "smlal v27.4s, v7.4h, v6.h[3]\n"
+ "smlal v28.4s, v7.4h, v6.h[4]\n"
+ "smlal v29.4s, v7.4h, v6.h[5]\n"
+ "smlal v30.4s, v7.4h, v6.h[6]\n"
+ "smlal v31.4s, v7.4h, v6.h[7]\n"
+ "smlal v16.4s, v0.4h, v2.h[0]\n"
+ "smlal v17.4s, v0.4h, v2.h[1]\n"
+ "smlal v18.4s, v0.4h, v2.h[2]\n"
+ "smlal v19.4s, v0.4h, v2.h[3]\n"
+ "smlal v20.4s, v0.4h, v2.h[4]\n"
+ "smlal v21.4s, v0.4h, v2.h[5]\n"
+ "smlal v22.4s, v0.4h, v2.h[6]\n"
+ "smlal v23.4s, v0.4h, v2.h[7]\n"
+ "smlal v24.4s, v0.4h, v1.h[0]\n"
+ "smlal v25.4s, v0.4h, v1.h[1]\n"
+ "smlal v26.4s, v0.4h, v1.h[2]\n"
+ "smlal v27.4s, v0.4h, v1.h[3]\n"
+ "smlal v28.4s, v0.4h, v1.h[4]\n"
+ "smlal v29.4s, v0.4h, v1.h[5]\n"
+ "smlal v30.4s, v0.4h, v1.h[6]\n"
+ "smlal v31.4s, v0.4h, v1.h[7]\n"
+ "b 23f\n"
+ "22:" // Output channel oddments: Single kernel point
+ "smlal v16.4s, v5.4h, v0.h[0]\n"
+ "smlal v17.4s, v5.4h, v0.h[1]\n"
+ "smlal v18.4s, v5.4h, v0.h[2]\n"
+ "smlal v19.4s, v5.4h, v0.h[3]\n"
+ "smlal v20.4s, v5.4h, v0.h[4]\n"
+ "smlal v21.4s, v5.4h, v0.h[5]\n"
+ "smlal v22.4s, v5.4h, v0.h[6]\n"
+ "smlal v23.4s, v5.4h, v0.h[7]\n"
+ "smlal v24.4s, v5.4h, v4.h[0]\n"
+ "smlal v25.4s, v5.4h, v4.h[1]\n"
+ "smlal v26.4s, v5.4h, v4.h[2]\n"
+ "smlal v27.4s, v5.4h, v4.h[3]\n"
+ "smlal v28.4s, v5.4h, v4.h[4]\n"
+ "smlal v29.4s, v5.4h, v4.h[5]\n"
+ "smlal v30.4s, v5.4h, v4.h[6]\n"
+ "smlal v31.4s, v5.4h, v4.h[7]\n"
+ "23:" // Output channel oddments: Done
+ "sshl v16.4s, v16.4s, v10.4s\n"
+ "sshl v17.4s, v17.4s, v10.4s\n"
+ "sshl v18.4s, v18.4s, v10.4s\n"
+ "sshl v19.4s, v19.4s, v10.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v9.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v9.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v9.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v9.4s\n"
+ "and v3.16b, v16.16b, v8.16b\n"
+ "and v2.16b, v17.16b, v8.16b\n"
+ "and v1.16b, v18.16b, v8.16b\n"
+ "and v0.16b, v19.16b, v8.16b\n"
+ "sshl v20.4s, v20.4s, v10.4s\n"
+ "sshl v21.4s, v21.4s, v10.4s\n"
+ "sshl v22.4s, v22.4s, v10.4s\n"
+ "sshl v23.4s, v23.4s, v10.4s\n"
+ "sshl v24.4s, v24.4s, v10.4s\n"
+ "sshl v25.4s, v25.4s, v10.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v20.4s, v20.4s, v9.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v9.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v9.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v9.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v9.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v9.4s\n"
+ "sqadd v16.4s, v16.4s, v3.4s\n"
+ "sqadd v17.4s, v17.4s, v2.4s\n"
+ "sqadd v18.4s, v18.4s, v1.4s\n"
+ "sqadd v19.4s, v19.4s, v0.4s\n"
+ "and v5.16b, v20.16b, v8.16b\n"
+ "and v4.16b, v21.16b, v8.16b\n"
+ "and v3.16b, v22.16b, v8.16b\n"
+ "and v2.16b, v23.16b, v8.16b\n"
+ "and v1.16b, v24.16b, v8.16b\n"
+ "and v0.16b, v25.16b, v8.16b\n"
+ "sshl v26.4s, v26.4s, v10.4s\n"
+ "sshl v27.4s, v27.4s, v10.4s\n"
+ "sshl v28.4s, v28.4s, v10.4s\n"
+ "sshl v29.4s, v29.4s, v10.4s\n"
+ "sshl v30.4s, v30.4s, v10.4s\n"
+ "sshl v31.4s, v31.4s, v10.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqrdmulh v26.4s, v26.4s, v9.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v9.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v9.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v9.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v9.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v9.4s\n"
+ "sqadd v20.4s, v20.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v4.4s\n"
+ "sqadd v22.4s, v22.4s, v3.4s\n"
+ "sqadd v23.4s, v23.4s, v2.4s\n"
+ "sqadd v24.4s, v24.4s, v1.4s\n"
+ "sqadd v25.4s, v25.4s, v0.4s\n"
+ "and v5.16b, v26.16b, v8.16b\n"
+ "and v4.16b, v27.16b, v8.16b\n"
+ "and v3.16b, v28.16b, v8.16b\n"
+ "and v2.16b, v29.16b, v8.16b\n"
+ "and v1.16b, v30.16b, v8.16b\n"
+ "and v0.16b, v31.16b, v8.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v5.4s\n"
+ "sqadd v27.4s, v27.4s, v4.4s\n"
+ "sqadd v28.4s, v28.4s, v3.4s\n"
+ "sqadd v29.4s, v29.4s, v2.4s\n"
+ "sqadd v30.4s, v30.4s, v1.4s\n"
+ "sqadd v31.4s, v31.4s, v0.4s\n"
+ "srshl v16.4s, v16.4s, v8.4s\n"
+ "srshl v17.4s, v17.4s, v8.4s\n"
+ "srshl v18.4s, v18.4s, v8.4s\n"
+ "srshl v19.4s, v19.4s, v8.4s\n"
+ "srshl v20.4s, v20.4s, v8.4s\n"
+ "srshl v21.4s, v21.4s, v8.4s\n"
+ "srshl v22.4s, v22.4s, v8.4s\n"
+ "srshl v23.4s, v23.4s, v8.4s\n"
+ "srshl v24.4s, v24.4s, v8.4s\n"
+ "srshl v25.4s, v25.4s, v8.4s\n"
+ "srshl v26.4s, v26.4s, v8.4s\n"
+ "srshl v27.4s, v27.4s, v8.4s\n"
+ "srshl v28.4s, v28.4s, v8.4s\n"
+ "srshl v29.4s, v29.4s, v8.4s\n"
+ "srshl v30.4s, v30.4s, v8.4s\n"
+ "srshl v31.4s, v31.4s, v8.4s\n"
+ "add v16.4s, v16.4s, v11.4s\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "add v20.4s, v20.4s, v11.4s\n"
+ "add v21.4s, v21.4s, v11.4s\n"
+ "add v22.4s, v22.4s, v11.4s\n"
+ "add v23.4s, v23.4s, v11.4s\n"
+ "add v24.4s, v24.4s, v11.4s\n"
+ "add v25.4s, v25.4s, v11.4s\n"
+ "add v26.4s, v26.4s, v11.4s\n"
+ "add v27.4s, v27.4s, v11.4s\n"
+ "add v28.4s, v28.4s, v11.4s\n"
+ "add v29.4s, v29.4s, v11.4s\n"
+ "add v30.4s, v30.4s, v11.4s\n"
+ "add v31.4s, v31.4s, v11.4s\n"
+ "smin v16.4s, v16.4s, v14.4s\n"
+ "smin v17.4s, v17.4s, v14.4s\n"
+ "smin v18.4s, v18.4s, v14.4s\n"
+ "smin v19.4s, v19.4s, v14.4s\n"
+ "smin v20.4s, v20.4s, v14.4s\n"
+ "smin v21.4s, v21.4s, v14.4s\n"
+ "smin v22.4s, v22.4s, v14.4s\n"
+ "smin v23.4s, v23.4s, v14.4s\n"
+ "smin v24.4s, v24.4s, v14.4s\n"
+ "smin v25.4s, v25.4s, v14.4s\n"
+ "smin v26.4s, v26.4s, v14.4s\n"
+ "smin v27.4s, v27.4s, v14.4s\n"
+ "smin v28.4s, v28.4s, v14.4s\n"
+ "smin v29.4s, v29.4s, v14.4s\n"
+ "smin v30.4s, v30.4s, v14.4s\n"
+ "smin v31.4s, v31.4s, v14.4s\n"
+ "smax v16.4s, v16.4s, v15.4s\n"
+ "smax v17.4s, v17.4s, v15.4s\n"
+ "smax v18.4s, v18.4s, v15.4s\n"
+ "smax v19.4s, v19.4s, v15.4s\n"
+ "smax v20.4s, v20.4s, v15.4s\n"
+ "smax v21.4s, v21.4s, v15.4s\n"
+ "smax v22.4s, v22.4s, v15.4s\n"
+ "smax v23.4s, v23.4s, v15.4s\n"
+ "smax v24.4s, v24.4s, v15.4s\n"
+ "smax v25.4s, v25.4s, v15.4s\n"
+ "smax v26.4s, v26.4s, v15.4s\n"
+ "smax v27.4s, v27.4s, v15.4s\n"
+ "smax v28.4s, v28.4s, v15.4s\n"
+ "smax v29.4s, v29.4s, v15.4s\n"
+ "smax v30.4s, v30.4s, v15.4s\n"
+ "smax v31.4s, v31.4s, v15.4s\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v20.16b, v20.16b, v20.16b\n"
+ "uzp1 v21.16b, v21.16b, v21.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "uzp1 v25.16b, v25.16b, v25.16b\n"
+ "uzp1 v26.16b, v26.16b, v26.16b\n"
+ "uzp1 v27.16b, v27.16b, v27.16b\n"
+ "uzp1 v28.16b, v28.16b, v28.16b\n"
+ "uzp1 v29.16b, v29.16b, v29.16b\n"
+ "uzp1 v30.16b, v30.16b, v30.16b\n"
+ "uzp1 v31.16b, v31.16b, v31.16b\n"
+ "tbz %x[n_output_channels], #1, 24f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.h }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.h }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.h }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.h }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.h }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.h }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.h }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "add x9, x9, #0x2\n"
+ "st1 { v24.h }[0], [x27]\n"
+ "st1 { v25.h }[0], [x26]\n"
+ "st1 { v26.h }[0], [x25]\n"
+ "st1 { v27.h }[0], [x24]\n"
+ "st1 { v28.h }[0], [x23]\n"
+ "st1 { v29.h }[0], [x22]\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "st1 { v31.h }[0], [x20]\n"
+ "tbz %x[n_output_channels], #0, 25f\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[2], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[2], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[2], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[2], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[2], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[2], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[2], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[2], [x27]\n"
+ "st1 { v25.b }[2], [x26]\n"
+ "st1 { v26.b }[2], [x25]\n"
+ "st1 { v27.b }[2], [x24]\n"
+ "st1 { v28.b }[2], [x23]\n"
+ "st1 { v29.b }[2], [x22]\n"
+ "st1 { v30.b }[2], [x21]\n"
+ "st1 { v31.b }[2], [x20]\n"
+ "b 25f\n"
+ "24:" // Output channel oddments: Done: Store: Bit 1: Unset
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "add x27, x27, x9\n"
+ "add x26, x26, x9\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "add x25, x25, x9\n"
+ "add x24, x24, x9\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "add x23, x23, x9\n"
+ "add x22, x22, x9\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "add x21, x21, x9\n"
+ "add x20, x20, x9\n"
+ "st1 { v16.b }[0], [x27]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "add x27, x27, x9\n"
+ "st1 { v17.b }[0], [x26]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "add x26, x26, x9\n"
+ "st1 { v18.b }[0], [x25]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "add x25, x25, x9\n"
+ "st1 { v19.b }[0], [x24]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "add x24, x24, x9\n"
+ "st1 { v20.b }[0], [x23]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "add x23, x23, x9\n"
+ "st1 { v21.b }[0], [x22]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "add x22, x22, x9\n"
+ "st1 { v22.b }[0], [x21]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "add x21, x21, x9\n"
+ "st1 { v23.b }[0], [x20]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "add x20, x20, x9\n"
+ "st1 { v24.b }[0], [x27]\n"
+ "st1 { v25.b }[0], [x26]\n"
+ "st1 { v26.b }[0], [x25]\n"
+ "st1 { v27.b }[0], [x24]\n"
+ "st1 { v28.b }[0], [x23]\n"
+ "st1 { v29.b }[0], [x22]\n"
+ "st1 { v30.b }[0], [x21]\n"
+ "st1 { v31.b }[0], [x20]\n"
+ "25:" // Output channel oddments: Done: Store: Bit 1: End
+ "26:" // Done
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [n_output_channels] "r" ((uint64_t) n_output_channels), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_layer_left_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_left_shift)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [outptrs] "r" (outptrs), [qp] "r" (&qp), [rq_left_shift_ptr] "r" (per_channel_left_shifts), [rq_mul_ptr] "r" (per_channel_muls), [rq_right_shift_ptr] "r" (per_channel_right_shifts)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..2b6f70c089
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>  // Strategy descriptor: fp16 NHWC depthwise conv, 3x3 kernel, stride 1, 2x2 output tile, SME2 implementation.
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;  // input, weight, output and accumulator types are all __fp16
+  Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;  // pointer-array (gather) variant declared above
+  Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;  // strided-tile variant declared above
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // tells the framework this kernel is SME vector-length agnostic
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}  // CPUInfo unused; geometry forwarded to the base strategy
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }  // selected when input rows are supplied via pointer arrays
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }  // selected when input is a dense strided tensor
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..2d558ade3f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (tiled) kernel: 3x3 stride-1 fp16 NHWC depthwise convolution
+// producing a 2x2 output tile per tile-loop iteration, implemented in
+// SME2 streaming-SVE inline assembly.  Iterates over the
+// n_tile_rows x n_tile_cols grid of output tiles; within each tile it
+// loops over channels one vector-length at a time, accumulating with
+// predicated fmla and clamping results to [activation_min, activation_max]
+// via fclamp before storing.
+void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ // Argument block handed to the assembly through a single pointer; the
+ // asm reads each member via the offsetof() immediates bound to the
+ // %[offsetof_args_*] operands below, so this layout must stay in sync
+ // with that operand list.
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0; // current tile position; stored/reloaded by the asm
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ // NOTE(review): taken as float but stored into __fp16 members; the
+ // sibling indirect kernel's Args ctor takes __fp16 directly —
+ // behaviourally equivalent here, but confirm the inconsistency is
+ // intended by the generator.
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ // Enter streaming-SVE mode and initialise the tile indices
+ // (x4 = tile_i, x5 = tile_j) and predicates.
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x4, #0x0\n"
+ "mov x5, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "1:" // Tile loop
+ // Persist tile indices, then derive the four input-row base
+ // pointers (x7/x16/x14/x13) for this tile from the strides.
+ "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x2\n"
+ "str x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x4, x21\n" // offset = tile_i * ld_input_row
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x5, x6, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x17, x6, x6\n"
+ "add x7, x7, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x16, x7, x21, LSL #1\n"
+ "add x15, x17, x6\n"
+ "add x14, x16, x21, LSL #1\n"
+ "add x13, x14, x21, LSL #1\n"
+ // Prefetch the input rows, but only for the first tile of each
+ // row of tiles (tile_j == 0); later tiles reuse warmed cache.
+ "cbnz x5, 2f\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "lsl x12, %x[n_channels], #0x1\n"
+ "mov x21, #0x4\n"
+ "mul x21, x21, x6\n"
+ "add x11, x16, x6, LSL #1\n"
+ "add x10, x7, x15, LSL #1\n"
+ "add x9, x16, x17, LSL #1\n"
+ "sub x20, x24, x5\n"
+ "add x28, x14, x6, LSL #1\n"
+ "sub x20, x20, #0x1\n"
+ "add x27, x13, x15, LSL #1\n"
+ "and x20, x20, #0x3fffff\n"
+ "add x26, x7, x6, LSL #1\n"
+ "orr x12, x12, x20, LSL #22\n"
+ "add x25, x7, x17, LSL #1\n"
+ "orr x12, x12, x21, LSL #38\n"
+ "add x24, x14, x17, LSL #1\n"
+ "add x23, x16, x15, LSL #1\n"
+ "add x22, x14, x15, LSL #1\n"
+ "add x21, x13, x6, LSL #1\n"
+ "add x20, x13, x17, LSL #1\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ // Load bias (z18), the nine weights (z0-z8), the clamp bounds
+ // (z17/z16), compute the two output-row pointers (x23/x22), and
+ // preload the first batch of input vectors (z9-z13).
+ "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x20, #0x2\n"
+ "ld1h { z18.h }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x24\n"
+ ".inst 0xa040a100 // ld1h { z0.h-z3.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ ".inst 0xa040a104 // ld1h { z4.h-z7.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "mul x22, x4, x26\n" // offset = tile_i * ld_output_row
+ "cmp x24, %x[n_channels]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x22, x5, x25, x22\n" // offset += tile_j * ld_output_col
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mov x21, #0x0\n"
+ "mul x22, x22, x20\n" // offset *= output_tile_size
+ "sub x20, XZR, x24\n"
+ "ld1h { z8.h }, p3/Z, [x8]\n"
+ "add x23, x23, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z9.h }, p2/Z, [x16, x6, LSL #1]\n"
+ "addvl x8, x8, #1\n"
+ "add x22, x23, x26, LSL #1\n"
+ "ld1h { z10.h }, p2/Z, [x7]\n"
+ "ld1h { z11.h }, p2/Z, [x7, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x14, x6, LSL #1]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ // Main channel loop: accumulate the four output vectors
+ // (z28-z31, one per output pixel of the 2x2 tile) while
+ // preloading inputs and weights for the next vector of channels,
+ // then clamp and store.
+ "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "whilelt p1.h, x24, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "inch x24\n"
+ "ld1h { z18.h }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "mov p0.b, p2.b\n"
+ "inch x20\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x6, LSL #1]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x7, x17, LSL #1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x14]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "ld1h { z13.h }, p1/Z, [x14, x6, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+ ".inst 0xa040a100 // ld1h { z0.h-z3.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "cmp x24, %x[n_channels]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "addvl x13, x13, #1\n"
+ "ld1h { z11.h }, p1/Z, [x7, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x16, x6, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x7]\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ ".inst 0xa040a104 // ld1h { z4.h-z7.h }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ld1h { z12.h }, p1/Z, [x16, x17, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ // Channel tail: same MLA schedule as the loop body, without
+ // preloading for a next iteration; also advances (tile_i, tile_j)
+ // for the next trip around the outer tile loop.
+ "movprfx z28, z18\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x5, x5, #0x1\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "add x20, x4, #0x1\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "cmp x5, x24\n"
+ "csel x4, x4, x20, LT\n"
+ "csel x5, x5, XZR, LT\n"
+ "cmp x4, x21\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x7, x6, LSL #1]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x7, x17, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x14]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x23]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z30.h }, p0, [x22]\n"
+ "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+ "blt 1b\n"
+ // Exit streaming mode before returning to C++.
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..415e344832
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Indirect (pointer-array) kernel: same 3x3 stride-1 fp16 depthwise
+// computation as the direct variant, but inputs come via an array of
+// 16 per-pixel pointers (so the caller can substitute a padding buffer
+// for out-of-bounds pixels) and the four outputs of the 2x2 tile are
+// written through an array of output pointers.
+void sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ // Argument block read by the assembly via offsetof() immediates; the
+ // member layout must match the %[offsetof_*] operands below.
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ // Permute the caller's row-major 4x4 input-pixel pointers into
+ // the order the assembly consumes them in (presumably chosen by
+ // the kernel generator to match its load schedule).
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ // Prologue: enter streaming mode, load the four output pointers
+ // (x11/x10/x28/x27), the bias (z16), weights (z0-z8), clamp
+ // bounds (z18/z17), and the first batch of input vectors.
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldr x13, [x16, #0x20]\n"
+ "cnth x12\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x11, x10, [x20, #0x0]\n"
+ "cmp x12, %x[n_channels]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x9, XZR, x12\n"
+ "ldp x28, x27, [x20, #0x10]\n"
+ "ld1h { z16.h }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ ".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ld1h { z8.h }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ // Main channel loop: accumulate the four output vectors
+ // (z28-z31) via the indirect input pointers while preloading
+ // the next vector of channels, then clamp and store.
+ "movprfx z28, z16\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z16\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x16, #0x28]\n"
+ "whilelt p1.h, x12, %x[n_channels]\n"
+ "movprfx z30, z16\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [x16, #0x30]\n"
+ "ld1h { z16.h }, p3/Z, [x14]\n"
+ "ldr x20, [x16, #0x38]\n"
+ "addvl x14, x14, #1\n"
+ "inch x9\n"
+ "ld1h { z9.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x48]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x60]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ldr x22, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x20]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z13.h }, p1/Z, [x13, x12, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "inch x15\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p1/Z, [x24, x12, LSL #1]\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x26, x12, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p1/Z, [x25, x12, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z12.h }, p1/Z, [x23, x12, LSL #1]\n"
+ "inch x12\n"
+ ".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "cmp x12, %x[n_channels]\n"
+ "ld1h { z8.h }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
+ "st1h { z28.h }, p0, [x11, x9, LSL #1]\n"
+ "st1h { z29.h }, p0, [x10, x9, LSL #1]\n"
+ "st1h { z30.h }, p0, [x28, x9, LSL #1]\n"
+ "st1h { z31.h }, p0, [x27, x9, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ // Channel tail: same MLA schedule for the final (possibly
+ // partial) vector of channels, with no next-iteration preloads.
+ "movprfx z28, z16\n fmla z28.h, p3/M, z4.h, z9.h\n"
+ "movprfx z29, z16\n fmla z29.h, p3/M, z3.h, z9.h\n"
+ "ldr x22, [x16, #0x28]\n"
+ "inch x9\n"
+ "movprfx z30, z16\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "movprfx z31, z16\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [x16, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "ldr x20, [x16, #0x38]\n"
+ "ld1h { z9.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x48]\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "fmla z29.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla z28.h, p3/M, z5.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "ldr x13, [x16, #0x60]\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z13.h\n"
+ "ldr x22, [x16, #0x68]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z13.h\n"
+ "ldr x20, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z4.h, z13.h\n"
+ "fmla z31.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z1.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z9.h\n"
+ "fmla z29.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z2.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z10.h\n"
+ "fmla z29.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x22, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x21, x15, LSL #1]\n"
+ "fmla z29.h, p3/M, z5.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z11.h\n"
+ "fmla z31.h, p3/M, z6.h, z11.h\n"
+ "fmla z28.h, p3/M, z6.h, z9.h\n"
+ "fmla z29.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z12.h\n"
+ ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
+ "st1h { z28.h }, p0, [x11, x9, LSL #1]\n"
+ "st1h { z29.h }, p0, [x10, x9, LSL #1]\n"
+ "st1h { z30.h }, p0, [x28, x9, LSL #1]\n"
+ "st1h { z31.h }, p0, [x27, x9, LSL #1]\n"
+ // Leave streaming mode before returning to C++.
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f90fbc3906
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor for the SME2 fp16 NHWC 3x3/stride-1 depthwise
+// kernels with a 3x3 output tile; mirrors the 2x2-tile strategy above
+// but with larger tile geometry.
+class sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ // Function pointers to the assembly implementations exposed via the
+ // get_*_kernel() overrides.
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ // Static kernel geometry: 3x3 window, stride 1, 3x3 outputs per tile.
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ // CPUInfo is unused; geometry is fixed at compile time.
+ sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..3a7d1cb0b4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x3\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "add x7, x4, x4\n"
+ "add x5, x5, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x8, x5, x21, LSL #1\n"
+ "add x17, x7, x4\n"
+ "add x16, x8, x21, LSL #1\n"
+ "add x15, x17, x4\n"
+ "add x14, x16, x21, LSL #1\n"
+ "add x13, x14, x21, LSL #1\n"
+ "cbnz x3, 2f\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "lsl x12, %x[n_channels], #0x1\n"
+ "mov x28, #0x6\n"
+ "mul x28, x28, x4\n"
+ "add x27, x16, x7, LSL #1\n"
+ "add x26, x5, x15, LSL #1\n"
+ "add x25, x8, x7, LSL #1\n"
+ "sub x20, x9, x3\n"
+ "add x24, x13, x15, LSL #1\n"
+ "sub x20, x20, #0x1\n"
+ "add x23, x16, x4, LSL #1\n"
+ "and x20, x20, #0x3fffff\n"
+ "add x22, x5, x4, LSL #1\n"
+ "orr x12, x12, x20, LSL #22\n"
+ "add x21, x5, x17, LSL #1\n"
+ "orr x12, x12, x28, LSL #38\n"
+ "add x20, x16, x17, LSL #1\n"
+ "add x11, x8, x15, LSL #1\n"
+ "add x10, x14, x7, LSL #1\n"
+ "add x9, x14, x15, LSL #1\n"
+ "add x28, x13, x4, LSL #1\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ "add x27, x8, x4, LSL #1\n"
+ ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ "add x26, x8, x17, LSL #1\n"
+ ".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ "add x25, x13, x17, LSL #1\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ "add x24, x14, x4, LSL #1\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ "add x23, x5, x7, LSL #1\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ "add x22, x14, x17, LSL #1\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ "add x21, x16, x15, LSL #1\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "add x20, x13, x7, LSL #1\n"
+ ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mov x21, #0x3\n"
+ "ld1h { z18.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cnth x26\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "ldr x25, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "mul x20, x2, x22\n" // offset = tile_i * ld_output_row
+ "cmp x26, %x[n_channels]\n"
+ "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "madd x20, x3, x27, x20\n" // offset += tile_j * ld_output_col
+ "add x24, x27, x27\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "mul x20, x20, x21\n" // offset *= output_tile_size
+ "mov x21, #0x0\n"
+ "ld1h { z8.h }, p3/Z, [x6]\n"
+ "add x25, x25, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "sub x20, XZR, x26\n"
+ "ld1h { z9.h }, p2/Z, [x16, x7, LSL #1]\n"
+ "add x23, x25, x22, LSL #1\n"
+ "ld1h { z10.h }, p2/Z, [x5]\n"
+ "addvl x6, x6, #1\n"
+ "add x22, x23, x22, LSL #1\n"
+ "ld1h { z11.h }, p2/Z, [x5, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x13]\n"
+ "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x26, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "inch x26\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "inch x20\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z18.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x5, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x8]\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x8, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x14]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "addvl x8, x8, #1\n"
+ "ld1h { z12.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x5, x7, LSL #1]\n"
+ "addvl x5, x5, #1\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z10.h }, p1/Z, [x5]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x16]\n"
+ "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "addvl x16, x16, #1\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z9.h }, p1/Z, [x16, x7, LSL #1]\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "addvl x13, x13, #1\n"
+ "cmp x26, %x[n_channels]\n"
+ "ld1h { z11.h }, p1/Z, [x5, x15, LSL #1]\n"
+ "fmax z23.h, p3/M, z23.h, z17.h\n"
+ "ld1h { z12.h }, p1/Z, [x13]\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+ "addvl x6, x6, #4\n"
+ ".inst 0xc170ca38 // fclamp { z24.h-z27.h }, z17.h, z16.h\n"
+ "ld1h { z13.h }, p1/Z, [x8, x7, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x6]\n"
+ "addvl x6, x6, #1\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z26.h }, p0, [x23]\n"
+ "st1h { z27.h }, p0, [x23, x27, LSL #1]\n"
+ "st1h { z23.h }, p0, [x25]\n"
+ "st1h { z24.h }, p0, [x25, x27, LSL #1]\n"
+ "st1h { z25.h }, p0, [x25, x24, LSL #1]\n"
+ "addvl x25, x25, #1\n"
+ "st1h { z28.h }, p0, [x23, x24, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z29.h }, p0, [x22]\n"
+ "st1h { z30.h }, p0, [x22, x27, LSL #1]\n"
+ "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z24, z18\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "movprfx z23, z18\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z25, z18\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z18\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z27, z18\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z18\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "movprfx z29, z18\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x3, x3, #0x1\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x16, x17, LSL #1]\n"
+ "add x20, x2, #0x1\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "cmp x3, x9\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "csel x2, x2, x20, LT\n"
+ "csel x3, x3, XZR, LT\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "movprfx z30, z18\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "cmp x2, x21\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x5, x4, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x8]\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x8, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x14]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x17, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x4, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x5, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x16]\n"
+ "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmax z23.h, p3/M, z23.h, z17.h\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ ".inst 0xc170ca38 // fclamp { z24.h-z27.h }, z17.h, z16.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ ".inst 0xc170ca3c // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+ "st1h { z26.h }, p0, [x23]\n"
+ "st1h { z27.h }, p0, [x23, x27, LSL #1]\n"
+ "st1h { z23.h }, p0, [x25]\n"
+ "st1h { z24.h }, p0, [x25, x27, LSL #1]\n"
+ "st1h { z25.h }, p0, [x25, x24, LSL #1]\n"
+ "st1h { z28.h }, p0, [x23, x24, LSL #1]\n"
+ "st1h { z29.h }, p0, [x22]\n"
+ "st1h { z30.h }, p0, [x22, x27, LSL #1]\n"
+ "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..e85cb9e017
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "mov x15, #0x0\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ldp x14, x13, [x16, #0x0]\n"
+ "ldp x12, x11, [x16, #0x10]\n"
+ "cnth x10\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1h { z17.h }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "ldr x9, [x16, #0x20]\n"
+ "cmp x10, %x[n_channels]\n"
+ ".inst 0xa040a220 // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "sub x27, XZR, x10\n"
+ ".inst 0xa040a224 // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z8.h }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z23, z17\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x16, #0x30]\n"
+ "inch x27\n"
+ "movprfx z25, z17\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x25, [x16, #0x38]\n"
+ "mov p1.b, p2.b\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x24, [x16, #0x28]\n"
+ "whilelt p0.h, x10, %x[n_channels]\n"
+ "movprfx z29, z17\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x13, [x16, #0x48]\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "ldr x14, [x16, #0x40]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ldr x12, [x16, #0x50]\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x11, [x16, #0x58]\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x9, [x16, #0x60]\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ldr x24, [x16, #0x68]\n"
+ "ld1h { z17.h }, p3/Z, [x17]\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
+ "ldr x26, [x16, #0x70]\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "ldr x25, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ldr x14, [x16, #0x80]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "ldr x13, [x16, #0x88]\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldr x12, [x16, #0x90]\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "ldr x23, [x28, #0x0]\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "ldr x22, [x28, #0x8]\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "ldr x9, [x16, #0xa0]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "ldr x11, [x16, #0x98]\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "ldr x21, [x28, #0x10]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xa8]\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xb8]\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "ldr x14, [x16, #0xc0]\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "ldr x20, [x28, #0x18]\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ldr x9, [x16, #0x20]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldp x14, x13, [x16, #0x0]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmax z23.h, p3/M, z23.h, z18.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ldp x12, x11, [x16, #0x10]\n"
+ "inch x15\n"
+ ".inst 0xa040a220 // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "whilelt p2.h, x15, %x[n_channels]\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ ".inst 0xc170ca58 // fclamp { z24.h-z27.h }, z18.h, z16.h\n"
+ "ld1h { z9.h }, p0/Z, [x14, x10, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z10.h }, p0/Z, [x13, x10, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ "ld1h { z11.h }, p0/Z, [x12, x10, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x11, x10, LSL #1]\n"
+ "st1h { z24.h }, p1, [x22, x27, LSL #1]\n"
+ "ldr x22, [x28, #0x28]\n"
+ "st1h { z25.h }, p1, [x21, x27, LSL #1]\n"
+ "ldr x21, [x28, #0x30]\n"
+ "ld1h { z13.h }, p0/Z, [x9, x10, LSL #1]\n"
+ "inch x10\n"
+ "st1h { z23.h }, p1, [x23, x27, LSL #1]\n"
+ "ldr x23, [x28, #0x20]\n"
+ ".inst 0xa040a224 // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "st1h { z26.h }, p1, [x20, x27, LSL #1]\n"
+ "ldr x20, [x28, #0x38]\n"
+ "cmp x10, %x[n_channels]\n"
+ ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "ld1h { z8.h }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1h { z27.h }, p1, [x23, x27, LSL #1]\n"
+ "ldr x23, [x28, #0x40]\n"
+ "st1h { z28.h }, p1, [x22, x27, LSL #1]\n"
+ "st1h { z29.h }, p1, [x21, x27, LSL #1]\n"
+ "st1h { z30.h }, p1, [x20, x27, LSL #1]\n"
+ "st1h { z31.h }, p1, [x23, x27, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z23, z17\n fmla z23.h, p3/M, z8.h, z9.h\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x16, #0x30]\n"
+ "inch x27\n"
+ "movprfx z25, z17\n fmla z25.h, p3/M, z6.h, z9.h\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z5.h, z9.h\n"
+ "ldr x25, [x16, #0x38]\n"
+ "mov p1.b, p2.b\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z4.h, z9.h\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z3.h, z9.h\n"
+ "ldr x24, [x16, #0x28]\n"
+ "movprfx z29, z17\n fmla z29.h, p3/M, z2.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "ldr x13, [x16, #0x48]\n"
+ "fmla z23.h, p3/M, z0.h, z10.h\n"
+ "fmla z24.h, p3/M, z4.h, z13.h\n"
+ "ldr x14, [x16, #0x40]\n"
+ "fmla z25.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "ldr x12, [x16, #0x50]\n"
+ "fmla z27.h, p3/M, z1.h, z13.h\n"
+ "fmla z28.h, p3/M, z0.h, z13.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "ldr x11, [x16, #0x58]\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x9, [x16, #0x60]\n"
+ "fmla z23.h, p3/M, z5.h, z13.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ldr x24, [x16, #0x68]\n"
+ "fmla z25.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z11.h\n"
+ "ldr x26, [x16, #0x70]\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "ldr x25, [x16, #0x78]\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z28.h, p3/M, z4.h, z10.h\n"
+ "ldr x14, [x16, #0x80]\n"
+ "fmla z23.h, p3/M, z7.h, z11.h\n"
+ "fmla z24.h, p3/M, z0.h, z13.h\n"
+ "ldr x13, [x16, #0x88]\n"
+ "fmla z29.h, p3/M, z1.h, z11.h\n"
+ "fmla z25.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "ldr x12, [x16, #0x90]\n"
+ "fmla z27.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z1.h, z10.h\n"
+ "ldr x23, [x28, #0x0]\n"
+ "fmla z30.h, p3/M, z2.h, z10.h\n"
+ "ldr x22, [x28, #0x8]\n"
+ "fmla z23.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z0.h, z11.h\n"
+ "ldr x9, [x16, #0xa0]\n"
+ "fmla z25.h, p3/M, z7.h, z10.h\n"
+ "ldr x11, [x16, #0x98]\n"
+ "fmla z28.h, p3/M, z2.h, z13.h\n"
+ "ldr x21, [x28, #0x10]\n"
+ "fmla z29.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xb0]\n"
+ "fmla z24.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xa8]\n"
+ "fmla z25.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xb8]\n"
+ "fmla z27.h, p3/M, z7.h, z10.h\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmla z29.h, p3/M, z5.h, z10.h\n"
+ "ldr x14, [x16, #0xc0]\n"
+ "fmla z31.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z10.h\n"
+ "ldr x20, [x28, #0x18]\n"
+ "fmla z24.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z27.h, p3/M, z0.h, z12.h\n"
+ "fmla z30.h, p3/M, z6.h, z13.h\n"
+ "fmla z29.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x12, x15, LSL #1]\n"
+ "fmla z31.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z4.h, z11.h\n"
+ "fmla z27.h, p3/M, z2.h, z11.h\n"
+ "fmla z28.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z29.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z7.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x24, x15, LSL #1]\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "fmla z24.h, p3/M, z1.h, z11.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z30.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z7.h, z13.h\n"
+ "fmla z31.h, p3/M, z4.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z12.h\n"
+ "fmla z29.h, p3/M, z0.h, z12.h\n"
+ "fmla z27.h, p3/M, z8.h, z13.h\n"
+ "fmla z26.h, p3/M, z3.h, z12.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "fmla z30.h, p3/M, z5.h, z13.h\n"
+ "ld1h { z13.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmax z23.h, p3/M, z23.h, z18.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "fmla z29.h, p3/M, z8.h, z13.h\n"
+ ".inst 0xc170ca58 // fclamp { z24.h-z27.h }, z18.h, z16.h\n"
+ "fmla z30.h, p3/M, z7.h, z13.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmla z31.h, p3/M, z6.h, z13.h\n"
+ "st1h { z24.h }, p1, [x22, x27, LSL #1]\n"
+ "ldr x22, [x28, #0x28]\n"
+ "st1h { z25.h }, p1, [x21, x27, LSL #1]\n"
+ "ldr x21, [x28, #0x30]\n"
+ "st1h { z26.h }, p1, [x20, x27, LSL #1]\n"
+ "ldr x20, [x28, #0x38]\n"
+ "st1h { z23.h }, p1, [x23, x27, LSL #1]\n"
+ "ldr x23, [x28, #0x20]\n"
+ ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "st1h { z27.h }, p1, [x23, x27, LSL #1]\n"
+ "ldr x23, [x28, #0x40]\n"
+ "st1h { z28.h }, p1, [x22, x27, LSL #1]\n"
+ "st1h { z29.h }, p1, [x21, x27, LSL #1]\n"
+ "st1h { z30.h }, p1, [x20, x27, LSL #1]\n"
+ "st1h { z31.h }, p1, [x23, x27, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6b75d12295
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+// Entry points of the generated SME2 kernels (defined in the accompanying
+// generic_indirect.cpp / generic_direct.cpp files): the "indirect" form reads
+// input pixels through an array of per-tap pointers, the "direct" form walks
+// a tile grid over a contiguous tensor using row/column strides.
+void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Depthwise depth-first strategy descriptor for an SME2 FP16 NHWC kernel with
+// a 3x3 filter, stride 1, producing a 4x4 spatial output tile per invocation.
+// It only wires the kernel geometry and the two generated entry points into
+// the generic DepthwiseDepthfirstStrategy machinery.
+class sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Pointers to the generated implementations returned by the accessors below.
+  Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  // Kernel uses the Scalable Matrix Extension vector length.
+  constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
+  sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..37a9febf47
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,672 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (tile-walking) variant of the SME2 FP16 NHWC 3x3 stride-1 depthwise
+// kernel.  The input is processed as an n_tile_rows x n_tile_cols grid; each
+// tile produces a 4x4 block of output pixels for all channels, with results
+// clamped to [activation_min, activation_max].  The body is generated
+// streaming-SVE assembly; only the Args marshalling struct is hand-readable.
+void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  // Argument block read by the inline assembly via offsetof(); any change to
+  // this struct's layout must be mirrored in the asm operand list below.
+  // tile_i/tile_j are live loop state: the asm stores and reloads them here.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      // NOTE(review): these are taken as float and narrowed into the __fp16
+      // members, although the enclosing function receives __fp16 and the
+      // indirect variant's Args takes __fp16 directly.  The fp16->float->fp16
+      // round trip is value-preserving, but the inconsistency looks like
+      // generator noise — confirm against the kernel generator.
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  // Generated SME2 assembly.  Structure: SMSTART enters streaming mode;
+  // label 1 is the per-tile loop (tile_i/tile_j kept in the Args struct);
+  // the block ending at label 2 issues one-shot rprfm prefetches for the
+  // tile's input rows (first tile column only); label 3 is the main
+  // vector-length channel loop and label 4 the channel tail, which also
+  // advances the tile indices and branches back to label 1.  SMSTOP exits
+  // streaming mode.  Do not edit by hand.
+  __asm__ __volatile__(
+    ".inst 0xd503477f  // SMSTART ZA\n"
+    "mov x1, #0x0\n"
+    "mov x2, #0x0\n"
+    "ptrue p3.b\n"
+    ".inst 0x25207810  // ptrue pn8.b\n"
+    "1:"  // Tile loop
+    "str x1, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x22, #0x4\n"
+    "str x2, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x20, x1, x21\n"  // offset = tile_i * ld_input_row
+    "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
+    "madd x20, x2, x3, x20\n"  // offset += tile_j * ld_input_col
+    "mul x20, x20, x22\n"  // offset *= kernel_stride * output_size
+    "add x6, x3, x3\n"
+    "add x4, x4, x20, LSL #1\n"  // inptr[0] += offset * sizeof(__fp16)
+    "add x7, x4, x21, LSL #1\n"
+    "add x8, x6, x3\n"
+    "add x17, x7, x21, LSL #1\n"
+    "add x16, x8, x3\n"
+    "add x15, x17, x21, LSL #1\n"
+    "add x14, x16, x3\n"
+    "add x13, x15, x21, LSL #1\n"
+    "add x12, x13, x21, LSL #1\n"
+    "cbnz x2, 2f\n"
+    "ldr x11, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "lsl x10, %x[n_channels], #0x1\n"
+    "mov x21, #0x8\n"
+    "mul x21, x21, x3\n"
+    "add x9, x17, x6, LSL #1\n"
+    "add x28, x4, x14, LSL #1\n"
+    "add x27, x17, x8, LSL #1\n"
+    "sub x20, x11, x2\n"
+    "add x26, x12, x14, LSL #1\n"
+    "sub x20, x20, #0x1\n"
+    "add x25, x15, x6, LSL #1\n"
+    "and x20, x20, #0x3fffff\n"
+    "add x24, x4, x3, LSL #1\n"
+    "orr x10, x10, x20, LSL #22\n"
+    "add x23, x4, x16, LSL #1\n"
+    "orr x10, x10, x21, LSL #38\n"
+    "add x22, x15, x8, LSL #1\n"
+    "add x21, x7, x14, LSL #1\n"
+    "add x20, x7, x6, LSL #1\n"
+    ".inst 0xf8aa493a  // rprfm pldonce, x10, [x9]\n"
+    "add x9, x13, x14, LSL #1\n"
+    ".inst 0xf8aa489a  // rprfm pldonce, x10, [x4]\n"
+    ".inst 0xf8aa4b9a  // rprfm pldonce, x10, [x28]\n"
+    "add x28, x7, x8, LSL #1\n"
+    ".inst 0xf8aa4b7a  // rprfm pldonce, x10, [x27]\n"
+    "add x27, x12, x3, LSL #1\n"
+    ".inst 0xf8aa499a  // rprfm pldonce, x10, [x12]\n"
+    ".inst 0xf8aa4b5a  // rprfm pldonce, x10, [x26]\n"
+    "add x26, x17, x3, LSL #1\n"
+    ".inst 0xf8aa4b3a  // rprfm pldonce, x10, [x25]\n"
+    "add x25, x12, x16, LSL #1\n"
+    ".inst 0xf8aa4b1a  // rprfm pldonce, x10, [x24]\n"
+    "add x24, x17, x16, LSL #1\n"
+    ".inst 0xf8aa4afa  // rprfm pldonce, x10, [x23]\n"
+    "add x23, x4, x6, LSL #1\n"
+    ".inst 0xf8aa4ada  // rprfm pldonce, x10, [x22]\n"
+    "add x22, x15, x3, LSL #1\n"
+    ".inst 0xf8aa48fa  // rprfm pldonce, x10, [x7]\n"
+    ".inst 0xf8aa4aba  // rprfm pldonce, x10, [x21]\n"
+    "add x21, x4, x8, LSL #1\n"
+    ".inst 0xf8aa49ba  // rprfm pldonce, x10, [x13]\n"
+    ".inst 0xf8aa4a9a  // rprfm pldonce, x10, [x20]\n"
+    "add x20, x15, x16, LSL #1\n"
+    ".inst 0xf8aa493a  // rprfm pldonce, x10, [x9]\n"
+    "add x9, x17, x14, LSL #1\n"
+    ".inst 0xf8aa4b9a  // rprfm pldonce, x10, [x28]\n"
+    "add x28, x13, x6, LSL #1\n"
+    ".inst 0xf8aa4b7a  // rprfm pldonce, x10, [x27]\n"
+    "add x27, x15, x14, LSL #1\n"
+    ".inst 0xf8aa4b5a  // rprfm pldonce, x10, [x26]\n"
+    "add x26, x12, x6, LSL #1\n"
+    ".inst 0xf8aa4b3a  // rprfm pldonce, x10, [x25]\n"
+    "add x25, x13, x8, LSL #1\n"
+    ".inst 0xf8aa4b1a  // rprfm pldonce, x10, [x24]\n"
+    "add x24, x12, x8, LSL #1\n"
+    ".inst 0xf8aa4afa  // rprfm pldonce, x10, [x23]\n"
+    "add x23, x7, x3, LSL #1\n"
+    ".inst 0xf8aa4ada  // rprfm pldonce, x10, [x22]\n"
+    "add x22, x7, x16, LSL #1\n"
+    ".inst 0xf8aa4aba  // rprfm pldonce, x10, [x21]\n"
+    "add x21, x13, x3, LSL #1\n"
+    ".inst 0xf8aa4a3a  // rprfm pldonce, x10, [x17]\n"
+    ".inst 0xf8aa4a9a  // rprfm pldonce, x10, [x20]\n"
+    "add x20, x13, x16, LSL #1\n"
+    ".inst 0xf8aa493a  // rprfm pldonce, x10, [x9]\n"
+    ".inst 0xf8aa49fa  // rprfm pldonce, x10, [x15]\n"
+    ".inst 0xf8aa4b9a  // rprfm pldonce, x10, [x28]\n"
+    ".inst 0xf8aa4b7a  // rprfm pldonce, x10, [x27]\n"
+    ".inst 0xf8aa4b5a  // rprfm pldonce, x10, [x26]\n"
+    ".inst 0xf8aa4b3a  // rprfm pldonce, x10, [x25]\n"
+    ".inst 0xf8aa4b1a  // rprfm pldonce, x10, [x24]\n"
+    ".inst 0xf8aa4afa  // rprfm pldonce, x10, [x23]\n"
+    ".inst 0xf8aa4ada  // rprfm pldonce, x10, [x22]\n"
+    ".inst 0xf8aa4aba  // rprfm pldonce, x10, [x21]\n"
+    ".inst 0xf8aa4a9a  // rprfm pldonce, x10, [x20]\n"
+    "2:"  // Tile loop: Prefetch input rows: End
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "mov x21, #0x4\n"
+    "ld1h { z15.h }, p3/Z, [x5]\n"
+    "addvl x5, x5, #1\n"
+    "ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "cnth x28\n"
+    ".inst 0xa040a0a0  // ld1h { z0.h-z3.h }, pn8.b/Z, [x5]\n"
+    "addvl x5, x5, #4\n"
+    "ldr x27, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    ".inst 0xa040a0a4  // ld1h { z4.h-z7.h }, pn8.b/Z, [x5]\n"
+    "addvl x5, x5, #4\n"
+    "mul x20, x1, x22\n"  // offset = tile_i * ld_output_row
+    "cmp x28, %x[n_channels]\n"
+    "ld1rh { z14.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "madd x20, x2, x9, x20\n"  // offset += tile_j * ld_output_col
+    "add x26, x9, x9\n"
+    "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "mul x20, x20, x21\n"  // offset *= output_tile_size
+    "add x25, x26, x9\n"
+    "ld1h { z8.h }, p3/Z, [x5]\n"
+    "add x27, x27, x20, LSL #1\n"  // outptrs[0] += offset * sizeof(__fp16)
+    "mov x21, #0x0\n"
+    "ld1h { z9.h }, p2/Z, [x17, x6, LSL #1]\n"
+    "add x24, x27, x22, LSL #1\n"
+    "sub x20, XZR, x28\n"
+    "ld1h { z10.h }, p2/Z, [x4]\n"
+    "add x23, x24, x22, LSL #1\n"
+    "ld1h { z11.h }, p2/Z, [x4, x14, LSL #1]\n"
+    "addvl x5, x5, #1\n"
+    "add x22, x23, x22, LSL #1\n"
+    "ld1h { z12.h }, p2/Z, [x17, x8, LSL #1]\n"
+    "bge 4f\n"
+    "3:"  // Tile loop: Channel loop
+    "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
+    "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+    "whilelt p1.h, x28, %x[n_channels]\n"
+    "inch x21\n"
+    "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
+    "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+    "inch x28\n"
+    "mov p0.b, p2.b\n"
+    "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
+    "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
+    "inch x20\n"
+    "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
+    "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
+    "fmla z21.h, p3/M, z5.h, z12.h\n"
+    "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x15, x6, LSL #1]\n"
+    "fmla z16.h, p3/M, z0.h, z10.h\n"
+    "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x12]\n"
+    "fmla z22.h, p3/M, z4.h, z12.h\n"
+    "fmla z25.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
+    "fmla z26.h, p3/M, z1.h, z12.h\n"
+    "fmla z17.h, p3/M, z8.h, z12.h\n"
+    "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+    "fmla z21.h, p3/M, z7.h, z9.h\n"
+    "ld1h { z10.h }, p2/Z, [x15, x8, LSL #1]\n"
+    "fmla z18.h, p3/M, z7.h, z12.h\n"
+    "fmla z19.h, p3/M, z6.h, z12.h\n"
+    "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
+    "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x4, x3, LSL #1]\n"
+    "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
+    "fmla z22.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z11.h }, p2/Z, [x4, x16, LSL #1]\n"
+    "fmla z25.h, p3/M, z4.h, z9.h\n"
+    "fmla z26.h, p3/M, z3.h, z9.h\n"
+    "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
+    "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z15.h }, p3/Z, [x5]\n"
+    "addvl x5, x5, #1\n"
+    "fmla z20.h, p3/M, z8.h, z9.h\n"
+    "fmla z24.h, p3/M, z5.h, z9.h\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "fmla z21.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z9.h }, p2/Z, [x7]\n"
+    "fmla z16.h, p3/M, z1.h, z12.h\n"
+    "fmla z17.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x7, x14, LSL #1]\n"
+    "fmla z18.h, p3/M, z2.h, z11.h\n"
+    "fmla z19.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13]\n"
+    "fmla z22.h, p3/M, z7.h, z10.h\n"
+    "fmla z23.h, p3/M, z6.h, z10.h\n"
+    "fmla z25.h, p3/M, z5.h, z10.h\n"
+    "fmla z26.h, p3/M, z4.h, z10.h\n"
+    "fmla z27.h, p3/M, z3.h, z10.h\n"
+    "fmla z29.h, p3/M, z2.h, z10.h\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x7, x6, LSL #1]\n"
+    "fmla z20.h, p3/M, z0.h, z9.h\n"
+    "fmla z24.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x14, LSL #1]\n"
+    "fmla z16.h, p3/M, z3.h, z9.h\n"
+    "fmla z21.h, p3/M, z1.h, z10.h\n"
+    "fmla z19.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z2.h, z12.h\n"
+    "fmla z17.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z12.h }, p2/Z, [x7, x8, LSL #1]\n"
+    "fmla z18.h, p3/M, z3.h, z10.h\n"
+    "fmla z22.h, p3/M, z0.h, z10.h\n"
+    "fmla z27.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x3, LSL #1]\n"
+    "fmla z20.h, p3/M, z2.h, z10.h\n"
+    "fmla z21.h, p3/M, z2.h, z12.h\n"
+    "fmla z16.h, p3/M, z5.h, z10.h\n"
+    "fmla z17.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z10.h }, p2/Z, [x17, x3, LSL #1]\n"
+    "fmla z18.h, p3/M, z4.h, z12.h\n"
+    "fmla z19.h, p3/M, z3.h, z12.h\n"
+    "fmla z22.h, p3/M, z1.h, z12.h\n"
+    "fmla z23.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x17, x16, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z11.h\n"
+    "fmla z29.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x16, LSL #1]\n"
+    "fmla z20.h, p3/M, z4.h, z10.h\n"
+    "fmla z21.h, p3/M, z3.h, z10.h\n"
+    "fmla z24.h, p3/M, z1.h, z10.h\n"
+    "fmla z25.h, p3/M, z0.h, z10.h\n"
+    "fmla z16.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x4, x6, LSL #1]\n"
+    "fmla z30.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x15, x3, LSL #1]\n"
+    "fmla z18.h, p3/M, z8.h, z12.h\n"
+    "fmla z19.h, p3/M, z7.h, z12.h\n"
+    "fmla z22.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z4.h, z12.h\n"
+    "fmla z26.h, p3/M, z2.h, z12.h\n"
+    "fmla z27.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x4, x8, LSL #1]\n"
+    "addvl x4, x4, #1\n"
+    "fmla z20.h, p3/M, z7.h, z11.h\n"
+    "fmla z21.h, p3/M, z6.h, z11.h\n"
+    "fmla z24.h, p3/M, z4.h, z11.h\n"
+    "fmla z25.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x15, x16, LSL #1]\n"
+    "fmla z16.h, p3/M, z2.h, z10.h\n"
+    "fmla z17.h, p3/M, z1.h, z10.h\n"
+    "fmla z18.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x17]\n"
+    "fmla z19.h, p3/M, z0.h, z12.h\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z22.h, p3/M, z8.h, z11.h\n"
+    "fmla z23.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z5.h, z11.h\n"
+    "fmla z20.h, p3/M, z3.h, z10.h\n"
+    "fmla z24.h, p3/M, z0.h, z10.h\n"
+    "fmla z27.h, p3/M, z4.h, z11.h\n"
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+    "fmla z17.h, p3/M, z2.h, z12.h\n"
+    "fmla z18.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x17, x14, LSL #1]\n"
+    "addvl x17, x17, #1\n"
+    "fmla z16.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x15]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z25.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z9.h }, p1/Z, [x17, x6, LSL #1]\n"
+    "fmla z19.h, p3/M, z8.h, z12.h\n"
+    "fmla z23.h, p3/M, z5.h, z12.h\n"
+    "fmla z27.h, p3/M, z2.h, z12.h\n"
+    "fmla z20.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z12.h }, p2/Z, [x15, x14, LSL #1]\n"
+    "addvl x15, x15, #1\n"
+    "fmla z24.h, p3/M, z3.h, z10.h\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x6, LSL #1]\n"
+    "fmla z26.h, p3/M, z6.h, z11.h\n"
+    "fmla z31.h, p3/M, z2.h, z12.h\n"
+    "fmla z23.h, p3/M, z8.h, z12.h\n"
+    "fmla z29.h, p3/M, z7.h, z10.h\n"
+    "fmla z30.h, p3/M, z6.h, z10.h\n"
+    "fmla z27.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x12, x8, LSL #1]\n"
+    "addvl x12, x12, #1\n"
+    "fmla z24.h, p3/M, z8.h, z11.h\n"
+    "fmla z28.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x8, LSL #1]\n"
+    "fmla z29.h, p3/M, z5.h, z11.h\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "fmla z25.h, p3/M, z8.h, z11.h\n"
+    "fmla z26.h, p3/M, z7.h, z11.h\n"
+    "fmla z27.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x7, x16, LSL #1]\n"
+    "fmla z28.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x7, x3, LSL #1]\n"
+    "addvl x7, x7, #1\n"
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "fmla z30.h, p3/M, z7.h, z12.h\n"
+    "fmla z31.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x13, x3, LSL #1]\n"
+    "fmla z18.h, p3/M, z5.h, z11.h\n"
+    "fmla z19.h, p3/M, z4.h, z11.h\n"
+    "fmla z16.h, p3/M, z4.h, z10.h\n"
+    "fmla z17.h, p3/M, z3.h, z10.h\n"
+    "fmla z20.h, p3/M, z1.h, z10.h\n"
+    "fmla z21.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x16, LSL #1]\n"
+    "whilelt p2.h, x21, %x[n_channels]\n"
+    "fmla z22.h, p3/M, z2.h, z11.h\n"
+    "fmla z23.h, p3/M, z1.h, z11.h\n"
+    "cmp x28, %x[n_channels]\n"
+    "addvl x13, x13, #1\n"
+    "fmla z24.h, p3/M, z7.h, z12.h\n"
+    "fmla z25.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z11.h }, p1/Z, [x4, x14, LSL #1]\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    ".inst 0xa040a0a0  // ld1h { z0.h-z3.h }, pn8.b/Z, [x5]\n"
+    "addvl x5, x5, #4\n"
+    "fmla z26.h, p3/M, z8.h, z10.h\n"
+    "fmla z27.h, p3/M, z7.h, z10.h\n"
+    "ld1h { z12.h }, p1/Z, [x17, x8, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z10.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    ".inst 0xa040a0a4  // ld1h { z4.h-z7.h }, pn8.b/Z, [x5]\n"
+    "addvl x5, x5, #4\n"
+    ".inst 0xc16dc9d0  // fclamp { z16.h-z19.h }, z14.h, z13.h\n"
+    ".inst 0xc16dc9d4  // fclamp { z20.h-z23.h }, z14.h, z13.h\n"
+    "ld1h { z10.h }, p1/Z, [x4]\n"
+    "ld1h { z8.h }, p3/Z, [x5]\n"
+    "addvl x5, x5, #1\n"
+    ".inst 0xc16dc9d8  // fclamp { z24.h-z27.h }, z14.h, z13.h\n"
+    ".inst 0xc16dc9dc  // fclamp { z28.h-z31.h }, z14.h, z13.h\n"
+    "st1h { z16.h }, p0, [x27]\n"
+    "st1h { z17.h }, p0, [x27, x9, LSL #1]\n"
+    "st1h { z18.h }, p0, [x27, x26, LSL #1]\n"
+    "st1h { z19.h }, p0, [x27, x25, LSL #1]\n"
+    "addvl x27, x27, #1\n"
+    "st1h { z20.h }, p0, [x24]\n"
+    "st1h { z21.h }, p0, [x24, x9, LSL #1]\n"
+    "st1h { z22.h }, p0, [x24, x26, LSL #1]\n"
+    "st1h { z23.h }, p0, [x24, x25, LSL #1]\n"
+    "addvl x24, x24, #1\n"
+    "st1h { z24.h }, p0, [x23]\n"
+    "st1h { z25.h }, p0, [x23, x9, LSL #1]\n"
+    "st1h { z26.h }, p0, [x23, x26, LSL #1]\n"
+    "st1h { z27.h }, p0, [x23, x25, LSL #1]\n"
+    "addvl x23, x23, #1\n"
+    "st1h { z28.h }, p0, [x22]\n"
+    "st1h { z29.h }, p0, [x22, x9, LSL #1]\n"
+    "st1h { z30.h }, p0, [x22, x26, LSL #1]\n"
+    "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+    "addvl x22, x22, #1\n"
+    "blt 3b\n"
+    "4:"  // Tile loop: Channel tail
+    "movprfx z21, z15\n fmla z21.h, p3/M, z4.h, z9.h\n"
+    "movprfx z16, z15\n fmla z16.h, p3/M, z8.h, z9.h\n"
+    "ldr x2, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov p0.b, p2.b\n"
+    "movprfx z22, z15\n fmla z22.h, p3/M, z3.h, z9.h\n"
+    "movprfx z25, z15\n fmla z25.h, p3/M, z1.h, z9.h\n"
+    "ldr x1, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "movprfx z26, z15\n fmla z26.h, p3/M, z0.h, z9.h\n"
+    "movprfx z17, z15\n fmla z17.h, p3/M, z7.h, z9.h\n"
+    "ldr x11, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "movprfx z18, z15\n fmla z18.h, p3/M, z6.h, z9.h\n"
+    "movprfx z20, z15\n fmla z20.h, p3/M, z5.h, z9.h\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x2, x2, #0x1\n"
+    "fmla z21.h, p3/M, z5.h, z12.h\n"
+    "movprfx z24, z15\n fmla z24.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x15, x6, LSL #1]\n"
+    "add x20, x1, #0x1\n"
+    "fmla z16.h, p3/M, z0.h, z10.h\n"
+    "movprfx z19, z15\n fmla z19.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x12]\n"
+    "cmp x2, x11\n"
+    "fmla z22.h, p3/M, z4.h, z12.h\n"
+    "fmla z25.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x14, LSL #1]\n"
+    "csel x1, x1, x20, LT\n"
+    "fmla z26.h, p3/M, z1.h, z12.h\n"
+    "fmla z17.h, p3/M, z8.h, z12.h\n"
+    "csel x2, x2, XZR, LT\n"
+    "cmp x1, x21\n"
+    "movprfx z28, z15\n fmla z28.h, p3/M, z6.h, z10.h\n"
+    "fmla z21.h, p3/M, z7.h, z9.h\n"
+    "ld1h { z10.h }, p2/Z, [x15, x8, LSL #1]\n"
+    "fmla z18.h, p3/M, z7.h, z12.h\n"
+    "fmla z19.h, p3/M, z6.h, z12.h\n"
+    "movprfx z23, z15\n fmla z23.h, p3/M, z3.h, z12.h\n"
+    "movprfx z27, z15\n fmla z27.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x4, x3, LSL #1]\n"
+    "movprfx z31, z15\n fmla z31.h, p3/M, z8.h, z11.h\n"
+    "fmla z22.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z11.h }, p2/Z, [x4, x16, LSL #1]\n"
+    "fmla z25.h, p3/M, z4.h, z9.h\n"
+    "fmla z26.h, p3/M, z3.h, z9.h\n"
+    "movprfx z29, z15\n fmla z29.h, p3/M, z1.h, z9.h\n"
+    "movprfx z30, z15\n fmla z30.h, p3/M, z0.h, z9.h\n"
+    "fmla z20.h, p3/M, z8.h, z9.h\n"
+    "fmla z24.h, p3/M, z5.h, z9.h\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "fmla z21.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z9.h }, p2/Z, [x7]\n"
+    "fmla z16.h, p3/M, z1.h, z12.h\n"
+    "fmla z17.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x7, x14, LSL #1]\n"
+    "fmla z18.h, p3/M, z2.h, z11.h\n"
+    "fmla z19.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13]\n"
+    "fmla z22.h, p3/M, z7.h, z10.h\n"
+    "fmla z23.h, p3/M, z6.h, z10.h\n"
+    "fmla z25.h, p3/M, z5.h, z10.h\n"
+    "fmla z26.h, p3/M, z4.h, z10.h\n"
+    "fmla z27.h, p3/M, z3.h, z10.h\n"
+    "fmla z29.h, p3/M, z2.h, z10.h\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x7, x6, LSL #1]\n"
+    "fmla z20.h, p3/M, z0.h, z9.h\n"
+    "fmla z24.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x14, LSL #1]\n"
+    "fmla z16.h, p3/M, z3.h, z9.h\n"
+    "fmla z21.h, p3/M, z1.h, z10.h\n"
+    "fmla z19.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z2.h, z12.h\n"
+    "fmla z17.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z12.h }, p2/Z, [x7, x8, LSL #1]\n"
+    "fmla z18.h, p3/M, z3.h, z10.h\n"
+    "fmla z22.h, p3/M, z0.h, z10.h\n"
+    "fmla z27.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x3, LSL #1]\n"
+    "fmla z20.h, p3/M, z2.h, z10.h\n"
+    "fmla z21.h, p3/M, z2.h, z12.h\n"
+    "fmla z16.h, p3/M, z5.h, z10.h\n"
+    "fmla z17.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z10.h }, p2/Z, [x17, x3, LSL #1]\n"
+    "fmla z18.h, p3/M, z4.h, z12.h\n"
+    "fmla z19.h, p3/M, z3.h, z12.h\n"
+    "fmla z22.h, p3/M, z1.h, z12.h\n"
+    "fmla z23.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x17, x16, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z11.h\n"
+    "fmla z29.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x16, LSL #1]\n"
+    "fmla z20.h, p3/M, z4.h, z10.h\n"
+    "fmla z21.h, p3/M, z3.h, z10.h\n"
+    "fmla z24.h, p3/M, z1.h, z10.h\n"
+    "fmla z25.h, p3/M, z0.h, z10.h\n"
+    "fmla z16.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x4, x6, LSL #1]\n"
+    "fmla z30.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x15, x3, LSL #1]\n"
+    "fmla z18.h, p3/M, z8.h, z12.h\n"
+    "fmla z19.h, p3/M, z7.h, z12.h\n"
+    "fmla z22.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z4.h, z12.h\n"
+    "fmla z26.h, p3/M, z2.h, z12.h\n"
+    "fmla z27.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x4, x8, LSL #1]\n"
+    "fmla z20.h, p3/M, z7.h, z11.h\n"
+    "fmla z21.h, p3/M, z6.h, z11.h\n"
+    "fmla z24.h, p3/M, z4.h, z11.h\n"
+    "fmla z25.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x15, x16, LSL #1]\n"
+    "fmla z16.h, p3/M, z2.h, z10.h\n"
+    "fmla z17.h, p3/M, z1.h, z10.h\n"
+    "fmla z18.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x17]\n"
+    "fmla z19.h, p3/M, z0.h, z12.h\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z22.h, p3/M, z8.h, z11.h\n"
+    "fmla z23.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z5.h, z11.h\n"
+    "fmla z20.h, p3/M, z3.h, z10.h\n"
+    "fmla z24.h, p3/M, z0.h, z10.h\n"
+    "fmla z27.h, p3/M, z4.h, z11.h\n"
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x6, LSL #1]\n"
+    "fmla z17.h, p3/M, z2.h, z12.h\n"
+    "fmla z18.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x17, x14, LSL #1]\n"
+    "fmla z16.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x15]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z25.h, p3/M, z7.h, z11.h\n"
+    "fmla z19.h, p3/M, z8.h, z12.h\n"
+    "fmla z23.h, p3/M, z5.h, z12.h\n"
+    "fmla z27.h, p3/M, z2.h, z12.h\n"
+    "fmla z20.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z12.h }, p2/Z, [x15, x14, LSL #1]\n"
+    "fmla z24.h, p3/M, z3.h, z10.h\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x6, LSL #1]\n"
+    "fmla z26.h, p3/M, z6.h, z11.h\n"
+    "fmla z31.h, p3/M, z2.h, z12.h\n"
+    "fmla z23.h, p3/M, z8.h, z12.h\n"
+    "fmla z29.h, p3/M, z7.h, z10.h\n"
+    "fmla z30.h, p3/M, z6.h, z10.h\n"
+    "fmla z27.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x12, x8, LSL #1]\n"
+    "fmla z24.h, p3/M, z8.h, z11.h\n"
+    "fmla z28.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x8, LSL #1]\n"
+    "fmla z29.h, p3/M, z5.h, z11.h\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "fmla z25.h, p3/M, z8.h, z11.h\n"
+    "fmla z26.h, p3/M, z7.h, z11.h\n"
+    "fmla z27.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x7, x16, LSL #1]\n"
+    "fmla z28.h, p3/M, z8.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x7, x3, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "fmla z30.h, p3/M, z7.h, z12.h\n"
+    "fmla z31.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x13, x3, LSL #1]\n"
+    "fmla z18.h, p3/M, z5.h, z11.h\n"
+    "fmla z19.h, p3/M, z4.h, z11.h\n"
+    "fmla z16.h, p3/M, z4.h, z10.h\n"
+    "fmla z17.h, p3/M, z3.h, z10.h\n"
+    "fmla z20.h, p3/M, z1.h, z10.h\n"
+    "fmla z21.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x16, LSL #1]\n"
+    "fmla z22.h, p3/M, z2.h, z11.h\n"
+    "fmla z23.h, p3/M, z1.h, z11.h\n"
+    "fmla z24.h, p3/M, z7.h, z12.h\n"
+    "fmla z25.h, p3/M, z6.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "fmla z26.h, p3/M, z8.h, z10.h\n"
+    "fmla z27.h, p3/M, z7.h, z10.h\n"
+    "fmla z30.h, p3/M, z5.h, z10.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    ".inst 0xc16dc9d0  // fclamp { z16.h-z19.h }, z14.h, z13.h\n"
+    ".inst 0xc16dc9d4  // fclamp { z20.h-z23.h }, z14.h, z13.h\n"
+    ".inst 0xc16dc9d8  // fclamp { z24.h-z27.h }, z14.h, z13.h\n"
+    ".inst 0xc16dc9dc  // fclamp { z28.h-z31.h }, z14.h, z13.h\n"
+    "st1h { z16.h }, p0, [x27]\n"
+    "st1h { z17.h }, p0, [x27, x9, LSL #1]\n"
+    "st1h { z18.h }, p0, [x27, x26, LSL #1]\n"
+    "st1h { z19.h }, p0, [x27, x25, LSL #1]\n"
+    "st1h { z20.h }, p0, [x24]\n"
+    "st1h { z21.h }, p0, [x24, x9, LSL #1]\n"
+    "st1h { z22.h }, p0, [x24, x26, LSL #1]\n"
+    "st1h { z23.h }, p0, [x24, x25, LSL #1]\n"
+    "st1h { z24.h }, p0, [x23]\n"
+    "st1h { z25.h }, p0, [x23, x9, LSL #1]\n"
+    "st1h { z26.h }, p0, [x23, x26, LSL #1]\n"
+    "st1h { z27.h }, p0, [x23, x25, LSL #1]\n"
+    "st1h { z28.h }, p0, [x22]\n"
+    "st1h { z29.h }, p0, [x22, x9, LSL #1]\n"
+    "st1h { z30.h }, p0, [x22, x26, LSL #1]\n"
+    "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+    "blt 1b\n"
+    ".inst 0xd503467f  // SMSTOP\n"
+    // No outputs: results are written to memory through the pointers in Args.
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..2e6f1123a4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(  // SME2 FP16 NHWC depthwise 3x3, stride 1, 4x4 output tile; "indirect" variant: inputs arrive as an array of per-point pointers
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  struct Args  // Argument block read by the asm via offsetof(); field layout must stay in sync with the %[offsetof_*] operands below
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;  // activation clamp bounds, broadcast into z15/z13 below
+    const __fp16 *inptrs[36];  // 36 input points feed the 4x4 output tile of a 3x3 stride-1 kernel
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[14];  // permutation is the generator-chosen load order, not row-major
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[5];
+      inptrs[3] = input_ptrs[15];
+      inptrs[4] = input_ptrs[30];
+      inptrs[5] = input_ptrs[35];
+      inptrs[6] = input_ptrs[20];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[4];
+      inptrs[9] = input_ptrs[21];
+      inptrs[10] = input_ptrs[6];
+      inptrs[11] = input_ptrs[11];
+      inptrs[12] = input_ptrs[24];
+      inptrs[13] = input_ptrs[8];
+      inptrs[14] = input_ptrs[29];
+      inptrs[15] = input_ptrs[9];
+      inptrs[16] = input_ptrs[31];
+      inptrs[17] = input_ptrs[13];
+      inptrs[18] = input_ptrs[34];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[19];
+      inptrs[22] = input_ptrs[3];
+      inptrs[23] = input_ptrs[12];
+      inptrs[24] = input_ptrs[22];
+      inptrs[25] = input_ptrs[17];
+      inptrs[26] = input_ptrs[18];
+      inptrs[27] = input_ptrs[26];
+      inptrs[28] = input_ptrs[23];
+      inptrs[29] = input_ptrs[32];
+      inptrs[30] = input_ptrs[27];
+      inptrs[31] = input_ptrs[33];
+      inptrs[32] = input_ptrs[7];
+      inptrs[33] = input_ptrs[10];
+      inptrs[34] = input_ptrs[25];
+      inptrs[35] = input_ptrs[28];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  __asm__ __volatile__(
+    "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"  // x17 -> packed parameters (bias z14 + nine weights z0-z8 per channel block)
+    ".inst 0xd503477f  // SMSTART ZA\n"  // enter streaming SVE mode
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"  // x16 -> the 36-entry input-pointer table
+    "mov x15, #0x0\n"  // x15 = current channel offset (elements)
+    "ptrue p3.b\n"
+    ".inst 0x25207810  // ptrue pn8.b\n"
+    "ldp x14, x13, [x16, #0x0]\n"
+    "ldp x12, x11, [x16, #0x10]\n"
+    "cnth x10\n"  // x10 = vector length in halfwords (one channel block)
+    "whilelt p2.h, XZR, %x[n_channels]\n"  // p2 guards the (possibly partial) current channel block
+    "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"  // z15 = activation_min, broadcast
+    "ld1h { z14.h }, p3/Z, [x17]\n"  // z14 = bias; seeds every accumulator via movprfx below
+    "addvl x17, x17, #1\n"
+    "cmp x10, %x[n_channels]\n"
+    "ldr x9, [%x[params_struct], %[offsetof_args_outptrs]]\n"  // x9 -> the 16 output pointers (4x4 tile)
+    ".inst 0xa040a220  // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"
+    "addvl x17, x17, #4\n"
+    "sub x28, XZR, x10\n"
+    ".inst 0xa040a224  // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
+    "addvl x17, x17, #4\n"
+    "ld1rh { z13.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"  // z13 = activation_max, broadcast
+    "ld1h { z8.h }, p3/Z, [x17]\n"
+    "addvl x17, x17, #1\n"
+    "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "bge 2f\n"  // only one (possibly partial) block: skip straight to the tail
+    "1:"  // Channel loop
+    "movprfx z21, z14\n fmla z21.h, p3/M, z4.h, z9.h\n"  // z16-z31 accumulate the 16 outputs, each seeded with the bias in z14
+    "movprfx z16, z14\n fmla z16.h, p3/M, z8.h, z9.h\n"
+    "ldr x27, [x16, #0x20]\n"
+    "inch x28\n"
+    "movprfx z22, z14\n fmla z22.h, p3/M, z3.h, z9.h\n"
+    "movprfx z25, z14\n fmla z25.h, p3/M, z1.h, z9.h\n"
+    "ldr x26, [x16, #0x30]\n"
+    "mov p1.b, p2.b\n"
+    "movprfx z26, z14\n fmla z26.h, p3/M, z0.h, z9.h\n"
+    "ldr x25, [x16, #0x28]\n"
+    "movprfx z17, z14\n fmla z17.h, p3/M, z7.h, z9.h\n"
+    "whilelt p0.h, x10, %x[n_channels]\n"
+    "movprfx z18, z14\n fmla z18.h, p3/M, z6.h, z9.h\n"
+    "movprfx z20, z14\n fmla z20.h, p3/M, z5.h, z9.h\n"
+    "ldr x24, [x16, #0x38]\n"
+    "fmla z21.h, p3/M, z5.h, z12.h\n"
+    "movprfx z24, z14\n fmla z24.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x14, [x16, #0x40]\n"
+    "fmla z16.h, p3/M, z0.h, z10.h\n"
+    "movprfx z19, z14\n fmla z19.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x13, [x16, #0x48]\n"
+    "fmla z22.h, p3/M, z4.h, z12.h\n"
+    "fmla z25.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x12, [x16, #0x50]\n"
+    "fmla z26.h, p3/M, z1.h, z12.h\n"
+    "fmla z17.h, p3/M, z8.h, z12.h\n"
+    "ldr x27, [x16, #0x60]\n"
+    "fmla z18.h, p3/M, z7.h, z12.h\n"
+    "movprfx z28, z14\n fmla z28.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla z21.h, p3/M, z7.h, z9.h\n"
+    "fmla z19.h, p3/M, z6.h, z12.h\n"
+    "ldr x11, [x16, #0x58]\n"
+    "movprfx z23, z14\n fmla z23.h, p3/M, z3.h, z12.h\n"
+    "movprfx z27, z14\n fmla z27.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ldr x26, [x16, #0x70]\n"
+    "movprfx z31, z14\n fmla z31.h, p3/M, z8.h, z11.h\n"
+    "fmla z22.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "ldr x24, [x16, #0x78]\n"
+    "fmla z25.h, p3/M, z4.h, z9.h\n"
+    "fmla z26.h, p3/M, z3.h, z9.h\n"
+    "ldr x14, [x16, #0x80]\n"
+    "movprfx z29, z14\n fmla z29.h, p3/M, z1.h, z9.h\n"
+    "movprfx z30, z14\n fmla z30.h, p3/M, z0.h, z9.h\n"
+    "ldr x13, [x16, #0x88]\n"
+    "ld1h { z14.h }, p3/Z, [x17]\n"  // reload bias for the next channel block
+    "fmla z20.h, p3/M, z8.h, z9.h\n"
+    "fmla z24.h, p3/M, z5.h, z9.h\n"
+    "ldr x23, [x9, #0x0]\n"
+    "addvl x17, x17, #1\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "fmla z16.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z9.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldr x12, [x16, #0x90]\n"
+    "fmla z17.h, p3/M, z0.h, z12.h\n"
+    "fmla z18.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "ldr x11, [x16, #0x98]\n"
+    "fmla z21.h, p3/M, z8.h, z10.h\n"
+    "fmla z19.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x27, [x16, #0xa0]\n"
+    "fmla z22.h, p3/M, z7.h, z10.h\n"
+    "fmla z23.h, p3/M, z6.h, z10.h\n"
+    "ldr x22, [x9, #0x8]\n"
+    "fmla z25.h, p3/M, z5.h, z10.h\n"
+    "fmla z26.h, p3/M, z4.h, z10.h\n"
+    "ldr x21, [x9, #0x10]\n"
+    "fmla z27.h, p3/M, z3.h, z10.h\n"
+    "fmla z29.h, p3/M, z2.h, z10.h\n"
+    "ldr x20, [x9, #0x18]\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla z16.h, p3/M, z3.h, z9.h\n"
+    "fmla z20.h, p3/M, z0.h, z9.h\n"
+    "fmla z24.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x26, [x16, #0xb0]\n"
+    "fmla z17.h, p3/M, z4.h, z10.h\n"
+    "fmla z18.h, p3/M, z3.h, z10.h\n"
+    "fmla z21.h, p3/M, z1.h, z10.h\n"
+    "fmla z19.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z2.h, z12.h\n"
+    "fmla z22.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ldr x24, [x16, #0xb8]\n"
+    "fmla z27.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "ldr x14, [x16, #0xc0]\n"
+    "fmla z16.h, p3/M, z5.h, z10.h\n"
+    "fmla z20.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ldr x13, [x16, #0xc8]\n"
+    "fmla z17.h, p3/M, z5.h, z12.h\n"
+    "fmla z18.h, p3/M, z4.h, z12.h\n"
+    "fmla z21.h, p3/M, z2.h, z12.h\n"
+    "fmla z19.h, p3/M, z3.h, z12.h\n"
+    "fmla z22.h, p3/M, z1.h, z12.h\n"
+    "fmla z23.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "ldr x11, [x16, #0xd8]\n"
+    "fmla z28.h, p3/M, z7.h, z11.h\n"
+    "fmla z29.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldr x12, [x16, #0xd0]\n"
+    "fmla z16.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z6.h, z10.h\n"
+    "fmla z20.h, p3/M, z4.h, z10.h\n"
+    "fmla z21.h, p3/M, z3.h, z10.h\n"
+    "fmla z24.h, p3/M, z1.h, z10.h\n"
+    "fmla z25.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x27, [x16, #0xe0]\n"
+    "fmla z18.h, p3/M, z8.h, z12.h\n"
+    "fmla z30.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z1.h, z12.h\n"
+    "ldr x25, [x16, #0xe8]\n"
+    "fmla z19.h, p3/M, z7.h, z12.h\n"
+    "fmla z22.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z4.h, z12.h\n"
+    "fmla z26.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x26, [x16, #0xf0]\n"
+    "fmla z16.h, p3/M, z2.h, z10.h\n"
+    "fmla z17.h, p3/M, z1.h, z10.h\n"
+    "fmla z18.h, p3/M, z0.h, z10.h\n"
+    "fmla z20.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ldr x24, [x16, #0xf8]\n"
+    "fmla z21.h, p3/M, z6.h, z11.h\n"
+    "fmla z24.h, p3/M, z4.h, z11.h\n"
+    "fmla z25.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z17.h, p3/M, z2.h, z12.h\n"
+    "ldr x14, [x16, #0x100]\n"
+    "fmla z18.h, p3/M, z1.h, z12.h\n"
+    "fmla z19.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ldr x13, [x16, #0x108]\n"
+    "fmla z16.h, p3/M, z6.h, z10.h\n"
+    "fmla z20.h, p3/M, z3.h, z10.h\n"
+    "fmla z27.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z24.h, p3/M, z0.h, z10.h\n"
+    "fmla z22.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldr x12, [x16, #0x110]\n"
+    "fmla z23.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z5.h, z11.h\n"
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z19.h, p3/M, z8.h, z12.h\n"
+    "ldr x11, [x16, #0x118]\n"
+    "fmla z27.h, p3/M, z2.h, z12.h\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "fmla z20.h, p3/M, z6.h, z10.h\n"
+    "fmla z24.h, p3/M, z3.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z23.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "fmla z25.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z5.h, z11.h\n"
+    "fmla z24.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z5.h, z12.h\n"
+    "fmla z31.h, p3/M, z2.h, z12.h\n"
+    "fmla z29.h, p3/M, z7.h, z10.h\n"
+    "fmla z30.h, p3/M, z6.h, z10.h\n"
+    "fmla z23.h, p3/M, z8.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z8.h, z10.h\n"
+    "fmla z25.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z26.h, p3/M, z7.h, z11.h\n"
+    "fmla z27.h, p3/M, z6.h, z11.h\n"
+    "fmla z29.h, p3/M, z5.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "fmla z16.h, p3/M, z4.h, z10.h\n"
+    "ldp x14, x13, [x16, #0x0]\n"  // re-seed input pointers and preload the first inputs of the next block
+    "fmla z17.h, p3/M, z3.h, z10.h\n"
+    "fmla z20.h, p3/M, z1.h, z10.h\n"
+    "fmla z21.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z18.h, p3/M, z5.h, z11.h\n"
+    "fmla z19.h, p3/M, z4.h, z11.h\n"
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "ld1h { z9.h }, p0/Z, [x14, x10, LSL #1]\n"
+    "fmla z30.h, p3/M, z7.h, z12.h\n"
+    "fmla z31.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldp x12, x11, [x16, #0x10]\n"
+    "fmla z22.h, p3/M, z2.h, z11.h\n"
+    "fmla z23.h, p3/M, z1.h, z11.h\n"
+    "inch x15\n"
+    "fmla z26.h, p3/M, z8.h, z10.h\n"
+    "fmla z27.h, p3/M, z7.h, z10.h\n"
+    "whilelt p2.h, x15, %x[n_channels]\n"
+    ".inst 0xc16dc9f0  // fclamp { z16.h-z19.h }, z15.h, z13.h\n"  // apply activation clamp to [min, max]
+    "fmla z24.h, p3/M, z7.h, z12.h\n"
+    "ld1h { z11.h }, p0/Z, [x12, x10, LSL #1]\n"
+    "fmla z25.h, p3/M, z6.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    ".inst 0xc16dc9f4  // fclamp { z20.h-z23.h }, z15.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z12.h }, p0/Z, [x11, x10, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z10.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p0/Z, [x13, x10, LSL #1]\n"
+    "inch x10\n"
+    "st1h { z16.h }, p1, [x23, x28, LSL #1]\n"  // store the 16 outputs through the pointers held at [x9]
+    "ldr x23, [x9, #0x20]\n"
+    ".inst 0xa040a220  // ld1h { z0.h-z3.h }, pn8.b/Z, [x17]\n"  // reload weights for the next channel block
+    "addvl x17, x17, #4\n"
+    "st1h { z17.h }, p1, [x22, x28, LSL #1]\n"
+    "ldr x22, [x9, #0x28]\n"
+    ".inst 0xc16dc9f8  // fclamp { z24.h-z27.h }, z15.h, z13.h\n"
+    ".inst 0xa040a224  // ld1h { z4.h-z7.h }, pn8.b/Z, [x17]\n"
+    "st1h { z18.h }, p1, [x21, x28, LSL #1]\n"
+    "ldr x21, [x9, #0x30]\n"
+    "addvl x17, x17, #4\n"
+    "cmp x10, %x[n_channels]\n"
+    "st1h { z19.h }, p1, [x20, x28, LSL #1]\n"
+    "ldr x20, [x9, #0x38]\n"
+    ".inst 0xc16dc9fc  // fclamp { z28.h-z31.h }, z15.h, z13.h\n"
+    "ld1h { z8.h }, p3/Z, [x17]\n"
+    "st1h { z20.h }, p1, [x23, x28, LSL #1]\n"
+    "ldr x23, [x9, #0x40]\n"
+    "addvl x17, x17, #1\n"
+    "st1h { z21.h }, p1, [x22, x28, LSL #1]\n"
+    "ldr x22, [x9, #0x48]\n"
+    "st1h { z22.h }, p1, [x21, x28, LSL #1]\n"
+    "ldr x21, [x9, #0x50]\n"
+    "st1h { z23.h }, p1, [x20, x28, LSL #1]\n"
+    "ldr x20, [x9, #0x58]\n"
+    "st1h { z24.h }, p1, [x23, x28, LSL #1]\n"
+    "ldr x23, [x9, #0x60]\n"
+    "st1h { z25.h }, p1, [x22, x28, LSL #1]\n"
+    "ldr x22, [x9, #0x68]\n"
+    "st1h { z26.h }, p1, [x21, x28, LSL #1]\n"
+    "ldr x21, [x9, #0x70]\n"
+    "st1h { z27.h }, p1, [x20, x28, LSL #1]\n"
+    "ldr x20, [x9, #0x78]\n"
+    "st1h { z28.h }, p1, [x23, x28, LSL #1]\n"
+    "st1h { z29.h }, p1, [x22, x28, LSL #1]\n"
+    "st1h { z30.h }, p1, [x21, x28, LSL #1]\n"
+    "st1h { z31.h }, p1, [x20, x28, LSL #1]\n"
+    "blt 1b\n"  // loop while at least one more full channel block remains
+    "2:"  // Channel tail: same computation as the loop body, without the next-block preloads
+    "movprfx z21, z14\n fmla z21.h, p3/M, z4.h, z9.h\n"
+    "movprfx z16, z14\n fmla z16.h, p3/M, z8.h, z9.h\n"
+    "ldr x27, [x16, #0x20]\n"
+    "inch x28\n"
+    "movprfx z22, z14\n fmla z22.h, p3/M, z3.h, z9.h\n"
+    "movprfx z25, z14\n fmla z25.h, p3/M, z1.h, z9.h\n"
+    "ldr x26, [x16, #0x30]\n"
+    "mov p1.b, p2.b\n"
+    "movprfx z26, z14\n fmla z26.h, p3/M, z0.h, z9.h\n"
+    "ldr x25, [x16, #0x28]\n"
+    "movprfx z17, z14\n fmla z17.h, p3/M, z7.h, z9.h\n"
+    "movprfx z18, z14\n fmla z18.h, p3/M, z6.h, z9.h\n"
+    "movprfx z20, z14\n fmla z20.h, p3/M, z5.h, z9.h\n"
+    "ldr x24, [x16, #0x38]\n"
+    "fmla z21.h, p3/M, z5.h, z12.h\n"
+    "movprfx z24, z14\n fmla z24.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x14, [x16, #0x40]\n"
+    "fmla z16.h, p3/M, z0.h, z10.h\n"
+    "movprfx z19, z14\n fmla z19.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x13, [x16, #0x48]\n"
+    "fmla z22.h, p3/M, z4.h, z12.h\n"
+    "fmla z25.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x12, [x16, #0x50]\n"
+    "fmla z26.h, p3/M, z1.h, z12.h\n"
+    "fmla z17.h, p3/M, z8.h, z12.h\n"
+    "ldr x27, [x16, #0x60]\n"
+    "fmla z18.h, p3/M, z7.h, z12.h\n"
+    "movprfx z28, z14\n fmla z28.h, p3/M, z6.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ldr x25, [x16, #0x68]\n"
+    "fmla z21.h, p3/M, z7.h, z9.h\n"
+    "fmla z19.h, p3/M, z6.h, z12.h\n"
+    "ldr x11, [x16, #0x58]\n"
+    "movprfx z23, z14\n fmla z23.h, p3/M, z3.h, z12.h\n"
+    "movprfx z27, z14\n fmla z27.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ldr x26, [x16, #0x70]\n"
+    "movprfx z31, z14\n fmla z31.h, p3/M, z8.h, z11.h\n"
+    "fmla z22.h, p3/M, z6.h, z9.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "ldr x24, [x16, #0x78]\n"
+    "fmla z25.h, p3/M, z4.h, z9.h\n"
+    "fmla z26.h, p3/M, z3.h, z9.h\n"
+    "ldr x14, [x16, #0x80]\n"
+    "movprfx z29, z14\n fmla z29.h, p3/M, z1.h, z9.h\n"
+    "movprfx z30, z14\n fmla z30.h, p3/M, z0.h, z9.h\n"
+    "ldr x13, [x16, #0x88]\n"
+    "fmla z20.h, p3/M, z8.h, z9.h\n"
+    "fmla z24.h, p3/M, z5.h, z9.h\n"
+    "ldr x23, [x9, #0x0]\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "fmla z16.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z9.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldr x12, [x16, #0x90]\n"
+    "fmla z17.h, p3/M, z0.h, z12.h\n"
+    "fmla z18.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "ldr x11, [x16, #0x98]\n"
+    "fmla z21.h, p3/M, z8.h, z10.h\n"
+    "fmla z19.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x27, [x16, #0xa0]\n"
+    "fmla z22.h, p3/M, z7.h, z10.h\n"
+    "fmla z23.h, p3/M, z6.h, z10.h\n"
+    "ldr x22, [x9, #0x8]\n"
+    "fmla z25.h, p3/M, z5.h, z10.h\n"
+    "fmla z26.h, p3/M, z4.h, z10.h\n"
+    "ldr x21, [x9, #0x10]\n"
+    "fmla z27.h, p3/M, z3.h, z10.h\n"
+    "fmla z29.h, p3/M, z2.h, z10.h\n"
+    "ldr x20, [x9, #0x18]\n"
+    "fmla z30.h, p3/M, z1.h, z10.h\n"
+    "fmla z31.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x25, [x16, #0xa8]\n"
+    "fmla z16.h, p3/M, z3.h, z9.h\n"
+    "fmla z20.h, p3/M, z0.h, z9.h\n"
+    "fmla z24.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x26, [x16, #0xb0]\n"
+    "fmla z17.h, p3/M, z4.h, z10.h\n"
+    "fmla z18.h, p3/M, z3.h, z10.h\n"
+    "fmla z21.h, p3/M, z1.h, z10.h\n"
+    "fmla z19.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z2.h, z12.h\n"
+    "fmla z22.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ldr x24, [x16, #0xb8]\n"
+    "fmla z27.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "ldr x14, [x16, #0xc0]\n"
+    "fmla z16.h, p3/M, z5.h, z10.h\n"
+    "fmla z20.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ldr x13, [x16, #0xc8]\n"
+    "fmla z17.h, p3/M, z5.h, z12.h\n"
+    "fmla z18.h, p3/M, z4.h, z12.h\n"
+    "fmla z21.h, p3/M, z2.h, z12.h\n"
+    "fmla z19.h, p3/M, z3.h, z12.h\n"
+    "fmla z22.h, p3/M, z1.h, z12.h\n"
+    "fmla z23.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "ldr x11, [x16, #0xd8]\n"
+    "fmla z28.h, p3/M, z7.h, z11.h\n"
+    "fmla z29.h, p3/M, z6.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldr x12, [x16, #0xd0]\n"
+    "fmla z16.h, p3/M, z7.h, z10.h\n"
+    "fmla z17.h, p3/M, z6.h, z10.h\n"
+    "fmla z20.h, p3/M, z4.h, z10.h\n"
+    "fmla z21.h, p3/M, z3.h, z10.h\n"
+    "fmla z24.h, p3/M, z1.h, z10.h\n"
+    "fmla z25.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x27, [x16, #0xe0]\n"
+    "fmla z18.h, p3/M, z8.h, z12.h\n"
+    "fmla z30.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z1.h, z12.h\n"
+    "ldr x25, [x16, #0xe8]\n"
+    "fmla z19.h, p3/M, z7.h, z12.h\n"
+    "fmla z22.h, p3/M, z5.h, z12.h\n"
+    "fmla z23.h, p3/M, z4.h, z12.h\n"
+    "fmla z26.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x26, [x16, #0xf0]\n"
+    "fmla z16.h, p3/M, z2.h, z10.h\n"
+    "fmla z17.h, p3/M, z1.h, z10.h\n"
+    "fmla z18.h, p3/M, z0.h, z10.h\n"
+    "fmla z20.h, p3/M, z7.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ldr x24, [x16, #0xf8]\n"
+    "fmla z21.h, p3/M, z6.h, z11.h\n"
+    "fmla z24.h, p3/M, z4.h, z11.h\n"
+    "fmla z25.h, p3/M, z3.h, z11.h\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z17.h, p3/M, z2.h, z12.h\n"
+    "ldr x14, [x16, #0x100]\n"
+    "fmla z18.h, p3/M, z1.h, z12.h\n"
+    "fmla z19.h, p3/M, z0.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "ldr x13, [x16, #0x108]\n"
+    "fmla z16.h, p3/M, z6.h, z10.h\n"
+    "fmla z20.h, p3/M, z3.h, z10.h\n"
+    "fmla z27.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z2.h, z11.h\n"
+    "fmla z24.h, p3/M, z0.h, z10.h\n"
+    "fmla z22.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ldr x12, [x16, #0x110]\n"
+    "fmla z23.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z5.h, z11.h\n"
+    "fmla z31.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z19.h, p3/M, z8.h, z12.h\n"
+    "ldr x11, [x16, #0x118]\n"
+    "fmla z27.h, p3/M, z2.h, z12.h\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "fmla z20.h, p3/M, z6.h, z10.h\n"
+    "fmla z24.h, p3/M, z3.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z23.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "fmla z25.h, p3/M, z7.h, z11.h\n"
+    "fmla z26.h, p3/M, z6.h, z11.h\n"
+    "fmla z28.h, p3/M, z5.h, z11.h\n"
+    "fmla z24.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z5.h, z12.h\n"
+    "fmla z31.h, p3/M, z2.h, z12.h\n"
+    "fmla z29.h, p3/M, z7.h, z10.h\n"
+    "fmla z30.h, p3/M, z6.h, z10.h\n"
+    "fmla z23.h, p3/M, z8.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z8.h, z10.h\n"
+    "fmla z25.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z26.h, p3/M, z7.h, z11.h\n"
+    "fmla z27.h, p3/M, z6.h, z11.h\n"
+    "fmla z29.h, p3/M, z5.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z11.h\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "fmla z16.h, p3/M, z4.h, z10.h\n"
+    "fmla z17.h, p3/M, z3.h, z10.h\n"
+    "fmla z20.h, p3/M, z1.h, z10.h\n"
+    "fmla z21.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z18.h, p3/M, z5.h, z11.h\n"
+    "fmla z19.h, p3/M, z4.h, z11.h\n"
+    "fmla z29.h, p3/M, z8.h, z12.h\n"
+    "fmla z30.h, p3/M, z7.h, z12.h\n"
+    "fmla z31.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "fmla z22.h, p3/M, z2.h, z11.h\n"
+    "fmla z23.h, p3/M, z1.h, z11.h\n"
+    "fmla z26.h, p3/M, z8.h, z10.h\n"
+    "fmla z27.h, p3/M, z7.h, z10.h\n"
+    ".inst 0xc16dc9f0  // fclamp { z16.h-z19.h }, z15.h, z13.h\n"  // activation clamp, as in the main loop
+    "fmla z24.h, p3/M, z7.h, z12.h\n"
+    "fmla z25.h, p3/M, z6.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    ".inst 0xc16dc9f4  // fclamp { z20.h-z23.h }, z15.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "fmla z30.h, p3/M, z5.h, z10.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "st1h { z16.h }, p1, [x23, x28, LSL #1]\n"
+    "ldr x23, [x9, #0x20]\n"
+    "st1h { z17.h }, p1, [x22, x28, LSL #1]\n"
+    "ldr x22, [x9, #0x28]\n"
+    "st1h { z18.h }, p1, [x21, x28, LSL #1]\n"
+    "ldr x21, [x9, #0x30]\n"
+    ".inst 0xc16dc9f8  // fclamp { z24.h-z27.h }, z15.h, z13.h\n"
+    "st1h { z19.h }, p1, [x20, x28, LSL #1]\n"
+    "ldr x20, [x9, #0x38]\n"
+    "st1h { z20.h }, p1, [x23, x28, LSL #1]\n"
+    "ldr x23, [x9, #0x40]\n"
+    ".inst 0xc16dc9fc  // fclamp { z28.h-z31.h }, z15.h, z13.h\n"
+    "st1h { z21.h }, p1, [x22, x28, LSL #1]\n"
+    "ldr x22, [x9, #0x48]\n"
+    "st1h { z22.h }, p1, [x21, x28, LSL #1]\n"
+    "ldr x21, [x9, #0x50]\n"
+    "st1h { z23.h }, p1, [x20, x28, LSL #1]\n"
+    "ldr x20, [x9, #0x58]\n"
+    "st1h { z24.h }, p1, [x23, x28, LSL #1]\n"
+    "ldr x23, [x9, #0x60]\n"
+    "st1h { z25.h }, p1, [x22, x28, LSL #1]\n"
+    "ldr x22, [x9, #0x68]\n"
+    "st1h { z26.h }, p1, [x21, x28, LSL #1]\n"
+    "ldr x21, [x9, #0x70]\n"
+    "st1h { z27.h }, p1, [x20, x28, LSL #1]\n"
+    "ldr x20, [x9, #0x78]\n"
+    "st1h { z28.h }, p1, [x23, x28, LSL #1]\n"
+    "st1h { z29.h }, p1, [x22, x28, LSL #1]\n"
+    "st1h { z30.h }, p1, [x21, x28, LSL #1]\n"
+    "st1h { z31.h }, p1, [x20, x28, LSL #1]\n"
+    ".inst 0xd503467f  // SMSTOP\n"  // leave streaming mode before returning to C++
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..27fcb2e6d2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>  // Strategy descriptor: FP16 NHWC 3x3 stride-2 depthwise producing a 2x2 output tile per kernel call (SME2)
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;  // variant fed by an array of per-point input pointers
+  Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;  // variant walking a dense input via row/column strides
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // vector-length class used by the implementation-selection machinery
+
+  constexpr static unsigned int kernel_rows = 3;  // fixed 3x3 filter geometry for this specialisation
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;  // each kernel invocation computes a 2x2 spatial output tile
+  constexpr static unsigned int output_cols = 2;
+
+  sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)  // CPUInfo is accepted for interface uniformity but unused: geometry is fixed
+    : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }  // accessors let the generic driver dispatch without knowing the concrete type
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..066ce06aa6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (tile-loop) variant of the SME2 fp16 3x3 stride-2 depthwise kernel.
+// Walks an n_tile_rows x n_tile_cols grid of 2x2 output tiles; all input and
+// output addressing is derived inside the assembly from the strides captured
+// in the Args block below.  min/max are the activation clamp bounds applied
+// by the fclamp before each store.
+void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  // Argument block read by the assembly via offsetof(); the asm also writes
+  // tile_i/tile_j back into it as it advances across the tile grid.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;  // activation clamp bounds
+
+    uint64_t tile_i = 0, tile_j = 0;  // current tile coordinates, maintained by the assembly
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const __fp16 activation_min,  // was float: take __fp16 directly, matching the members, the caller and the indirect kernel
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  // Generated assembly: enters SME streaming mode (SMSTART), loops over tiles,
+  // and per tile runs a vector-length-strided channel loop plus a predicated
+  // channel tail before exiting streaming mode (SMSTOP).
+  __asm__ __volatile__(
+    ".inst 0xd503477f // SMSTART ZA\n"
+    "mov x2, #0x0\n"
+    "mov x3, #0x0\n"
+    "ptrue p3.b\n"
+    ".inst 0x25207810 // ptrue pn8.b\n"
+    "1:" // Tile loop
+    "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x22, #0x4\n"
+    "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+    "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+    "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+    "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+    "add x7, x4, x4\n"
+    "add x5, x5, x20, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "add x8, x5, x21, LSL #1\n"
+    "add x17, x7, x4\n"
+    "add x16, x8, x21, LSL #1\n"
+    "add x15, x17, x4\n"
+    "add x14, x16, x21, LSL #1\n"
+    "add x13, x14, x21, LSL #1\n"
+    "cbnz x3, 2f\n"
+    "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "lsl x12, %x[n_channels], #0x1\n"
+    "mov x28, #0x8\n"
+    "mul x28, x28, x4\n"
+    "add x27, x16, x7, LSL #1\n"
+    "add x26, x5, x4, LSL #1\n"
+    "add x25, x5, x17, LSL #1\n"
+    "sub x20, x24, x3\n"
+    "add x24, x5, x15, LSL #1\n"
+    "sub x20, x20, #0x1\n"
+    "add x23, x8, x4, LSL #1\n"
+    "and x20, x20, #0x3fffff\n"
+    "add x22, x5, x7, LSL #1\n"
+    "orr x12, x12, x20, LSL #22\n"
+    "add x21, x8, x17, LSL #1\n"
+    "orr x12, x12, x28, LSL #38\n"
+    "add x20, x8, x15, LSL #1\n"
+    "add x11, x8, x7, LSL #1\n"
+    "add x10, x14, x4, LSL #1\n"
+    "add x9, x16, x4, LSL #1\n"
+    "add x28, x14, x17, LSL #1\n"
+    ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+    "add x27, x16, x17, LSL #1\n"
+    ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+    ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+    "add x26, x14, x15, LSL #1\n"
+    ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+    "add x25, x16, x15, LSL #1\n"
+    ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+    "add x24, x13, x4, LSL #1\n"
+    ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+    ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+    "add x23, x14, x7, LSL #1\n"
+    ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+    "add x22, x13, x17, LSL #1\n"
+    ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+    "add x21, x13, x7, LSL #1\n"
+    ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+    "add x20, x13, x15, LSL #1\n"
+    ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+    ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+    ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+    ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+    ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+    ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+    ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+    ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+    ".inst 0xf8ac49ba // rprfm pldonce, x12, [x13]\n"
+    ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+    ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+    ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+    ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+    ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+    ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+    "2:" // Tile loop: Prefetch input rows: End
+    "ldr x26, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "mov x20, #0x2\n"
+    "ld1h { z19.h }, p3/Z, [x6]\n"
+    "addvl x6, x6, #1\n"
+    "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "cnth x24\n"
+    ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+    "addvl x6, x6, #4\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+    "addvl x6, x6, #4\n"
+    "mul x22, x2, x26\n" // offset = tile_i * ld_output_row
+    "cmp x24, %x[n_channels]\n"
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "madd x22, x3, x25, x22\n" // offset += tile_j * ld_output_col
+    "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "mov x21, #0x0\n"
+    "mul x22, x22, x20\n" // offset *= output_tile_size
+    "sub x20, XZR, x24\n"
+    "ld1h { z8.h }, p3/Z, [x6]\n"
+    "add x23, x23, x22, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "ld1h { z9.h }, p2/Z, [x16, x7, LSL #1]\n"
+    "addvl x6, x6, #1\n"
+    "add x22, x23, x26, LSL #1\n"
+    "ld1h { z10.h }, p2/Z, [x5]\n"
+    "ld1h { z11.h }, p2/Z, [x5, x4, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x5, x17, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x5, x15, LSL #1]\n"
+    "ld1h { z14.h }, p2/Z, [x8]\n"
+    "ld1h { z15.h }, p2/Z, [x8, x4, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x5, x7, LSL #1]\n"
+    "bge 4f\n"
+    "3:" // Tile loop: Channel loop
+    "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
+    "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+    "whilelt p1.h, x24, %x[n_channels]\n"
+    "inch x21\n"
+    "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
+    "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z19.h }, p3/Z, [x6]\n"
+    "addvl x6, x6, #1\n"
+    "inch x24\n"
+    "mov p0.b, p2.b\n"
+    "addvl x5, x5, #1\n"
+    "inch x20\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x8, x15, LSL #1]\n"
+    "ld1h { z10.h }, p1/Z, [x5]\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+    "addvl x8, x8, #1\n"
+    "fmla z28.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x14]\n"
+    "fmla z29.h, p3/M, z0.h, z16.h\n"
+    "fmla z28.h, p3/M, z4.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x16]\n"
+    "fmla z30.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x4, LSL #1]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x16, x4, LSL #1]\n"
+    "fmla z30.h, p3/M, z0.h, z15.h\n"
+    "fmla z29.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x16, x17, LSL #1]\n"
+    "fmla z28.h, p3/M, z5.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+    "addvl x16, x16, #1\n"
+    "ld1h { z9.h }, p1/Z, [x16, x7, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z13.h\n"
+    "fmla z28.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x13]\n"
+    "fmla z29.h, p3/M, z7.h, z12.h\n"
+    "fmla z30.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x5, x17, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x14, x7, LSL #1]\n"
+    "addvl x14, x14, #1\n"
+    "fmla z30.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x13, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x13, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z7.h, z13.h\n"
+    "ld1h { z13.h }, p1/Z, [x5, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "whilelt p2.h, x21, %x[n_channels]\n"
+    "cmp x24, %x[n_channels]\n"
+    "addvl x13, x13, #1\n"
+    "fmla z30.h, p3/M, z5.h, z16.h\n"
+    "fmla z31.h, p3/M, z3.h, z16.h\n"
+    ".inst 0xa040a0c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+    "addvl x6, x6, #4\n"
+    "ld1h { z16.h }, p1/Z, [x5, x7, LSL #1]\n"
+    "fmla z31.h, p3/M, z7.h, z14.h\n"
+    "ld1h { z14.h }, p1/Z, [x8]\n"
+    "fmla z30.h, p3/M, z8.h, z15.h\n"
+    "fmla z31.h, p3/M, z6.h, z15.h\n"
+    ".inst 0xa040a0c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x6]\n"
+    "addvl x6, x6, #4\n"
+    "ld1h { z15.h }, p1/Z, [x8, x4, LSL #1]\n"
+    "fmla z31.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x5, x4, LSL #1]\n"
+    "ld1h { z8.h }, p3/Z, [x6]\n"
+    "addvl x6, x6, #1\n"
+    ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
+    "st1h { z28.h }, p0, [x23]\n"
+    "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+    "addvl x23, x23, #1\n"
+    "st1h { z30.h }, p0, [x22]\n"
+    "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+    "addvl x22, x22, #1\n"
+    "blt 3b\n"
+    "4:" // Tile loop: Channel tail
+    "movprfx z28, z19\n fmla z28.h, p3/M, z8.h, z9.h\n"
+    "movprfx z29, z19\n fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "mov p0.b, p2.b\n"
+    "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z9.h\n"
+    "movprfx z31, z19\n fmla z31.h, p3/M, z0.h, z9.h\n"
+    "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "ldr x24, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x3, x3, #0x1\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x8, x15, LSL #1]\n"
+    "add x20, x2, #0x1\n"
+    "cmp x3, x24\n"
+    "csel x2, x2, x20, LT\n"
+    "csel x3, x3, XZR, LT\n"
+    "cmp x2, x21\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x8, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+    "fmla z28.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x14]\n"
+    "fmla z29.h, p3/M, z0.h, z16.h\n"
+    "fmla z28.h, p3/M, z4.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x16]\n"
+    "fmla z30.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x4, LSL #1]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x16, x4, LSL #1]\n"
+    "fmla z30.h, p3/M, z0.h, z15.h\n"
+    "fmla z29.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x16, x17, LSL #1]\n"
+    "fmla z28.h, p3/M, z5.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x16, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x13, x4, LSL #1]\n"
+    "fmla z28.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x13]\n"
+    "fmla z29.h, p3/M, z7.h, z12.h\n"
+    "fmla z30.h, p3/M, z1.h, z16.h\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "fmla z28.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x14, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x13, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x13, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z7.h, z13.h\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x13, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z16.h\n"
+    "fmla z31.h, p3/M, z3.h, z16.h\n"
+    "fmla z30.h, p3/M, z8.h, z15.h\n"
+    "fmla z31.h, p3/M, z7.h, z14.h\n"
+    "fmla z31.h, p3/M, z6.h, z15.h\n"
+    "fmla z31.h, p3/M, z8.h, z11.h\n"
+    ".inst 0xc171ca5c // fclamp { z28.h-z31.h }, z18.h, z17.h\n"
+    "st1h { z28.h }, p0, [x23]\n"
+    "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+    "st1h { z30.h }, p0, [x22]\n"
+    "st1h { z31.h }, p0, [x22, x25, LSL #1]\n"
+    "blt 1b\n"
+    ".inst 0xd503467f // SMSTOP\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..1bf3a84959
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Indirect (pointer-array) variant of the SME2 fp16 3x3 stride-2 depthwise
+// kernel: each of the 25 input pointers addresses one element of the padded
+// input patch feeding a 2x2 output tile; min/max are the activation clamp
+// bounds applied by the fclamp before each store.
+void sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  // Argument block read by the assembly via offsetof().
+  struct Args
+  {
+    __fp16 *const *outptrs;       // four output pointers (2x2 tile)
+    const void *params;           // packed bias + weights, consumed sequentially by the asm
+    const __fp16 min, max;        // activation clamp bounds
+    const __fp16 *inptrs[25];     // input pointers, re-ordered for the asm's consumption order
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      // Permute the caller's row-major 5x5 patch pointers into the order in
+      // which the assembly loads them (most-reused element first).
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[1];
+      inptrs[3] = input_ptrs[3];
+      inptrs[4] = input_ptrs[4];
+      inptrs[5] = input_ptrs[5];
+      inptrs[6] = input_ptrs[6];
+      inptrs[7] = input_ptrs[2];
+      inptrs[8] = input_ptrs[8];
+      inptrs[9] = input_ptrs[9];
+      inptrs[10] = input_ptrs[7];
+      inptrs[11] = input_ptrs[15];
+      inptrs[12] = input_ptrs[10];
+      inptrs[13] = input_ptrs[16];
+      inptrs[14] = input_ptrs[11];
+      inptrs[15] = input_ptrs[18];
+      inptrs[16] = input_ptrs[13];
+      inptrs[17] = input_ptrs[19];
+      inptrs[18] = input_ptrs[20];
+      inptrs[19] = input_ptrs[14];
+      inptrs[20] = input_ptrs[21];
+      inptrs[21] = input_ptrs[17];
+      inptrs[22] = input_ptrs[23];
+      inptrs[23] = input_ptrs[22];
+      inptrs[24] = input_ptrs[24];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  // Generated assembly: SME streaming-mode channel loop (vector-length strided)
+  // followed by a predicated channel tail; do not reorder by hand.
+  __asm__ __volatile__(
+    "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    ".inst 0xd503477f // SMSTART ZA\n"
+    "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "mov x15, #0x0\n"
+    "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+    "ptrue p3.b\n"
+    ".inst 0x25207810 // ptrue pn8.b\n"
+    "cnth x13\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "ldp x12, x11, [x20, #0x0]\n"
+    "ldp x10, x9, [x20, #0x10]\n"
+    "cmp x13, %x[n_channels]\n"
+    "ld1rh { z18.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "sub x28, XZR, x13\n"
+    "ld1h { z17.h }, p3/Z, [x14]\n"
+    "addvl x14, x14, #1\n"
+    "ldp x27, x26, [x16, #0x0]\n"
+    "ldp x25, x24, [x16, #0x10]\n"
+    ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+    "addvl x14, x14, #4\n"
+    "ldp x23, x22, [x16, #0x20]\n"
+    ".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
+    "addvl x14, x14, #4\n"
+    "ldp x21, x20, [x16, #0x30]\n"
+    "ld1h { z8.h }, p3/Z, [x14]\n"
+    "addvl x14, x14, #1\n"
+    "ld1h { z9.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ld1h { z11.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "ld1h { z14.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ld1h { z15.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "bge 2f\n"
+    "1:" // Channel loop
+    "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z9.h\n"
+    "movprfx z29, z17\n fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ldr x27, [x16, #0x40]\n"
+    "whilelt p1.h, x13, %x[n_channels]\n"
+    "ldr x26, [x16, #0x48]\n"
+    "movprfx z30, z17\n fmla z30.h, p3/M, z2.h, z9.h\n"
+    "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z17.h }, p3/Z, [x14]\n"
+    "ldr x25, [x16, #0x50]\n"
+    "addvl x14, x14, #1\n"
+    "inch x28\n"
+    "ldr x24, [x16, #0x58]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x20, [x16, #0x78]\n"
+    "ldr x23, [x16, #0x60]\n"
+    "ldr x22, [x16, #0x68]\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x27, [x16, #0x80]\n"
+    "ldr x26, [x16, #0x88]\n"
+    "ldr x21, [x16, #0x70]\n"
+    "fmla z28.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z16.h\n"
+    "ldr x24, [x16, #0x98]\n"
+    "ldr x25, [x16, #0x90]\n"
+    "fmla z30.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z4.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "ldr x23, [x16, #0xa0]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ldr x22, [x16, #0xa8]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "ldr x21, [x16, #0xb0]\n"
+    "fmla z30.h, p3/M, z0.h, z15.h\n"
+    "fmla z29.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x27, [x16, #0xc0]\n"
+    "fmla z28.h, p3/M, z5.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "ldr x20, [x16, #0xb8]\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z7.h, z12.h\n"
+    "fmla z30.h, p3/M, z1.h, z16.h\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "fmla z28.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z7.h, z13.h\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldp x27, x26, [x16, #0x0]\n"
+    "inch x15\n"
+    "ldp x25, x24, [x16, #0x10]\n"
+    "whilelt p2.h, x15, %x[n_channels]\n"
+    "ldp x23, x22, [x16, #0x20]\n"
+    "fmla z30.h, p3/M, z5.h, z16.h\n"
+    "ldp x21, x20, [x16, #0x30]\n"
+    "ld1h { z9.h }, p1/Z, [x27, x13, LSL #1]\n"
+    "fmla z31.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z10.h }, p1/Z, [x26, x13, LSL #1]\n"
+    "ld1h { z12.h }, p1/Z, [x24, x13, LSL #1]\n"
+    "fmla z30.h, p3/M, z8.h, z15.h\n"
+    "ld1h { z13.h }, p1/Z, [x23, x13, LSL #1]\n"
+    "fmla z31.h, p3/M, z7.h, z14.h\n"
+    "ld1h { z14.h }, p1/Z, [x22, x13, LSL #1]\n"
+    "ld1h { z16.h }, p1/Z, [x20, x13, LSL #1]\n"
+    ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+    "addvl x14, x14, #4\n"
+    "fmla z31.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p1/Z, [x21, x13, LSL #1]\n"
+    ".inst 0xa040a1c4 // ld1h { z4.h-z7.h }, pn8.b/Z, [x14]\n"
+    "addvl x14, x14, #4\n"
+    "fmla z31.h, p3/M, z8.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x25, x13, LSL #1]\n"
+    "inch x13\n"
+    "cmp x13, %x[n_channels]\n"
+    "ld1h { z8.h }, p3/Z, [x14]\n"
+    "addvl x14, x14, #1\n"
+    ".inst 0xc172ca7c // fclamp { z28.h-z31.h }, z19.h, z18.h\n"
+    "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
+    "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+    "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
+    "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+    "blt 1b\n"
+    "2:" // Channel tail
+    "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z9.h\n"
+    "movprfx z29, z17\n fmla z29.h, p3/M, z6.h, z9.h\n"
+    "ldr x27, [x16, #0x40]\n"
+    "inch x28\n"
+    "ldr x26, [x16, #0x48]\n"
+    "movprfx z30, z17\n fmla z30.h, p3/M, z2.h, z9.h\n"
+    "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+    "mov p0.b, p2.b\n"
+    "ldr x25, [x16, #0x50]\n"
+    "ldr x24, [x16, #0x58]\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "ldr x20, [x16, #0x78]\n"
+    "ldr x23, [x16, #0x60]\n"
+    "ldr x22, [x16, #0x68]\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x27, [x16, #0x80]\n"
+    "ldr x26, [x16, #0x88]\n"
+    "ldr x21, [x16, #0x70]\n"
+    "fmla z28.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z16.h\n"
+    "ldr x24, [x16, #0x98]\n"
+    "ldr x25, [x16, #0x90]\n"
+    "fmla z30.h, p3/M, z3.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z4.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "ldr x23, [x16, #0xa0]\n"
+    "fmla z29.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ldr x22, [x16, #0xa8]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "ldr x21, [x16, #0xb0]\n"
+    "fmla z30.h, p3/M, z0.h, z15.h\n"
+    "fmla z29.h, p3/M, z5.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "ldr x27, [x16, #0xc0]\n"
+    "fmla z28.h, p3/M, z5.h, z13.h\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "ldr x20, [x16, #0xb8]\n"
+    "fmla z30.h, p3/M, z4.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z7.h, z12.h\n"
+    "fmla z30.h, p3/M, z1.h, z16.h\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "fmla z28.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z6.h, z15.h\n"
+    "ld1h { z15.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z8.h, z11.h\n"
+    "fmla z31.h, p3/M, z5.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z7.h, z13.h\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z5.h, z16.h\n"
+    "fmla z31.h, p3/M, z3.h, z16.h\n"
+    "fmla z30.h, p3/M, z8.h, z15.h\n"
+    "fmla z31.h, p3/M, z7.h, z14.h\n"
+    "fmla z31.h, p3/M, z6.h, z15.h\n"
+    "fmla z31.h, p3/M, z8.h, z11.h\n"
+    ".inst 0xc172ca7c // fclamp { z28.h-z31.h }, z19.h, z18.h\n"
+    "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
+    "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+    "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
+    "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+    ".inst 0xd503467f // SMSTOP\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..84263cb564
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor that plugs the SME2 fp16 NHWC 5x5 stride-1 depthwise
+// kernels (2x2 output tile) into the depthfirst driver framework.  It carries
+// the kernel's static geometry and hands out the two assembly entry points.
+class sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Entry points implemented in this kernel's generic_indirect.cpp / generic_direct.cpp.
+  Parent::IndirectKernelType m_indirect_kernel = sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // kernel runs under the SME (streaming SVE) vector length
+
+  // Static geometry: 5x5 window, stride 1, producing a 2x2 output tile.
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..58b7824b98
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,586 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{ // Direct-addressing variant: iterates the tile grid itself (tile_i/tile_j), computing input/output offsets from the row/column strides. Generated SME2 assembly — do not hand-edit.
+  struct Args  // Argument block handed to the assembly through a single base pointer; field offsets are taken with offsetof below.
+  {
+    const uint64_t n_tile_rows, n_tile_cols;  // Output tile grid dimensions.
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;  // Packed bias + weights blob, consumed sequentially by the asm.
+    const __fp16 min, max;  // Activation clamp bounds applied via fclamp before the stores.
+
+    uint64_t tile_i = 0, tile_j = 0;  // Loop state: spilled/reloaded by the asm across tile iterations.
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,  // NOTE(review): declared float but initialises __fp16 members; fp16 -> float -> fp16 round-trips exactly, presumably a generator artifact — confirm against other fp16 kernels.
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(  // Enters SME streaming mode (SMSTART) for the whole kernel; .inst encodings cover SME2 ops the assembler may not accept mnemonically.
+    ".inst 0xd503477f  // SMSTART ZA\n"
+    "mov x2, #0x0\n"
+    "mov x3, #0x0\n"
+    "ptrue p3.b\n"
+    ".inst 0x25207810  // ptrue pn8.b\n"
+    "1:"  // Tile loop
+    "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x22, #0x2\n"
+    "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "mul x20, x2, x21\n"  // offset = tile_i * ld_input_row
+    "ldr x6, [%x[params_struct], %[offsetof_args_params]]\n"
+    "madd x20, x3, x4, x20\n"  // offset += tile_j * ld_input_col
+    "mul x20, x20, x22\n"  // offset *= kernel_stride * output_size
+    "add x7, x4, x4\n"
+    "add x5, x5, x20, LSL #1\n"  // inptr[0] += offset * sizeof(__fp16)
+    "add x8, x5, x21, LSL #1\n"
+    "add x17, x7, x4\n"
+    "add x16, x8, x21, LSL #1\n"
+    "add x15, x17, x4\n"
+    "add x14, x16, x21, LSL #1\n"
+    "add x13, x15, x4\n"
+    "add x12, x14, x21, LSL #1\n"
+    "add x11, x12, x21, LSL #1\n"
+    "cbnz x3, 2f\n"
+    "ldr x25, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "lsl x10, %x[n_channels], #0x1\n"
+    "mov x21, #0x4\n"
+    "mul x21, x21, x4\n"
+    "add x9, x5, x4, LSL #1\n"
+    "add x28, x8, x4, LSL #1\n"
+    "add x27, x5, x7, LSL #1\n"
+    "sub x20, x25, x3\n"
+    "add x26, x8, x7, LSL #1\n"
+    "sub x20, x20, #0x1\n"
+    "add x25, x5, x17, LSL #1\n"
+    "and x20, x20, #0x3fffff\n"
+    "add x24, x5, x15, LSL #1\n"
+    "orr x10, x10, x20, LSL #22\n"
+    "add x23, x8, x13, LSL #1\n"
+    "orr x10, x10, x21, LSL #38\n"
+    "add x22, x8, x17, LSL #1\n"
+    "add x21, x8, x15, LSL #1\n"
+    "add x20, x5, x13, LSL #1\n"
+    ".inst 0xf8aa48ba  // rprfm pldonce, x10, [x5]\n"
+    ".inst 0xf8aa493a  // rprfm pldonce, x10, [x9]\n"
+    "add x9, x16, x4, LSL #1\n"
+    ".inst 0xf8aa491a  // rprfm pldonce, x10, [x8]\n"
+    ".inst 0xf8aa4b9a  // rprfm pldonce, x10, [x28]\n"
+    "add x28, x16, x7, LSL #1\n"
+    ".inst 0xf8aa4b7a  // rprfm pldonce, x10, [x27]\n"
+    "add x27, x16, x17, LSL #1\n"
+    ".inst 0xf8aa4b5a  // rprfm pldonce, x10, [x26]\n"
+    "add x26, x16, x15, LSL #1\n"
+    ".inst 0xf8aa4b3a  // rprfm pldonce, x10, [x25]\n"
+    "add x25, x16, x13, LSL #1\n"
+    ".inst 0xf8aa4b1a  // rprfm pldonce, x10, [x24]\n"
+    "add x24, x14, x4, LSL #1\n"
+    ".inst 0xf8aa4afa  // rprfm pldonce, x10, [x23]\n"
+    "add x23, x14, x7, LSL #1\n"
+    ".inst 0xf8aa4a1a  // rprfm pldonce, x10, [x16]\n"
+    ".inst 0xf8aa4ada  // rprfm pldonce, x10, [x22]\n"
+    "add x22, x14, x17, LSL #1\n"
+    ".inst 0xf8aa4aba  // rprfm pldonce, x10, [x21]\n"
+    "add x21, x14, x15, LSL #1\n"
+    ".inst 0xf8aa4a9a  // rprfm pldonce, x10, [x20]\n"
+    "add x20, x14, x13, LSL #1\n"
+    ".inst 0xf8aa493a  // rprfm pldonce, x10, [x9]\n"
+    "add x9, x12, x4, LSL #1\n"
+    ".inst 0xf8aa4b9a  // rprfm pldonce, x10, [x28]\n"
+    "add x28, x12, x7, LSL #1\n"
+    ".inst 0xf8aa4b7a  // rprfm pldonce, x10, [x27]\n"
+    "add x27, x12, x17, LSL #1\n"
+    ".inst 0xf8aa4b5a  // rprfm pldonce, x10, [x26]\n"
+    "add x26, x12, x15, LSL #1\n"
+    ".inst 0xf8aa4b3a  // rprfm pldonce, x10, [x25]\n"
+    "add x25, x12, x13, LSL #1\n"
+    ".inst 0xf8aa49da  // rprfm pldonce, x10, [x14]\n"
+    ".inst 0xf8aa4b1a  // rprfm pldonce, x10, [x24]\n"
+    "add x24, x11, x4, LSL #1\n"
+    ".inst 0xf8aa4afa  // rprfm pldonce, x10, [x23]\n"
+    "add x23, x11, x7, LSL #1\n"
+    ".inst 0xf8aa4ada  // rprfm pldonce, x10, [x22]\n"
+    "add x22, x11, x17, LSL #1\n"
+    ".inst 0xf8aa4aba  // rprfm pldonce, x10, [x21]\n"
+    "add x21, x11, x15, LSL #1\n"
+    ".inst 0xf8aa4a9a  // rprfm pldonce, x10, [x20]\n"
+    "add x20, x11, x13, LSL #1\n"
+    ".inst 0xf8aa499a  // rprfm pldonce, x10, [x12]\n"
+    ".inst 0xf8aa493a  // rprfm pldonce, x10, [x9]\n"
+    ".inst 0xf8aa4b9a  // rprfm pldonce, x10, [x28]\n"
+    ".inst 0xf8aa4b7a  // rprfm pldonce, x10, [x27]\n"
+    ".inst 0xf8aa4b5a  // rprfm pldonce, x10, [x26]\n"
+    ".inst 0xf8aa4b3a  // rprfm pldonce, x10, [x25]\n"
+    ".inst 0xf8aa497a  // rprfm pldonce, x10, [x11]\n"
+    ".inst 0xf8aa4b1a  // rprfm pldonce, x10, [x24]\n"
+    ".inst 0xf8aa4afa  // rprfm pldonce, x10, [x23]\n"
+    ".inst 0xf8aa4ada  // rprfm pldonce, x10, [x22]\n"
+    ".inst 0xf8aa4aba  // rprfm pldonce, x10, [x21]\n"
+    ".inst 0xf8aa4a9a  // rprfm pldonce, x10, [x20]\n"
+    "2:"  // Tile loop: Prefetch input rows: End
+    "ldr x27, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "mov x26, #0x2\n"
+    "cnth x25\n"
+    "ld1h { z18.h }, p3/Z, [x6]\n"
+    "ldr x24, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "addvl x6, x6, #1\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1rh { z17.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    ".inst 0xa040a0c0  // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+    "addvl x6, x6, #4\n"
+    "cmp x25, %x[n_channels]\n"
+    "mul x22, x2, x27\n"  // offset = tile_i * ld_output_row
+    "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "mov x21, #0x0\n"
+    "madd x22, x3, x24, x22\n"  // offset += tile_j * ld_output_col
+    "sub x20, XZR, x25\n"
+    "ld1h { z4.h }, p3/Z, [x6]\n"
+    "mul x22, x22, x26\n"  // offset *= output_tile_size
+    "ld1h { z5.h }, p2/Z, [x5]\n"
+    "addvl x6, x6, #1\n"
+    "add x23, x23, x22, LSL #1\n"  // outptrs[0] += offset * sizeof(__fp16)
+    "ld1h { z6.h }, p2/Z, [x5, x4, LSL #1]\n"
+    "add x22, x23, x27, LSL #1\n"
+    "ld1h { z7.h }, p2/Z, [x8]\n"
+    "ld1h { z8.h }, p2/Z, [x8, x4, LSL #1]\n"
+    "ld1h { z9.h }, p2/Z, [x5, x7, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x8, x7, LSL #1]\n"
+    "ld1h { z11.h }, p2/Z, [x5, x17, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x5, x15, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x8, x13, LSL #1]\n"
+    "ld1h { z14.h }, p2/Z, [x16]\n"
+    "bge 4f\n"
+    "3:"  // Tile loop: Channel loop
+    "movprfx z28, z18\n fmla z28.h, p3/M, z0.h, z5.h\n"
+    "movprfx z29, z18\n fmla z29.h, p3/M, z0.h, z6.h\n"
+    "ld1h { z5.h }, p2/Z, [x8, x17, LSL #1]\n"
+    "whilelt p1.h, x25, %x[n_channels]\n"
+    "movprfx z30, z18\n fmla z30.h, p3/M, z0.h, z7.h\n"
+    "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z8.h\n"
+    "ld1h { z0.h }, p3/Z, [x6]\n"
+    "inch x21\n"
+    "inch x25\n"
+    "mov p0.b, p2.b\n"
+    "inch x20\n"
+    "fmla z28.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x8, x15, LSL #1]\n"
+    "addvl x8, x8, #1\n"
+    "fmla z29.h, p3/M, z1.h, z9.h\n"
+    "fmla z30.h, p3/M, z1.h, z8.h\n"
+    "fmla z31.h, p3/M, z1.h, z13.h\n"
+    "ld1h { z1.h }, p3/Z, [x6, #1, MUL VL]\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x5, x13, LSL #1]\n"
+    "addvl x5, x5, #1\n"
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "fmla z30.h, p3/M, z2.h, z13.h\n"
+    "fmla z31.h, p3/M, z2.h, z5.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #2, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "fmla z30.h, p3/M, z3.h, z5.h\n"
+    "fmla z31.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #3, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x16, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x16, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z6.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #4, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z7.h\n"
+    "ld1h { z7.h }, p1/Z, [x8]\n"
+    "fmla z29.h, p3/M, z0.h, z8.h\n"
+    "fmla z30.h, p3/M, z0.h, z14.h\n"
+    "fmla z31.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x6, #5, MUL VL]\n"
+    "fmla z28.h, p3/M, z1.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x16, x13, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z13.h\n"
+    "fmla z30.h, p3/M, z1.h, z11.h\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z1.h }, p3/Z, [x6, #6, MUL VL]\n"
+    "fmla z28.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x16, x15, LSL #1]\n"
+    "addvl x16, x16, #1\n"
+    "fmla z29.h, p3/M, z2.h, z5.h\n"
+    "fmla z30.h, p3/M, z2.h, z12.h\n"
+    "fmla z31.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #7, MUL VL]\n"
+    "addvl x6, x6, #16\n"
+    "ld1h { z18.h }, p3/Z, [x6, #4, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x14]\n"
+    "fmla z29.h, p3/M, z3.h, z6.h\n"
+    "fmla z30.h, p3/M, z3.h, z9.h\n"
+    "fmla z31.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #-8, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x14, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z13.h\n"
+    "fmla z31.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #-7, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x14, x13, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "fmla z30.h, p3/M, z0.h, z5.h\n"
+    "fmla z31.h, p3/M, z0.h, z6.h\n"
+    "ld1h { z0.h }, p3/Z, [x6, #-6, MUL VL]\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "fmla z30.h, p3/M, z1.h, z6.h\n"
+    "fmla z31.h, p3/M, z1.h, z10.h\n"
+    "ld1h { z1.h }, p3/Z, [x6, #-5, MUL VL]\n"
+    "fmla z28.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "addvl x14, x14, #1\n"
+    "fmla z29.h, p3/M, z2.h, z9.h\n"
+    "fmla z30.h, p3/M, z2.h, z10.h\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #-4, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x12]\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #-3, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x12, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z12.h\n"
+    "fmla z31.h, p3/M, z4.h, z14.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #-2, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x12, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z6.h\n"
+    "fmla z30.h, p3/M, z0.h, z9.h\n"
+    "fmla z31.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z0.h }, p3/Z, [x6, #-1, MUL VL]\n"
+    "fmla z28.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x12, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z10.h\n"
+    "fmla z30.h, p3/M, z1.h, z13.h\n"
+    "fmla z31.h, p3/M, z1.h, z5.h\n"
+    "ld1h { z1.h }, p3/Z, [x6]\n"
+    "fmla z28.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x13, LSL #1]\n"
+    "addvl x12, x12, #1\n"
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "fmla z30.h, p3/M, z2.h, z5.h\n"
+    "fmla z31.h, p3/M, z2.h, z6.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #1, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11]\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "fmla z30.h, p3/M, z3.h, z6.h\n"
+    "fmla z31.h, p3/M, z3.h, z8.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #2, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z14.h\n"
+    "ld1h { z14.h }, p1/Z, [x16]\n"
+    "fmla z30.h, p3/M, z4.h, z8.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #3, MUL VL]\n"
+    "addvl x6, x6, #5\n"
+    "fmla z28.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z13.h\n"
+    "fmla z30.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z0.h, z12.h\n"
+    "fmla z28.h, p3/M, z1.h, z13.h\n"
+    "ld1h { z13.h }, p1/Z, [x8, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z5.h\n"
+    "fmla z30.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z9.h\n"
+    "fmla z28.h, p3/M, z2.h, z5.h\n"
+    "ld1h { z5.h }, p1/Z, [x5]\n"
+    "fmla z29.h, p3/M, z2.h, z6.h\n"
+    "fmla z30.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x11, x13, LSL #1]\n"
+    "whilelt p2.h, x21, %x[n_channels]\n"
+    "cmp x25, %x[n_channels]\n"
+    "addvl x11, x11, #1\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z6.h }, p1/Z, [x5, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z3.h, z8.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p1/Z, [x5, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z3.h, z12.h\n"
+    ".inst 0xa040a0c0  // ld1h { z0.h-z3.h }, pn8.b/Z, [x6]\n"
+    "addvl x6, x6, #4\n"
+    "fmla z28.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z8.h }, p1/Z, [x8, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p1/Z, [x8, x13, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p1/Z, [x5, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z9.h }, p1/Z, [x5, x7, LSL #1]\n"
+    "ld1h { z4.h }, p3/Z, [x6]\n"
+    "addvl x6, x6, #1\n"
+    ".inst 0xc170ca3c  // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+    "st1h { z28.h }, p0, [x23]\n"
+    "st1h { z29.h }, p0, [x23, x24, LSL #1]\n"
+    "addvl x23, x23, #1\n"
+    "st1h { z30.h }, p0, [x22]\n"
+    "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+    "addvl x22, x22, #1\n"
+    "blt 3b\n"
+    "4:"  // Tile loop: Channel tail
+    "movprfx z28, z18\n fmla z28.h, p3/M, z0.h, z5.h\n"
+    "movprfx z29, z18\n fmla z29.h, p3/M, z0.h, z6.h\n"
+    "ld1h { z5.h }, p2/Z, [x8, x17, LSL #1]\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "movprfx z30, z18\n fmla z30.h, p3/M, z0.h, z7.h\n"
+    "movprfx z31, z18\n fmla z31.h, p3/M, z0.h, z8.h\n"
+    "ld1h { z0.h }, p3/Z, [x6]\n"
+    "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "ldr x25, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "mov p0.b, p2.b\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "add x3, x3, #0x1\n"
+    "fmla z28.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x8, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z9.h\n"
+    "add x20, x2, #0x1\n"
+    "fmla z30.h, p3/M, z1.h, z8.h\n"
+    "fmla z31.h, p3/M, z1.h, z13.h\n"
+    "ld1h { z1.h }, p3/Z, [x6, #1, MUL VL]\n"
+    "cmp x3, x25\n"
+    "csel x2, x2, x20, LT\n"
+    "csel x3, x3, XZR, LT\n"
+    "cmp x2, x21\n"
+    "fmla z28.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x5, x13, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "fmla z30.h, p3/M, z2.h, z13.h\n"
+    "fmla z31.h, p3/M, z2.h, z5.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #2, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x16, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "fmla z30.h, p3/M, z3.h, z5.h\n"
+    "fmla z31.h, p3/M, z3.h, z6.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #3, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x16, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x16, x17, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z6.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #4, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z7.h\n"
+    "fmla z29.h, p3/M, z0.h, z8.h\n"
+    "fmla z30.h, p3/M, z0.h, z14.h\n"
+    "fmla z31.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z0.h }, p3/Z, [x6, #5, MUL VL]\n"
+    "fmla z28.h, p3/M, z1.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x16, x13, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z13.h\n"
+    "fmla z30.h, p3/M, z1.h, z11.h\n"
+    "fmla z31.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z1.h }, p3/Z, [x6, #6, MUL VL]\n"
+    "fmla z28.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x16, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z5.h\n"
+    "fmla z30.h, p3/M, z2.h, z12.h\n"
+    "fmla z31.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #7, MUL VL]\n"
+    "addvl x6, x6, #16\n"
+    "fmla z28.h, p3/M, z3.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x14]\n"
+    "fmla z29.h, p3/M, z3.h, z6.h\n"
+    "fmla z30.h, p3/M, z3.h, z9.h\n"
+    "fmla z31.h, p3/M, z3.h, z13.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #-8, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x14, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x14, x7, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z13.h\n"
+    "fmla z31.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #-7, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z14.h\n"
+    "ld1h { z14.h }, p2/Z, [x14, x13, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z11.h\n"
+    "fmla z30.h, p3/M, z0.h, z5.h\n"
+    "fmla z31.h, p3/M, z0.h, z6.h\n"
+    "ld1h { z0.h }, p3/Z, [x6, #-6, MUL VL]\n"
+    "fmla z28.h, p3/M, z1.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z12.h\n"
+    "fmla z30.h, p3/M, z1.h, z6.h\n"
+    "fmla z31.h, p3/M, z1.h, z10.h\n"
+    "ld1h { z1.h }, p3/Z, [x6, #-5, MUL VL]\n"
+    "fmla z28.h, p3/M, z2.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x14, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z9.h\n"
+    "fmla z30.h, p3/M, z2.h, z10.h\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #-4, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x12]\n"
+    "fmla z29.h, p3/M, z3.h, z13.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z12.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #-3, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z13.h\n"
+    "ld1h { z13.h }, p2/Z, [x12, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z8.h\n"
+    "ld1h { z8.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "fmla z30.h, p3/M, z4.h, z12.h\n"
+    "fmla z31.h, p3/M, z4.h, z14.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #-2, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z5.h\n"
+    "ld1h { z5.h }, p2/Z, [x12, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z6.h\n"
+    "fmla z30.h, p3/M, z0.h, z9.h\n"
+    "fmla z31.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z0.h }, p3/Z, [x6, #-1, MUL VL]\n"
+    "fmla z28.h, p3/M, z1.h, z6.h\n"
+    "ld1h { z6.h }, p2/Z, [x12, x17, LSL #1]\n"
+    "fmla z29.h, p3/M, z1.h, z10.h\n"
+    "fmla z30.h, p3/M, z1.h, z13.h\n"
+    "fmla z31.h, p3/M, z1.h, z5.h\n"
+    "ld1h { z1.h }, p3/Z, [x6]\n"
+    "fmla z28.h, p3/M, z2.h, z10.h\n"
+    "ld1h { z10.h }, p2/Z, [x12, x13, LSL #1]\n"
+    "addvl x12, x12, #1\n"
+    "fmla z29.h, p3/M, z2.h, z11.h\n"
+    "fmla z30.h, p3/M, z2.h, z5.h\n"
+    "fmla z31.h, p3/M, z2.h, z6.h\n"
+    "ld1h { z2.h }, p3/Z, [x6, #1, MUL VL]\n"
+    "fmla z28.h, p3/M, z3.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11]\n"
+    "fmla z29.h, p3/M, z3.h, z12.h\n"
+    "fmla z30.h, p3/M, z3.h, z6.h\n"
+    "fmla z31.h, p3/M, z3.h, z8.h\n"
+    "ld1h { z3.h }, p3/Z, [x6, #2, MUL VL]\n"
+    "fmla z28.h, p3/M, z4.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x4, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z14.h\n"
+    "fmla z30.h, p3/M, z4.h, z8.h\n"
+    "fmla z31.h, p3/M, z4.h, z10.h\n"
+    "ld1h { z4.h }, p3/Z, [x6, #3, MUL VL]\n"
+    "fmla z28.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+    "fmla z29.h, p3/M, z0.h, z13.h\n"
+    "fmla z30.h, p3/M, z0.h, z11.h\n"
+    "ld1h { z11.h }, p2/Z, [x11, x17, LSL #1]\n"
+    "fmla z31.h, p3/M, z0.h, z12.h\n"
+    "fmla z28.h, p3/M, z1.h, z13.h\n"
+    "fmla z29.h, p3/M, z1.h, z5.h\n"
+    "fmla z30.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z12.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z31.h, p3/M, z1.h, z9.h\n"
+    "fmla z28.h, p3/M, z2.h, z5.h\n"
+    "fmla z29.h, p3/M, z2.h, z6.h\n"
+    "fmla z30.h, p3/M, z2.h, z9.h\n"
+    "ld1h { z9.h }, p2/Z, [x11, x13, LSL #1]\n"
+    "fmla z31.h, p3/M, z2.h, z11.h\n"
+    "fmla z28.h, p3/M, z3.h, z6.h\n"
+    "fmla z29.h, p3/M, z3.h, z8.h\n"
+    "fmla z30.h, p3/M, z3.h, z11.h\n"
+    "fmla z31.h, p3/M, z3.h, z12.h\n"
+    "fmla z28.h, p3/M, z4.h, z8.h\n"
+    "fmla z29.h, p3/M, z4.h, z10.h\n"
+    "fmla z30.h, p3/M, z4.h, z12.h\n"
+    "fmla z31.h, p3/M, z4.h, z9.h\n"
+    ".inst 0xc170ca3c  // fclamp { z28.h-z31.h }, z17.h, z16.h\n"
+    "st1h { z28.h }, p0, [x23]\n"
+    "st1h { z29.h }, p0, [x23, x24, LSL #1]\n"
+    "st1h { z30.h }, p0, [x22]\n"
+    "st1h { z31.h }, p0, [x22, x24, LSL #1]\n"
+    "blt 1b\n"
+    ".inst 0xd503467f  // SMSTOP\n"
+    :  // no output operands; all communication is via the Args block in memory
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)  // "I"-constrained offsetofs let the asm address Args fields directly
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"  // full predicate/Z-register clobber set: streaming mode invalidates all SVE state
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..313036876e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,537 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "whilelt p3.h, XZR, %x[n_channels]\n"
+ "ptrue p2.b\n"
+ "cnth x13\n"
+ "ldp x12, x11, [x20, #0x0]\n"
+ "ldp x10, x9, [x20, #0x10]\n"
+ "cmp x13, %x[n_channels]\n"
+ "ld1rh { z18.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "sub x28, XZR, x13\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ "ld1h { z17.h }, p2/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ld1rh { z16.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z5.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1h { z6.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x40]\n"
+ "ld1h { z4.h }, p2/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ld1h { z7.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z11.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z12.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ld1h { z14.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z28, z17\n fmla z28.h, p2/M, z0.h, z5.h\n"
+ "movprfx z29, z17\n fmla z29.h, p2/M, z0.h, z6.h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "whilelt p1.h, x13, %x[n_channels]\n"
+ "movprfx z30, z17\n fmla z30.h, p2/M, z0.h, z7.h\n"
+ "movprfx z31, z17\n fmla z31.h, p2/M, z0.h, z8.h\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ld1h { z0.h }, p2/Z, [x14]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "inch x28\n"
+ "mov p0.b, p3.b\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "fmla z29.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z30.h, p2/M, z1.h, z8.h\n"
+ "fmla z31.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ldr x27, [x16, #0x80]\n"
+ "fmla z28.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "ldr x26, [x16, #0x88]\n"
+ "fmla z30.h, p2/M, z2.h, z13.h\n"
+ "fmla z31.h, p2/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "ldr x25, [x16, #0x90]\n"
+ "ldr x24, [x16, #0x98]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "ldr x23, [x16, #0xa0]\n"
+ "fmla z30.h, p2/M, z3.h, z5.h\n"
+ "fmla z31.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #3, MUL VL]\n"
+ "ldr x22, [x16, #0xa8]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z29.h, p2/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z4.h, z6.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #4, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z7.h\n"
+ "fmla z29.h, p2/M, z0.h, z8.h\n"
+ "fmla z30.h, p2/M, z0.h, z14.h\n"
+ "fmla z31.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #5, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z8.h\n"
+ "ld1h { z8.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xc8]\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "fmla z30.h, p2/M, z1.h, z11.h\n"
+ "fmla z31.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #6, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0xc0]\n"
+ "fmla z29.h, p2/M, z2.h, z5.h\n"
+ "fmla z30.h, p2/M, z2.h, z12.h\n"
+ "fmla z31.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "ld1h { z17.h }, p2/Z, [x14, #4, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z5.h\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xd0]\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "fmla z30.h, p2/M, z3.h, z9.h\n"
+ "fmla z31.h, p2/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #-8, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xd8]\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "ldr x23, [x16, #0xe0]\n"
+ "fmla z30.h, p2/M, z4.h, z13.h\n"
+ "fmla z31.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #-7, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla z29.h, p2/M, z0.h, z11.h\n"
+ "fmla z30.h, p2/M, z0.h, z5.h\n"
+ "fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #-6, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "ldr x22, [x16, #0xe8]\n"
+ "fmla z29.h, p2/M, z1.h, z12.h\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z31.h, p2/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #-5, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xf0]\n"
+ "fmla z29.h, p2/M, z2.h, z9.h\n"
+ "fmla z30.h, p2/M, z2.h, z10.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #-4, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x100]\n"
+ "fmla z29.h, p2/M, z3.h, z13.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #-3, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x108]\n"
+ "fmla z29.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #-2, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x110]\n"
+ "fmla z29.h, p2/M, z0.h, z6.h\n"
+ "fmla z30.h, p2/M, z0.h, z9.h\n"
+ "fmla z31.h, p2/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #-1, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0x118]\n"
+ "fmla z29.h, p2/M, z1.h, z10.h\n"
+ "fmla z30.h, p2/M, z1.h, z13.h\n"
+ "fmla z31.h, p2/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p2/Z, [x14]\n"
+ "fmla z28.h, p2/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "fmla z30.h, p2/M, z2.h, z5.h\n"
+ "fmla z31.h, p2/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "fmla z30.h, p2/M, z3.h, z6.h\n"
+ "fmla z31.h, p2/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z4.h, z14.h\n"
+ "fmla z30.h, p2/M, z4.h, z8.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "fmla z28.h, p2/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z0.h, z13.h\n"
+ "fmla z30.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x0]\n"
+ "fmla z31.h, p2/M, z0.h, z12.h\n"
+ "fmla z28.h, p2/M, z1.h, z13.h\n"
+ "fmla z29.h, p2/M, z1.h, z5.h\n"
+ "fmla z30.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "fmla z31.h, p2/M, z1.h, z9.h\n"
+ "fmla z28.h, p2/M, z2.h, z5.h\n"
+ "ld1h { z5.h }, p1/Z, [x27, x13, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z6.h\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "inch x15\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "whilelt p3.h, x15, %x[n_channels]\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "fmla z28.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z6.h }, p1/Z, [x26, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x40]\n"
+ "fmla z29.h, p2/M, z3.h, z8.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z7.h }, p1/Z, [x25, x13, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "fmla z28.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p1/Z, [x24, x13, LSL #1]\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z11.h }, p1/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x20, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p1/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z14.h }, p1/Z, [x26, x13, LSL #1]\n"
+ "inch x13\n"
+ ".inst 0xa040a1c0 // ld1h { z0.h-z3.h }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "cmp x13, %x[n_channels]\n"
+ ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "ld1h { z4.h }, p2/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z17\n fmla z28.h, p2/M, z0.h, z5.h\n"
+ "movprfx z29, z17\n fmla z29.h, p2/M, z0.h, z6.h\n"
+ "ldr x25, [x16, #0x50]\n"
+ "inch x28\n"
+ "movprfx z30, z17\n fmla z30.h, p2/M, z0.h, z7.h\n"
+ "movprfx z31, z17\n fmla z31.h, p2/M, z0.h, z8.h\n"
+ "ldr x24, [x16, #0x58]\n"
+ "ld1h { z0.h }, p2/Z, [x14]\n"
+ "ldr x23, [x16, #0x60]\n"
+ "mov p0.b, p3.b\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x22, [x16, #0x68]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "fmla z29.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "fmla z30.h, p2/M, z1.h, z8.h\n"
+ "fmla z31.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "ldr x20, [x16, #0x78]\n"
+ "ldr x27, [x16, #0x80]\n"
+ "fmla z28.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "ldr x26, [x16, #0x88]\n"
+ "fmla z30.h, p2/M, z2.h, z13.h\n"
+ "fmla z31.h, p2/M, z2.h, z5.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "ldr x25, [x16, #0x90]\n"
+ "ldr x24, [x16, #0x98]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "ldr x23, [x16, #0xa0]\n"
+ "fmla z30.h, p2/M, z3.h, z5.h\n"
+ "fmla z31.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #3, MUL VL]\n"
+ "ldr x22, [x16, #0xa8]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xb0]\n"
+ "fmla z29.h, p2/M, z4.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z4.h, z6.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #4, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z7.h\n"
+ "fmla z29.h, p2/M, z0.h, z8.h\n"
+ "fmla z30.h, p2/M, z0.h, z14.h\n"
+ "fmla z31.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #5, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z8.h\n"
+ "ld1h { z8.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0xc8]\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "fmla z30.h, p2/M, z1.h, z11.h\n"
+ "fmla z31.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #6, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z13.h\n"
+ "ld1h { z13.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0xc0]\n"
+ "fmla z29.h, p2/M, z2.h, z5.h\n"
+ "fmla z30.h, p2/M, z2.h, z12.h\n"
+ "fmla z31.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "fmla z28.h, p2/M, z3.h, z5.h\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0xd0]\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "fmla z30.h, p2/M, z3.h, z9.h\n"
+ "fmla z31.h, p2/M, z3.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #-8, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0xd8]\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z10.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "ldr x23, [x16, #0xe0]\n"
+ "fmla z30.h, p2/M, z4.h, z13.h\n"
+ "fmla z31.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #-7, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z14.h\n"
+ "ld1h { z14.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "ldr x20, [x16, #0xf8]\n"
+ "fmla z29.h, p2/M, z0.h, z11.h\n"
+ "fmla z30.h, p2/M, z0.h, z5.h\n"
+ "fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #-6, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "ldr x22, [x16, #0xe8]\n"
+ "fmla z29.h, p2/M, z1.h, z12.h\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z31.h, p2/M, z1.h, z10.h\n"
+ "ld1h { z1.h }, p2/Z, [x14, #-5, MUL VL]\n"
+ "fmla z28.h, p2/M, z2.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "ldr x21, [x16, #0xf0]\n"
+ "fmla z29.h, p2/M, z2.h, z9.h\n"
+ "fmla z30.h, p2/M, z2.h, z10.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #-4, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "ldr x27, [x16, #0x100]\n"
+ "fmla z29.h, p2/M, z3.h, z13.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #-3, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z13.h\n"
+ "ld1h { z13.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "ldr x26, [x16, #0x108]\n"
+ "fmla z29.h, p2/M, z4.h, z8.h\n"
+ "ld1h { z8.h }, p3/Z, [x23, x15, LSL #1]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z14.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #-2, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z5.h\n"
+ "ld1h { z5.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "ldr x25, [x16, #0x110]\n"
+ "fmla z29.h, p2/M, z0.h, z6.h\n"
+ "fmla z30.h, p2/M, z0.h, z9.h\n"
+ "fmla z31.h, p2/M, z0.h, z13.h\n"
+ "ld1h { z0.h }, p2/Z, [x14, #-1, MUL VL]\n"
+ "fmla z28.h, p2/M, z1.h, z6.h\n"
+ "ld1h { z6.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "ldr x24, [x16, #0x118]\n"
+ "fmla z29.h, p2/M, z1.h, z10.h\n"
+ "fmla z30.h, p2/M, z1.h, z13.h\n"
+ "fmla z31.h, p2/M, z1.h, z5.h\n"
+ "ld1h { z1.h }, p2/Z, [x14]\n"
+ "fmla z28.h, p2/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p3/Z, [x22, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z2.h, z11.h\n"
+ "fmla z30.h, p2/M, z2.h, z5.h\n"
+ "fmla z31.h, p2/M, z2.h, z6.h\n"
+ "ld1h { z2.h }, p2/Z, [x14, #1, MUL VL]\n"
+ "fmla z28.h, p2/M, z3.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x21, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z3.h, z12.h\n"
+ "fmla z30.h, p2/M, z3.h, z6.h\n"
+ "fmla z31.h, p2/M, z3.h, z8.h\n"
+ "ld1h { z3.h }, p2/Z, [x14, #2, MUL VL]\n"
+ "fmla z28.h, p2/M, z4.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x20, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z4.h, z14.h\n"
+ "fmla z30.h, p2/M, z4.h, z8.h\n"
+ "fmla z31.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z4.h }, p2/Z, [x14, #3, MUL VL]\n"
+ "fmla z28.h, p2/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x27, x15, LSL #1]\n"
+ "fmla z29.h, p2/M, z0.h, z13.h\n"
+ "fmla z30.h, p2/M, z0.h, z11.h\n"
+ "ld1h { z11.h }, p3/Z, [x26, x15, LSL #1]\n"
+ "fmla z31.h, p2/M, z0.h, z12.h\n"
+ "fmla z28.h, p2/M, z1.h, z13.h\n"
+ "fmla z29.h, p2/M, z1.h, z5.h\n"
+ "fmla z30.h, p2/M, z1.h, z12.h\n"
+ "ld1h { z12.h }, p3/Z, [x25, x15, LSL #1]\n"
+ "fmla z31.h, p2/M, z1.h, z9.h\n"
+ "fmla z28.h, p2/M, z2.h, z5.h\n"
+ "fmla z29.h, p2/M, z2.h, z6.h\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p3/Z, [x24, x15, LSL #1]\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "fmla z28.h, p2/M, z3.h, z6.h\n"
+ "fmla z29.h, p2/M, z3.h, z8.h\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "fmla z28.h, p2/M, z4.h, z8.h\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z9.h\n"
+ ".inst 0xc170ca5c // fclamp { z28.h-z31.h }, z18.h, z16.h\n"
+ "st1h { z28.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z30.h }, p0, [x10, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x9, x28, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..25d83f15c3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+// Strategy descriptor that plugs the SME2 fp32 NHWC 3x3, stride-1,
+// 2x2-output-tile MLA kernels (declared above) into the generic
+// depthwise-depthfirst driver.  It only carries the kernel geometry
+// constants and hands back the two assembly entry points; all actual
+// compute lives in the *_impl functions.
+class sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  // Function pointers to the generated assembly kernels: the "indirect"
+  // variant consumes an array of input-row pointers, the "direct" variant
+  // walks a tiled region from strides (see the two declarations above).
+  Parent::IndirectKernelType m_indirect_kernel = sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = float;
+  // Advertises that this kernel requires the SME vector-length model.
+  constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+  // 3x3 convolution window.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  // Unit stride in both spatial dimensions.
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Each kernel invocation produces a 2x2 block of output points.
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  // CPUInfo is unused here; the base class only needs the geometry.
+  sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..96cfd5e497
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,336 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (strided, tile-walking) SME2 kernel: fp32 NHWC depthwise 3x3
+// convolution, stride 1, producing a 2x2 spatial output tile per tile-loop
+// iteration and vector-width channels per channel-loop iteration.
+//
+// The tile loop (label 1) iterates over all (tile_i, tile_j) in
+// n_tile_rows x n_tile_cols; the loop counters are kept in x4/x5 and
+// round-tripped through params_struct.tile_i/tile_j across iterations.
+// `params` points at the packed per-channel parameter buffer loaded via
+// x15; the load order (one vector, then nine) suggests one bias vector
+// followed by the nine 3x3 weight vectors per channel block — confirm
+// against the corresponding packing routine.  Outputs are clamped to
+// [activation_min, activation_max] (fclamp with z21/z14) before storing.
+void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  // Argument block read by the asm via offsetof(...) "I" operands; its
+  // layout must stay in sync with the %[offsetof_args_*] references below.
+  // Note the int64_t stride arguments are stored as uint64_t members: the
+  // asm only ever loads them as raw 64-bit words for address arithmetic.
+  struct Args
+  {
+  const uint64_t n_tile_rows, n_tile_cols;
+  const float *inptr;
+  const uint64_t ld_input_row;
+  const uint64_t ld_input_col;
+  float *outptr;
+  const uint64_t ld_output_row;
+  const uint64_t ld_output_col;
+  const void *params;
+  const float min, max;
+
+  // Current tile coordinates; written back by the asm each tile iteration.
+  uint64_t tile_i = 0, tile_j = 0;
+
+  Args(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  const float activation_min,
+  const float activation_max
+  ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+  ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+  ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+  params(params), min(activation_min), max(activation_max)
+  {
+  }
+  };
+
+  Args params_struct(
+  n_tile_rows, n_tile_cols,
+  inptr, ld_input_row, ld_input_col,
+  outptr, ld_output_row, ld_output_col,
+  params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(
+  // Enter streaming SVE / ZA mode and establish the predicates; the .inst
+  // encodings are SME/SME2 instructions not accepted by older assemblers.
+  ".inst 0xd503477f // SMSTART ZA\n"
+  "ptrue p3.b\n"
+  ".inst 0x25207810 // ptrue pn8.b\n"
+  "mov x4, #0x0\n"
+  "mov x5, #0x0\n"
+  // Label 1: outer tile loop.  Computes the input base address for tile
+  // (x4, x5) = (tile_i, tile_j) and the four row pointers x7/x8/x17/x14.
+  "1:" // Tile loop
+  "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+  "mov x22, #0x2\n"
+  "str x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+  "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+  "mul x20, x4, x21\n" // offset = tile_i * ld_input_row
+  "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+  "madd x20, x5, x6, x20\n" // offset += tile_j * ld_input_col
+  "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+  "ldr x7, [%x[params_struct], %[offsetof_args_inptr]]\n"
+  "add x7, x7, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+  "add x8, x7, x21, LSL #2\n"
+  "add x17, x8, x21, LSL #2\n"
+  "add x16, x6, x6\n"
+  "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+  "add x14, x17, x21, LSL #2\n"
+  "add x13, x16, x6\n"
+  // Only the first tile of each row (tile_j == 0) issues the prefetch
+  // block below; subsequent tiles skip straight to label 2.
+  "cbnz x5, 2f\n"
+  "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+  "sub x21, x20, x5\n"
+  "sub x21, x21, #0x1\n"
+  "lsl x12, %x[n_channels], #0x2\n"
+  "mov x20, #0x8\n"
+  "and x21, x21, #0x3fffff\n"
+  "mul x20, x20, x6\n"
+  // x12 packs length/count/stride fields for the rprfm range prefetches.
+  "orr x12, x12, x21, LSL #22\n"
+  "orr x12, x12, x20, LSL #38\n"
+  "add x11, x8, x6, LSL #2\n"
+  "add x10, x7, x13, LSL #2\n"
+  "add x9, x8, x16, LSL #2\n"
+  "add x28, x17, x6, LSL #2\n"
+  "add x27, x14, x13, LSL #2\n"
+  "add x26, x7, x6, LSL #2\n"
+  "add x25, x7, x16, LSL #2\n"
+  "add x24, x17, x16, LSL #2\n"
+  "add x23, x8, x13, LSL #2\n"
+  "add x22, x17, x13, LSL #2\n"
+  "add x21, x14, x6, LSL #2\n"
+  "add x20, x14, x16, LSL #2\n"
+  // Range prefetch (rprfm pldonce) of all 16 input positions of the
+  // 4x4 input patch covered by this row of tiles.
+  ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+  ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+  ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+  ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+  ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+  ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+  ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+  ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+  ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+  ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+  ".inst 0xf8ac491a // rprfm pldonce, x12, [x8]\n"
+  ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+  ".inst 0xf8ac4a3a // rprfm pldonce, x12, [x17]\n"
+  ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+  ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+  ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+  // Label 2: per-tile setup.  Computes the two output row pointers
+  // (x24, x22), loads min/max (z21/z14), the leading params vector (z22,
+  // presumably the bias) and the nine weight vectors z0-z8, and preloads
+  // the first five input vectors z9-z13 for the channel loop.
+  "2:" // Tile loop: Prefetch input rows: End
+  "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+  "mul x21, x4, x22\n" // offset = tile_i * ld_output_row
+  "mov x20, #0x2\n"
+  "ld1w { z22.s }, p3/Z, [x15]\n"
+  "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+  "madd x21, x5, x25, x21\n" // offset += tile_j * ld_output_col
+  "addvl x15, x15, #1\n"
+  ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+  "ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
+  "mul x21, x21, x20\n" // offset *= output_tile_size
+  "cntw x23\n"
+  "ld1rw { z21.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+  "addvl x15, x15, #4\n"
+  "add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+  ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+  "whilelt p2.s, XZR, %x[n_channels]\n"
+  "addvl x15, x15, #4\n"
+  "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+  "cmp x23, %x[n_channels]\n"
+  "add x22, x24, x22, LSL #2\n"
+  "ld1w { z8.s }, p3/Z, [x15]\n"
+  "mov x21, #0x0\n"
+  "sub x20, XZR, x23\n"
+  "ld1w { z9.s }, p2/Z, [x8, x6, LSL #2]\n"
+  "ld1w { z10.s }, p2/Z, [x7]\n"
+  "addvl x15, x15, #1\n"
+  "ld1w { z11.s }, p2/Z, [x7, x13, LSL #2]\n"
+  "ld1w { z12.s }, p2/Z, [x8, x16, LSL #2]\n"
+  "ld1w { z13.s }, p2/Z, [x17, x6, LSL #2]\n"
+  // If a single vector covers all channels, go straight to the tail.
+  "bge 4f\n"
+  // Label 3: main channel loop.  z28-z31 accumulate the four output
+  // points of the 2x2 tile; input loads for the next iteration are
+  // interleaved with the FMLAs, then the results are clamped and stored.
+  "3:" // Tile loop: Channel loop
+  "movprfx z28, z22\n fmla z28.s, p3/M, z4.s, z9.s\n"
+  "movprfx z29, z22\n fmla z29.s, p3/M, z3.s, z9.s\n"
+  "whilelt p1.s, x23, %x[n_channels]\n"
+  "incw x21\n"
+  "movprfx z30, z22\n fmla z30.s, p3/M, z1.s, z9.s\n"
+  "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+  "ld1w { z18.s }, p2/Z, [x14]\n"
+  "incw x23\n"
+  "fmla z28.s, p3/M, z0.s, z10.s\n"
+  "fmla z29.s, p3/M, z2.s, z11.s\n"
+  "ld1w { z17.s }, p2/Z, [x14, x13, LSL #2]\n"
+  "mov p0.b, p2.b\n"
+  "fmla z30.s, p3/M, z2.s, z12.s\n"
+  "fmla z31.s, p3/M, z1.s, z12.s\n"
+  "ld1w { z16.s }, p2/Z, [x17, x16, LSL #2]\n"
+  "incw x20\n"
+  "fmla z28.s, p3/M, z5.s, z12.s\n"
+  "fmla z29.s, p3/M, z4.s, z12.s\n"
+  "ld1w { z11.s }, p2/Z, [x7, x6, LSL #2]\n"
+  "fmla z30.s, p3/M, z6.s, z18.s\n"
+  "fmla z31.s, p3/M, z3.s, z13.s\n"
+  "ld1w { z10.s }, p2/Z, [x7, x16, LSL #2]\n"
+  "addvl x7, x7, #1\n"
+  "fmla z28.s, p3/M, z7.s, z13.s\n"
+  "fmla z29.s, p3/M, z6.s, z13.s\n"
+  "ld1w { z22.s }, p3/Z, [x15]\n"
+  "addvl x15, x15, #1\n"
+  "fmla z30.s, p3/M, z4.s, z13.s\n"
+  "fmla z31.s, p3/M, z8.s, z17.s\n"
+  "ld1w { z9.s }, p2/Z, [x8]\n"
+  "fmla z28.s, p3/M, z1.s, z11.s\n"
+  "fmla z29.s, p3/M, z0.s, z11.s\n"
+  "ld1w { z19.s }, p2/Z, [x8, x13, LSL #2]\n"
+  "addvl x8, x8, #1\n"
+  "fmla z30.s, p3/M, z5.s, z16.s\n"
+  "fmla z31.s, p3/M, z4.s, z16.s\n"
+  "fmla z28.s, p3/M, z2.s, z10.s\n"
+  "fmla z29.s, p3/M, z1.s, z10.s\n"
+  "ld1w { z18.s }, p2/Z, [x17]\n"
+  "fmla z30.s, p3/M, z0.s, z9.s\n"
+  "fmla z31.s, p3/M, z2.s, z19.s\n"
+  "fmla z28.s, p3/M, z8.s, z16.s\n"
+  "fmla z29.s, p3/M, z7.s, z16.s\n"
+  "ld1w { z17.s }, p2/Z, [x17, x13, LSL #2]\n"
+  "addvl x17, x17, #1\n"
+  "fmla z30.s, p3/M, z3.s, z18.s\n"
+  "fmla z31.s, p3/M, z5.s, z17.s\n"
+  "ld1w { z13.s }, p1/Z, [x17, x6, LSL #2]\n"
+  "fmla z28.s, p3/M, z3.s, z9.s\n"
+  "ld1w { z16.s }, p2/Z, [x14, x6, LSL #2]\n"
+  "fmla z29.s, p3/M, z5.s, z19.s\n"
+  "fmla z30.s, p3/M, z7.s, z16.s\n"
+  "fmla z31.s, p3/M, z6.s, z16.s\n"
+  "ld1w { z16.s }, p2/Z, [x14, x16, LSL #2]\n"
+  "whilelt p2.s, x21, %x[n_channels]\n"
+  "fmla z28.s, p3/M, z6.s, z18.s\n"
+  "fmla z29.s, p3/M, z8.s, z17.s\n"
+  // Reload bias/weights for the next channel block (params are packed
+  // per channel block, so z22/z0-z8 advance with x15 each iteration).
+  ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+  "addvl x15, x15, #4\n"
+  "fmla z30.s, p3/M, z8.s, z16.s\n"
+  "fmla z31.s, p3/M, z7.s, z16.s\n"
+  ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+  "addvl x15, x15, #4\n"
+  "cmp x23, %x[n_channels]\n"
+  // Apply the activation clamp to all four accumulators at once.
+  ".inst 0xc1aecabc // fclamp { z28.s-z31.s }, z21.s, z14.s\n"
+  "addvl x14, x14, #1\n"
+  "ld1w { z9.s }, p1/Z, [x8, x6, LSL #2]\n"
+  "ld1w { z10.s }, p1/Z, [x7]\n"
+  "st1w { z28.s }, p0, [x24]\n"
+  "ld1w { z11.s }, p1/Z, [x7, x13, LSL #2]\n"
+  "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+  "addvl x24, x24, #1\n"
+  "ld1w { z12.s }, p1/Z, [x8, x16, LSL #2]\n"
+  "st1w { z30.s }, p0, [x22]\n"
+  "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+  "addvl x22, x22, #1\n"
+  "ld1w { z8.s }, p3/Z, [x15]\n"
+  "addvl x15, x15, #1\n"
+  "blt 3b\n"
+  // Label 4: channel tail (last, possibly partial, vector of channels).
+  // Also advances the tile coordinates: tile_j wraps to 0 at n_tile_cols
+  // and tile_i increments (csel pairs), then loops to label 1 while
+  // tile_i < n_tile_rows.
+  "4:" // Tile loop: Channel tail
+  "movprfx z24, z22\n fmla z24.s, p3/M, z4.s, z9.s\n"
+  "movprfx z25, z22\n fmla z25.s, p3/M, z3.s, z9.s\n"
+  "ldr x5, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+  "add x5, x5, #0x1\n"
+  "movprfx z26, z22\n fmla z26.s, p3/M, z1.s, z9.s\n"
+  "movprfx z27, z22\n fmla z27.s, p3/M, z0.s, z9.s\n"
+  "ld1w { z17.s }, p2/Z, [x14]\n"
+  "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+  "fmla z24.s, p3/M, z0.s, z10.s\n"
+  "fmla z25.s, p3/M, z2.s, z11.s\n"
+  "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+  "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+  "fmla z26.s, p3/M, z2.s, z12.s\n"
+  "fmla z27.s, p3/M, z1.s, z12.s\n"
+  "ld1w { z20.s }, p2/Z, [x17, x16, LSL #2]\n"
+  "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+  "fmla z24.s, p3/M, z5.s, z12.s\n"
+  "fmla z25.s, p3/M, z4.s, z12.s\n"
+  "ld1w { z18.s }, p2/Z, [x7, x6, LSL #2]\n"
+  "cmp x5, x20\n"
+  "fmla z26.s, p3/M, z6.s, z17.s\n"
+  "fmla z27.s, p3/M, z3.s, z13.s\n"
+  "ld1w { z17.s }, p2/Z, [x7, x16, LSL #2]\n"
+  "add x20, x4, #0x1\n"
+  "fmla z24.s, p3/M, z7.s, z13.s\n"
+  "fmla z25.s, p3/M, z6.s, z13.s\n"
+  "csel x4, x4, x20, LT\n"
+  "mov p0.b, p2.b\n"
+  "fmla z26.s, p3/M, z4.s, z13.s\n"
+  "fmla z27.s, p3/M, z8.s, z16.s\n"
+  "ld1w { z16.s }, p2/Z, [x8]\n"
+  "csel x5, x5, XZR, LT\n"
+  "fmla z24.s, p3/M, z1.s, z18.s\n"
+  "fmla z25.s, p3/M, z0.s, z18.s\n"
+  "ld1w { z19.s }, p2/Z, [x8, x13, LSL #2]\n"
+  "cmp x4, x21\n"
+  "fmla z26.s, p3/M, z5.s, z20.s\n"
+  "fmla z27.s, p3/M, z4.s, z20.s\n"
+  "fmla z24.s, p3/M, z2.s, z17.s\n"
+  "fmla z25.s, p3/M, z1.s, z17.s\n"
+  "ld1w { z18.s }, p2/Z, [x17]\n"
+  "fmla z26.s, p3/M, z0.s, z16.s\n"
+  "fmla z27.s, p3/M, z2.s, z19.s\n"
+  "fmla z24.s, p3/M, z8.s, z20.s\n"
+  "fmla z25.s, p3/M, z7.s, z20.s\n"
+  "ld1w { z17.s }, p2/Z, [x17, x13, LSL #2]\n"
+  "fmla z26.s, p3/M, z3.s, z18.s\n"
+  "fmla z27.s, p3/M, z5.s, z17.s\n"
+  "fmla z24.s, p3/M, z3.s, z16.s\n"
+  "ld1w { z16.s }, p2/Z, [x14, x6, LSL #2]\n"
+  "fmla z25.s, p3/M, z5.s, z19.s\n"
+  "fmla z26.s, p3/M, z7.s, z16.s\n"
+  "fmla z27.s, p3/M, z6.s, z16.s\n"
+  "ld1w { z16.s }, p2/Z, [x14, x16, LSL #2]\n"
+  "fmla z24.s, p3/M, z6.s, z18.s\n"
+  "fmla z25.s, p3/M, z8.s, z17.s\n"
+  "fmla z26.s, p3/M, z8.s, z16.s\n"
+  "fmla z27.s, p3/M, z7.s, z16.s\n"
+  ".inst 0xc1aecab8 // fclamp { z24.s-z27.s }, z21.s, z14.s\n"
+  "st1w { z24.s }, p0, [x24]\n"
+  "st1w { z25.s }, p0, [x24, x25, LSL #2]\n"
+  "st1w { z26.s }, p0, [x22]\n"
+  "st1w { z27.s }, p0, [x22, x25, LSL #2]\n"
+  "blt 1b\n"
+  // Leave streaming/ZA mode before returning to ordinary code.
+  ".inst 0xd503467f // SMSTOP\n"
+  :
+  : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+  : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..39f1b3635f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[16];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ptrue p3.b\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ld1w { z23.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "cntw x11\n"
+ ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "ldp x10, x9, [x20, #0x10]\n"
+ "mov x28, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "addvl x14, x14, #4\n"
+ "cmp x11, %x[n_channels]\n"
+ "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x27, XZR, x11\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1w { z8.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "ld1w { z9.s }, p2/Z, [x24, x28, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "whilelt p1.s, x11, %x[n_channels]\n"
+ "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ldr x21, [x15, #0x38]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x40]\n"
+ "fmla z30.s, p3/M, z6.s, z19.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z25.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x50]\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ld1w { z23.s }, p3/Z, [x14]\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x68]\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z17.s\n"
+ "ldr x26, [x15, #0x70]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z28.s, p3/M, z2.s, z25.s\n"
+ "fmla z29.s, p3/M, z1.s, z25.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x25, [x15, #0x78]\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "incw x27\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "fmla z29.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1w { z13.s }, p1/Z, [x20, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "incw x28\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z9.s }, p1/Z, [x24, x11, LSL #2]\n"
+ "whilelt p2.s, x28, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z10.s }, p1/Z, [x23, x11, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x22, x11, LSL #2]\n"
+ ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
+ "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x21, x11, LSL #2]\n"
+ "incw x11\n"
+ "cmp x11, %x[n_channels]\n"
+ "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
+ ".inst 0xa040c1c0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
+ ".inst 0xa040c1c4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x14]\n"
+ "addvl x14, x14, #4\n"
+ "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x14]\n"
+ "addvl x14, x14, #1\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z23\n fmla z28.s, p3/M, z4.s, z9.s\n"
+ "movprfx z29, z23\n fmla z29.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x15, #0x28]\n"
+ "incw x27\n"
+ "movprfx z30, z23\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "movprfx z31, z23\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ldr x21, [x15, #0x38]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x15, #0x48]\n"
+ "ld1w { z20.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z12.s\n"
+ "fmla z29.s, p3/M, z4.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x40]\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z3.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z28.s, p3/M, z7.s, z13.s\n"
+ "fmla z29.s, p3/M, z6.s, z13.s\n"
+ "ldr x21, [x15, #0x58]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z4.s, z13.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x60]\n"
+ "fmla z28.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z30.s, p3/M, z5.s, z20.s\n"
+ "fmla z31.s, p3/M, z4.s, z20.s\n"
+ "ldr x21, [x15, #0x70]\n"
+ "fmla z28.s, p3/M, z2.s, z17.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z30.s, p3/M, z0.s, z16.s\n"
+ "fmla z31.s, p3/M, z2.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z20.s\n"
+ "fmla z29.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ ".inst 0xc1afcadc // fclamp { z28.s-z31.s }, z22.s, z15.s\n"
+ "st1w { z28.s }, p0, [x13, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x12, x27, LSL #2]\n"
+ "st1w { z30.s }, p0, [x10, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x9, x27, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..bd330dc21e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..d15a3a8377
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x3\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x6, x5, x21, LSL #2\n"
+ "add x7, x6, x21, LSL #2\n"
+ "add x8, x4, x4\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, x7, x21, LSL #2\n"
+ "add x15, x8, x4\n"
+ "add x14, x16, x21, LSL #2\n"
+ "add x13, x15, x4\n"
+ "cbnz x3, 2f\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "sub x21, x20, x3\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x12, %x[n_channels], #0x2\n"
+ "mov x20, #0xc\n"
+ "and x21, x21, #0x3fffff\n"
+ "mul x20, x20, x4\n"
+ "orr x12, x12, x21, LSL #22\n"
+ "orr x12, x12, x20, LSL #38\n"
+ "add x27, x7, x8, LSL #2\n"
+ "add x26, x5, x13, LSL #2\n"
+ "add x25, x6, x8, LSL #2\n"
+ "add x24, x14, x13, LSL #2\n"
+ "add x23, x7, x4, LSL #2\n"
+ "add x22, x5, x4, LSL #2\n"
+ "add x21, x5, x15, LSL #2\n"
+ "add x20, x7, x15, LSL #2\n"
+ "add x11, x6, x13, LSL #2\n"
+ "add x10, x16, x8, LSL #2\n"
+ "add x9, x16, x13, LSL #2\n"
+ "add x28, x14, x4, LSL #2\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ "add x27, x6, x4, LSL #2\n"
+ ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ "add x26, x6, x15, LSL #2\n"
+ ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ "add x25, x14, x15, LSL #2\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ "add x24, x16, x4, LSL #2\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ "add x23, x5, x8, LSL #2\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ "add x22, x16, x15, LSL #2\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ "add x21, x7, x13, LSL #2\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "add x20, x14, x8, LSL #2\n"
+ ".inst 0xf8ac48da // rprfm pldonce, x12, [x6]\n"
+ ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+ ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+ ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+ ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+ ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+ ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+ ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+ ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+ ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+ ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+ ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+ ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+ ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+ ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+ "mov x20, #0x3\n"
+ "ld1w { z24.s }, p3/Z, [x17]\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "madd x21, x3, x27, x21\n" // offset += tile_j * ld_output_col
+ "mul x21, x21, x20\n" // offset *= output_tile_size
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "addvl x17, x17, #1\n"
+ "add x26, x26, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "cntw x25\n"
+ "addvl x17, x17, #4\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "add x24, x26, x22, LSL #2\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "addvl x17, x17, #4\n"
+ "cmp x25, %x[n_channels]\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "add x23, x24, x22, LSL #2\n"
+ "add x22, x27, x27\n"
+ "ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x25\n"
+ "ld1w { z10.s }, p2/Z, [x5]\n"
+ "ld1w { z11.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "addvl x17, x17, #1\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "ld1w { z13.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z28, z24\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "movprfx z27, z24\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x25, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z29, z24\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "incw x25\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z30, z24\n fmla z30.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z24\n fmla z31.s, p3/M, z4.s, z9.s\n"
+ "incw x20\n"
+ "movprfx z20, z24\n fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "movprfx z21, z24\n fmla z21.s, p3/M, z2.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "movprfx z23, z24\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "fmla z20.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z18.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z15.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "movprfx z22, z24\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z8.s, z15.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z19.s\n"
+ "fmla z30.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z24.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z31.s, p3/M, z3.s, z19.s\n"
+ "fmla z21.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x16]\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z10.s\n"
+ "fmla z29.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "fmla z22.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z3.s, z17.s\n"
+ "fmla z29.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z17.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "addvl x6, x6, #1\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z18.s\n"
+ "fmla z27.s, p3/M, z4.s, z18.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z18.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z20.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z11.s\n"
+ "addvl x5, x5, #1\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "ld1w { z10.s }, p1/Z, [x5]\n"
+ "fmla z23.s, p3/M, z4.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z19.s\n"
+ "addvl x7, x7, #1\n"
+ "fmla z22.s, p3/M, z5.s, z19.s\n"
+ "fmla z27.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "addvl x14, x14, #1\n"
+ "cmp x25, %x[n_channels]\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z26.s\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "fmin z27.s, p3/M, z27.s, z14.s\n"
+ ".inst 0xc1aecb5c // fclamp { z28.s-z31.s }, z26.s, z14.s\n"
+ "ld1w { z11.s }, p1/Z, [x5, x13, LSL #2]\n"
+ ".inst 0xc1aecb54 // fclamp { z20.s-z23.s }, z26.s, z14.s\n"
+ "ld1w { z12.s }, p1/Z, [x14]\n"
+ "st1w { z27.s }, p0, [x26]\n"
+ "ld1w { z13.s }, p1/Z, [x6, x8, LSL #2]\n"
+ "st1w { z28.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x26, x22, LSL #2]\n"
+ "addvl x26, x26, #1\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z30.s }, p0, [x24]\n"
+ "st1w { z31.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "st1w { z21.s }, p0, [x23]\n"
+ "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z28, z24\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "movprfx z25, z24\n fmla z25.s, p3/M, z8.s, z9.s\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x3, x3, #0x1\n"
+ "movprfx z29, z24\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x2, #0x1\n"
+ "movprfx z30, z24\n fmla z30.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z24\n fmla z31.s, p3/M, z4.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x3, x20\n"
+ "movprfx z20, z24\n fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z27.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z29.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "movprfx z21, z24\n fmla z21.s, p3/M, z2.s, z9.s\n"
+ "csel x2, x2, x21, LT\n"
+ "fmla z28.s, p3/M, z6.s, z17.s\n"
+ "movprfx z23, z24\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "mov p0.b, p2.b\n"
+ "csel x3, x3, XZR, LT\n"
+ "fmla z25.s, p3/M, z5.s, z13.s\n"
+ "fmla z29.s, p3/M, z3.s, z13.s\n"
+ "cmp x2, x20\n"
+ "fmla z30.s, p3/M, z2.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z13.s\n"
+ "fmla z20.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z19.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "movprfx z22, z24\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z19.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z17.s\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x16]\n"
+ "fmla z20.s, p3/M, z4.s, z27.s\n"
+ "fmla z25.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z22.s, p3/M, z2.s, z27.s\n"
+ "fmla z23.s, p3/M, z1.s, z27.s\n"
+ "fmla z28.s, p3/M, z8.s, z27.s\n"
+ "fmla z29.s, p3/M, z7.s, z27.s\n"
+ "fmla z31.s, p3/M, z5.s, z27.s\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z22.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z30.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "fmla z20.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z17.s\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z1.s, z17.s\n"
+ "fmla z28.s, p3/M, z5.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
+ "fmla z20.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z17.s\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z19.s\n"
+ "fmla z30.s, p3/M, z7.s, z17.s\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x7]\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z19.s\n"
+ "fmla z25.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z23.s, p3/M, z2.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z18.s\n"
+ "fmla z29.s, p3/M, z8.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "fmla z23.s, p3/M, z6.s, z16.s\n"
+ "fmax z25.s, p3/M, z25.s, z26.s\n"
+ "fmin z25.s, p3/M, z25.s, z14.s\n"
+ ".inst 0xc1aecb5c // fclamp { z28.s-z31.s }, z26.s, z14.s\n"
+ "st1w { z25.s }, p0, [x26]\n"
+ ".inst 0xc1aecb54 // fclamp { z20.s-z23.s }, z26.s, z14.s\n"
+ "st1w { z28.s }, p0, [x26, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z30.s }, p0, [x24]\n"
+ "st1w { z31.s }, p0, [x24, x27, LSL #2]\n"
+ "st1w { z20.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23]\n"
+ "st1w { z22.s }, p0, [x23, x27, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..2c868b6cf3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,444 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ld1w { z20.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "cntw x16\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "ldr x20, [x17, #0x20]\n"
+ "mov x15, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z22.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x13, XZR, x16\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z21, z20\n fmla z21.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x22, [x17, #0x30]\n"
+ "incw x13\n"
+ "movprfx z25, z20\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ldr x25, [x17, #0x38]\n"
+ "mov p1.b, p2.b\n"
+ "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z20\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "movprfx z27, z20\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z20\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x17, #0x48]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z23.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z29, z20\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z23.s\n"
+ "ldr x24, [x17, #0x50]\n"
+ "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "ldr x23, [x17, #0x58]\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z27.s, p3/M, z1.s, z13.s\n"
+ "ldr x22, [x17, #0x60]\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z23.s\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x78]\n"
+ "fmla z26.s, p3/M, z4.s, z23.s\n"
+ "fmla z27.s, p3/M, z3.s, z23.s\n"
+ "ldr x20, [x17, #0x80]\n"
+ "ld1w { z20.s }, p3/Z, [x8]\n"
+ "fmla z30.s, p3/M, z0.s, z23.s\n"
+ "fmla z28.s, p3/M, z4.s, z19.s\n"
+ "ldr x11, [x17, #0x88]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z29.s, p3/M, z1.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x98]\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z2.s, z19.s\n"
+ "ldr x24, [x17, #0xa0]\n"
+ "fmla z26.s, p3/M, z0.s, z18.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ldr x10, [x14, #0x0]\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "fmla z25.s, p3/M, z7.s, z19.s\n"
+ "ldr x9, [x14, #0x8]\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z17.s\n"
+ "fmla z27.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z23.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z28.s, p3/M, z6.s, z16.s\n"
+ "fmla z30.s, p3/M, z4.s, z16.s\n"
+ "ldr x28, [x14, #0x10]\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z15.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z16.s\n"
+ "fmla z28.s, p3/M, z8.s, z15.s\n"
+ "ldr x27, [x14, #0x18]\n"
+ "fmla z30.s, p3/M, z6.s, z19.s\n"
+ "fmla z24.s, p3/M, z3.s, z23.s\n"
+ "fmla z27.s, p3/M, z0.s, z23.s\n"
+ "fmla z31.s, p3/M, z5.s, z15.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z23.s\n"
+ "fmla z26.s, p3/M, z1.s, z23.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z27.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x20]\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "fmla z29.s, p3/M, z4.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z25.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z22.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldp x22, x21, [x17, #0x0]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "ldp x25, x24, [x17, #0x10]\n"
+ "incw x15\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z21.s }, p1, [x10, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z16.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z9.s }, p0/Z, [x22, x16, LSL #2]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ ".inst 0xc1aecad8 // fclamp { z24.s-z27.s }, z22.s, z14.s\n"
+ "st1w { z24.s }, p1, [x9, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x28]\n"
+ "st1w { z25.s }, p1, [x28, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x30]\n"
+ "ld1w { z10.s }, p0/Z, [x21, x16, LSL #2]\n"
+ ".inst 0xc1aecadc // fclamp { z28.s-z31.s }, z22.s, z14.s\n"
+ "st1w { z26.s }, p1, [x27, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x38]\n"
+ "ld1w { z11.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x40]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
+ "ld1w { z13.s }, p0/Z, [x26, x16, LSL #2]\n"
+ "incw x16\n"
+ "cmp x16, %x[n_channels]\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z21, z20\n fmla z21.s, p3/M, z8.s, z9.s\n"
+ "movprfx z24, z20\n fmla z24.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "incw x13\n"
+ "movprfx z25, z20\n fmla z25.s, p3/M, z6.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x38]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z24.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z20\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "movprfx z27, z20\n fmla z27.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z20\n fmla z28.s, p3/M, z3.s, z9.s\n"
+ "ldr x20, [x17, #0x48]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z29, z20\n fmla z29.s, p3/M, z2.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "movprfx z31, z20\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z13.s\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z27.s, p3/M, z1.s, z13.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z28.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z30, z20\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "fmla z24.s, p3/M, z0.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z27.s, p3/M, z3.s, z18.s\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z28.s, p3/M, z4.s, z19.s\n"
+ "ldr x11, [x17, #0x88]\n"
+ "fmla z29.s, p3/M, z1.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x10, [x17, #0x90]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z25.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x9, [x17, #0x98]\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z30.s, p3/M, z2.s, z19.s\n"
+ "ldr x28, [x17, #0xa0]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z28.s, p3/M, z2.s, z17.s\n"
+ "ldr x27, [x14, #0x0]\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "fmla z25.s, p3/M, z7.s, z19.s\n"
+ "ldr x26, [x14, #0x8]\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "fmla z29.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xc0]\n"
+ "fmla z28.s, p3/M, z6.s, z19.s\n"
+ "fmla z30.s, p3/M, z4.s, z19.s\n"
+ "ldr x24, [x14, #0x10]\n"
+ "fmla z21.s, p3/M, z3.s, z20.s\n"
+ "fmla z25.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xb0]\n"
+ "fmla z29.s, p3/M, z5.s, z19.s\n"
+ "fmla z31.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "ldr x21, [x14, #0x18]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z24.s, p3/M, z3.s, z18.s\n"
+ "fmla z27.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x10, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z4.s, z17.s\n"
+ "fmla z27.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmla z30.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z27.s, p3/M, z6.s, z16.s\n"
+ "fmla z29.s, p3/M, z4.s, z16.s\n"
+ "fmla z30.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z25.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z22.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z16.s\n"
+ "fmla z29.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z2.s, z17.s\n"
+ "fmla z27.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z8.s, z17.s\n"
+ "fmin z21.s, p3/M, z21.s, z14.s\n"
+ "st1w { z21.s }, p0, [x27, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x20]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z16.s\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z31.s, p3/M, z6.s, z16.s\n"
+ ".inst 0xc1aecad8 // fclamp { z24.s-z27.s }, z22.s, z14.s\n"
+ "st1w { z24.s }, p0, [x26, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x28]\n"
+ "st1w { z25.s }, p0, [x24, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x30]\n"
+ ".inst 0xc1aecadc // fclamp { z28.s-z31.s }, z22.s, z14.s\n"
+ "st1w { z26.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x38]\n"
+ "st1w { z27.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x40]\n"
+ "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..add666e14e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..efd37c38ec
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,672 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "mov x2, #0x0\n"
+ "mov x3, #0x0\n"
+ "1:" // Tile loop
+ "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x22, #0x4\n"
+ "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+ "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x6, x5, x21, LSL #2\n"
+ "add x7, x6, x21, LSL #2\n"
+ "add x8, x4, x4\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x16, x7, x21, LSL #2\n"
+ "add x15, x8, x4\n"
+ "add x14, x16, x21, LSL #2\n"
+ "add x13, x15, x4\n"
+ "add x12, x14, x21, LSL #2\n"
+ "add x11, x13, x4\n"
+ "cbnz x3, 2f\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "sub x21, x20, x3\n"
+ "sub x21, x21, #0x1\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "mov x20, #0x10\n"
+ "and x21, x21, #0x3fffff\n"
+ "mul x20, x20, x4\n"
+ "orr x10, x10, x21, LSL #22\n"
+ "orr x10, x10, x20, LSL #38\n"
+ "add x9, x7, x8, LSL #2\n"
+ "add x28, x5, x11, LSL #2\n"
+ "add x27, x7, x15, LSL #2\n"
+ "add x26, x12, x11, LSL #2\n"
+ "add x25, x16, x8, LSL #2\n"
+ "add x24, x5, x4, LSL #2\n"
+ "add x23, x5, x13, LSL #2\n"
+ "add x22, x16, x15, LSL #2\n"
+ "add x21, x6, x11, LSL #2\n"
+ "add x20, x6, x8, LSL #2\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ "add x9, x14, x11, LSL #2\n"
+ ".inst 0xf8aa48ba // rprfm pldonce, x10, [x5]\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ "add x28, x6, x15, LSL #2\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ "add x27, x12, x4, LSL #2\n"
+ ".inst 0xf8aa499a // rprfm pldonce, x10, [x12]\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ "add x26, x7, x4, LSL #2\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ "add x25, x12, x13, LSL #2\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ "add x24, x7, x13, LSL #2\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ "add x23, x5, x8, LSL #2\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ "add x22, x16, x4, LSL #2\n"
+ ".inst 0xf8aa48da // rprfm pldonce, x10, [x6]\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ "add x21, x5, x15, LSL #2\n"
+ ".inst 0xf8aa49da // rprfm pldonce, x10, [x14]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "add x20, x16, x13, LSL #2\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ "add x9, x7, x11, LSL #2\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ "add x28, x14, x8, LSL #2\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ "add x27, x16, x11, LSL #2\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ "add x26, x12, x8, LSL #2\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ "add x25, x14, x15, LSL #2\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ "add x24, x12, x15, LSL #2\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ "add x23, x6, x4, LSL #2\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ "add x22, x6, x13, LSL #2\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ "add x21, x14, x4, LSL #2\n"
+ ".inst 0xf8aa48fa // rprfm pldonce, x10, [x7]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "add x20, x14, x13, LSL #2\n"
+ ".inst 0xf8aa493a // rprfm pldonce, x10, [x9]\n"
+ ".inst 0xf8aa4a1a // rprfm pldonce, x10, [x16]\n"
+ ".inst 0xf8aa4b9a // rprfm pldonce, x10, [x28]\n"
+ ".inst 0xf8aa4b7a // rprfm pldonce, x10, [x27]\n"
+ ".inst 0xf8aa4b5a // rprfm pldonce, x10, [x26]\n"
+ ".inst 0xf8aa4b3a // rprfm pldonce, x10, [x25]\n"
+ ".inst 0xf8aa4b1a // rprfm pldonce, x10, [x24]\n"
+ ".inst 0xf8aa4afa // rprfm pldonce, x10, [x23]\n"
+ ".inst 0xf8aa4ada // rprfm pldonce, x10, [x22]\n"
+ ".inst 0xf8aa4aba // rprfm pldonce, x10, [x21]\n"
+ ".inst 0xf8aa4a9a // rprfm pldonce, x10, [x20]\n"
+ "2:" // Tile loop: Prefetch input rows: End
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+ "mov x20, #0x4\n"
+ "ld1w { z14.s }, p3/Z, [x17]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "madd x21, x3, x9, x21\n" // offset += tile_j * ld_output_col
+ "mul x21, x21, x20\n" // offset *= output_tile_size
+ "ld1rw { z13.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x28, x28, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "addvl x17, x17, #1\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "add x27, x28, x22, LSL #2\n"
+ "cntw x26\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "addvl x17, x17, #4\n"
+ "add x25, x27, x22, LSL #2\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "add x24, x9, x9\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
+ "addvl x17, x17, #4\n"
+ "cmp x26, %x[n_channels]\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "add x23, x25, x22, LSL #2\n"
+ "add x22, x24, x9\n"
+ "ld1w { z10.s }, p2/Z, [x5]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x26\n"
+ "ld1w { z11.s }, p2/Z, [x5, x11, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x7, x15, LSL #2]\n"
+ "addvl x17, x17, #1\n"
+ "bge 4f\n"
+ "3:" // Tile loop: Channel loop
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z3.s, z9.s\n"
+ "movprfx z17, z14\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "incw x26\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z18, z14\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "incw x20\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z30, z14\n fmla z30.s, p3/M, z6.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z5.s, z9.s\n"
+ "movprfx z16, z14\n fmla z16.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "movprfx z31, z14\n fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x12]\n"
+ "fmla z26.s, p3/M, z4.s, z12.s\n"
+ "fmla z17.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x12, x11, LSL #2]\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z30.s, p3/M, z7.s, z12.s\n"
+ "fmla z31.s, p3/M, z6.s, z12.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z3.s, z12.s\n"
+ "movprfx z19, z14\n fmla z19.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z8.s, z22.s\n"
+ "fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z18.s, p3/M, z3.s, z9.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z14.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "fmla z24.s, p3/M, z8.s, z9.s\n"
+ "fmla z16.s, p3/M, z5.s, z9.s\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x6]\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x11, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z17.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "fmla z22.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "fmla z24.s, p3/M, z0.s, z9.s\n"
+ "fmla z16.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z12.s\n"
+ "fmla z25.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x11, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z26.s, p3/M, z0.s, z11.s\n"
+ "fmla z19.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z2.s, z10.s\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z3.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z12.s\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z9.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z7.s, z9.s\n"
+ "fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z19.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "addvl x5, x5, #1\n"
+ "fmla z24.s, p3/M, z7.s, z11.s\n"
+ "fmla z25.s, p3/M, z6.s, z11.s\n"
+ "fmla z16.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z3.s, z11.s\n"
+ "fmla z20.s, p3/M, z1.s, z11.s\n"
+ "fmla z21.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x7]\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "fmla z27.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z9.s\n"
+ "fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x7, x11, LSL #2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x16]\n"
+ "fmla z21.s, p3/M, z4.s, z11.s\n"
+ "fmla z22.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
+ "fmla z27.s, p3/M, z5.s, z12.s\n"
+ "fmla z19.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x16, x11, LSL #2]\n"
+ "addvl x16, x16, #1\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x8, LSL #2]\n"
+ "fmla z23.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z7.s, z10.s\n"
+ "fmla z22.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z8.s, z11.s\n"
+ "fmla z17.s, p3/M, z7.s, z11.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "fmla z20.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z19.s, p3/M, z5.s, z12.s\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z22.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z10.s\n"
+ "addvl x12, x12, #1\n"
+ "ld1w { z10.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z11.s\n"
+ "fmla z18.s, p3/M, z7.s, z11.s\n"
+ "fmla z19.s, p3/M, z6.s, z11.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z11.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "addvl x6, x6, #1\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z10.s\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z30.s, p3/M, z5.s, z11.s\n"
+ "fmla z31.s, p3/M, z4.s, z11.s\n"
+ "cmp x26, %x[n_channels]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z26.s, p3/M, z2.s, z11.s\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p1/Z, [x5, x11, LSL #2]\n"
+ "fmla z16.s, p3/M, z7.s, z12.s\n"
+ "fmla z17.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z3.s, z12.s\n"
+ ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ "fmla z18.s, p3/M, z8.s, z10.s\n"
+ "fmla z19.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z12.s }, p1/Z, [x7, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z4.s, z10.s\n"
+ ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+ "addvl x17, x17, #4\n"
+ ".inst 0xc1afc9bc // fclamp { z28.s-z31.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b8 // fclamp { z24.s-z27.s }, z13.s, z15.s\n"
+ "ld1w { z10.s }, p1/Z, [x5]\n"
+ ".inst 0xc1afc9b0 // fclamp { z16.s-z19.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b4 // fclamp { z20.s-z23.s }, z13.s, z15.s\n"
+ "st1w { z28.s }, p0, [x28]\n"
+ "st1w { z29.s }, p0, [x28, x9, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z30.s }, p0, [x28, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x28, x22, LSL #2]\n"
+ "addvl x28, x28, #1\n"
+ "st1w { z24.s }, p0, [x27]\n"
+ "st1w { z25.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x27, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x27, x22, LSL #2]\n"
+ "addvl x27, x27, #1\n"
+ "st1w { z16.s }, p0, [x25]\n"
+ "st1w { z17.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x25, x24, LSL #2]\n"
+ "st1w { z19.s }, p0, [x25, x22, LSL #2]\n"
+ "addvl x25, x25, #1\n"
+ "st1w { z20.s }, p0, [x23]\n"
+ "st1w { z21.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23, x24, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 3b\n"
+ "4:" // Tile loop: Channel tail
+ "movprfx z21, z14\n fmla z21.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "add x3, x3, #0x1\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z3.s, z9.s\n"
+ "movprfx z29, z14\n fmla z29.s, p3/M, z1.s, z9.s\n"
+ "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x2, #0x1\n"
+ "movprfx z30, z14\n fmla z30.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x3, x20\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z7.s, z9.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x2, x2, x21, LT\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z5.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x8, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "csel x3, x3, XZR, LT\n"
+ "fmla z22.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x12, x11, LSL #2]\n"
+ "cmp x2, x20\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "movprfx z16, z14\n fmla z16.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z11.s }, p2/Z, [x16, x15, LSL #2]\n"
+ "fmla z21.s, p3/M, z7.s, z9.s\n"
+ "fmla z25.s, p3/M, z8.s, z12.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z3.s, z12.s\n"
+ "movprfx z31, z14\n fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x4, LSL #2]\n"
+ "movprfx z19, z14\n fmla z19.s, p3/M, z8.s, z18.s\n"
+ "fmla z22.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x5, x13, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z9.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z17, z14\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "movprfx z18, z14\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z9.s\n"
+ "fmla z16.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z14.s }, p2/Z, [x6]\n"
+ "fmla z24.s, p3/M, z1.s, z10.s\n"
+ "fmla z25.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x6, x11, LSL #2]\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14]\n"
+ "fmla z22.s, p3/M, z7.s, z11.s\n"
+ "fmla z23.s, p3/M, z6.s, z11.s\n"
+ "fmla z29.s, p3/M, z5.s, z11.s\n"
+ "fmla z30.s, p3/M, z4.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x8, LSL #2]\n"
+ "fmla z20.s, p3/M, z0.s, z14.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "fmla z16.s, p3/M, z3.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x11, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z2.s, z10.s\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x6, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z31.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x4, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z2.s, z12.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "fmla z25.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x7, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z12.s\n"
+ "fmla z27.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z1.s, z12.s\n"
+ "fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x7, x13, LSL #2]\n"
+ "fmla z16.s, p3/M, z7.s, z10.s\n"
+ "fmla z17.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x13, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z7.s, z9.s\n"
+ "fmla z25.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x5, x8, LSL #2]\n"
+ "fmla z18.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z14.s }, p2/Z, [x16, x4, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z12.s\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z23.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x5, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z7.s, z14.s\n"
+ "fmla z21.s, p3/M, z6.s, z14.s\n"
+ "fmla z28.s, p3/M, z4.s, z14.s\n"
+ "fmla z29.s, p3/M, z3.s, z14.s\n"
+ "fmla z16.s, p3/M, z1.s, z14.s\n"
+ "fmla z17.s, p3/M, z0.s, z14.s\n"
+ "ld1w { z14.s }, p2/Z, [x16, x13, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z10.s\n"
+ "fmla z25.s, p3/M, z1.s, z10.s\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x7]\n"
+ "fmla z18.s, p3/M, z2.s, z14.s\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z10.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z22.s, p3/M, z8.s, z14.s\n"
+ "fmla z23.s, p3/M, z7.s, z14.s\n"
+ "fmla z30.s, p3/M, z5.s, z14.s\n"
+ "fmla z31.s, p3/M, z4.s, z14.s\n"
+ "fmla z19.s, p3/M, z1.s, z14.s\n"
+ "ld1w { z11.s }, p2/Z, [x14, x8, LSL #2]\n"
+ "fmla z25.s, p3/M, z2.s, z9.s\n"
+ "fmla z26.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x7, x11, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z14.s }, p2/Z, [x16]\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z18.s, p3/M, z3.s, z11.s\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
+ "fmla z23.s, p3/M, z5.s, z12.s\n"
+ "fmla z31.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x16, x11, LSL #2]\n"
+ "fmla z20.s, p3/M, z6.s, z14.s\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z16.s, p3/M, z0.s, z14.s\n"
+ "ld1w { z12.s }, p2/Z, [x12, x8, LSL #2]\n"
+ "fmla z19.s, p3/M, z2.s, z9.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmla z30.s, p3/M, z6.s, z11.s\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z5.s, z9.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z18.s, p3/M, z4.s, z10.s\n"
+ "fmla z19.s, p3/M, z3.s, z10.s\n"
+ "fmla z23.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z14.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x6, x4, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "fmla z31.s, p3/M, z6.s, z10.s\n"
+ "fmla z17.s, p3/M, z8.s, z14.s\n"
+ "ld1w { z11.s }, p2/Z, [x6, x13, LSL #2]\n"
+ "fmla z18.s, p3/M, z7.s, z14.s\n"
+ "fmla z19.s, p3/M, z6.s, z14.s\n"
+ "ld1w { z10.s }, p2/Z, [x14, x4, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmla z25.s, p3/M, z3.s, z9.s\n"
+ "fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, x13, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "fmla z22.s, p3/M, z2.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z11.s\n"
+ "fmla z28.s, p3/M, z7.s, z10.s\n"
+ "fmla z29.s, p3/M, z6.s, z10.s\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z30.s, p3/M, z8.s, z12.s\n"
+ "fmla z31.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z5.s, z12.s\n"
+ "fmla z19.s, p3/M, z4.s, z12.s\n"
+ ".inst 0xc1afc9b8 // fclamp { z24.s-z27.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b4 // fclamp { z20.s-z23.s }, z13.s, z15.s\n"
+ "st1w { z24.s }, p0, [x28]\n"
+ ".inst 0xc1afc9bc // fclamp { z28.s-z31.s }, z13.s, z15.s\n"
+ ".inst 0xc1afc9b0 // fclamp { z16.s-z19.s }, z13.s, z15.s\n"
+ "st1w { z25.s }, p0, [x28, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x28, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x28, x22, LSL #2]\n"
+ "st1w { z20.s }, p0, [x27]\n"
+ "st1w { z21.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z22.s }, p0, [x27, x24, LSL #2]\n"
+ "st1w { z23.s }, p0, [x27, x22, LSL #2]\n"
+ "st1w { z28.s }, p0, [x25]\n"
+ "st1w { z29.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x25, x24, LSL #2]\n"
+ "st1w { z31.s }, p0, [x25, x22, LSL #2]\n"
+ "st1w { z16.s }, p0, [x23]\n"
+ "st1w { z17.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z18.s }, p0, [x23, x24, LSL #2]\n"
+ "st1w { z19.s }, p0, [x23, x22, LSL #2]\n"
+ "blt 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..2e2a45bab0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,653 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ptrue p3.b\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "cntw x16\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "mov x15, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1rw { z14.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x13, XZR, x16\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z29, z13\n fmla z29.s, p3/M, z4.s, z9.s\n"
+ "movprfx z16, z13\n fmla z16.s, p3/M, z8.s, z9.s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "incw x13\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "mov p1.b, p2.b\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x17, #0x28]\n"
+ "movprfx z17, z13\n fmla z17.s, p3/M, z7.s, z9.s\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "movprfx z18, z13\n fmla z18.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ldr x23, [x17, #0x38]\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmla z16.s, p3/M, z0.s, z10.s\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z21.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x50]\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z17.s, p3/M, z8.s, z12.s\n"
+ "ldr x26, [x17, #0x60]\n"
+ "fmla z18.s, p3/M, z7.s, z12.s\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "fmla z29.s, p3/M, z7.s, z9.s\n"
+ "fmla z19.s, p3/M, z6.s, z12.s\n"
+ "ldr x21, [x17, #0x58]\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z13\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z8.s, z21.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0x78]\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "ldr x22, [x17, #0x80]\n"
+ "movprfx z21, z13\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x17, #0x88]\n"
+ "ld1w { z13.s }, p3/Z, [x8]\n"
+ "fmla z28.s, p3/M, z8.s, z9.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z20.s, p3/M, z2.s, z9.s\n"
+ "fmla z16.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x90]\n"
+ "fmla z17.s, p3/M, z0.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x98]\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x26, [x17, #0xa0]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ldr x11, [x14, #0x8]\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z21.s, p3/M, z2.s, z11.s\n"
+ "ldr x9, [x14, #0x18]\n"
+ "fmla z22.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z16.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "fmla z20.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z17.s, p3/M, z4.s, z11.s\n"
+ "fmla z18.s, p3/M, z3.s, z11.s\n"
+ "fmla z29.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z16.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xc8]\n"
+ "fmla z17.s, p3/M, z5.s, z12.s\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "fmla z19.s, p3/M, z3.s, z12.s\n"
+ "fmla z30.s, p3/M, z1.s, z12.s\n"
+ "fmla z31.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x28, [x17, #0xd8]\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "fmla z21.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xd0]\n"
+ "fmla z16.s, p3/M, z7.s, z11.s\n"
+ "fmla z17.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z4.s, z11.s\n"
+ "fmla z29.s, p3/M, z3.s, z11.s\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z25.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x27, [x17, #0xe0]\n"
+ "fmla z18.s, p3/M, z8.s, z9.s\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z9.s\n"
+ "ldr x26, [x17, #0xe8]\n"
+ "fmla z19.s, p3/M, z7.s, z9.s\n"
+ "fmla z30.s, p3/M, z5.s, z9.s\n"
+ "fmla z31.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xf0]\n"
+ "fmla z16.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z1.s, z11.s\n"
+ "fmla z18.s, p3/M, z0.s, z11.s\n"
+ "fmla z28.s, p3/M, z7.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xf8]\n"
+ "fmla z29.s, p3/M, z6.s, z10.s\n"
+ "fmla z24.s, p3/M, z4.s, z10.s\n"
+ "fmla z25.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z1.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z2.s, z9.s\n"
+ "fmla z18.s, p3/M, z1.s, z9.s\n"
+ "fmla z19.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x108]\n"
+ "fmla z16.s, p3/M, z6.s, z11.s\n"
+ "fmla z28.s, p3/M, z3.s, z11.s\n"
+ "fmla z24.s, p3/M, z0.s, z11.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x110]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z9.s\n"
+ "ldr x21, [x17, #0x118]\n"
+ "fmla z20.s, p3/M, z0.s, z11.s\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z10.s\n"
+ "fmla z19.s, p3/M, z8.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z9.s\n"
+ "fmla z28.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z20.s, p3/M, z5.s, z10.s\n"
+ "fmla z27.s, p3/M, z5.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "fmla z21.s, p3/M, z7.s, z12.s\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "fmla z24.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z12.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z21.s, p3/M, z5.s, z11.s\n"
+ "fmla z22.s, p3/M, z4.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldp x20, x25, [x17, #0x0]\n"
+ "fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z4.s, z10.s\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z18.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z9.s }, p0/Z, [x20, x16, LSL #2]\n"
+ "fmla z19.s, p3/M, z4.s, z11.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z0.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldp x20, x24, [x17, #0x10]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ "incw x15\n"
+ "ld1w { z11.s }, p0/Z, [x20, x16, LSL #2]\n"
+ ".inst 0xc1afc9d0 // fclamp { z16.s-z19.s }, z14.s, z15.s\n"
+ "st1w { z16.s }, p1, [x12, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "st1w { z17.s }, p1, [x11, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "fmla z26.s, p3/M, z8.s, z0.s\n"
+ "st1w { z18.s }, p1, [x10, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z27.s, p3/M, z7.s, z0.s\n"
+ ".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
+ "st1w { z19.s }, p1, [x9, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z20.s, p3/M, z4.s, z12.s\n"
+ "fmla z21.s, p3/M, z3.s, z12.s\n"
+ "st1w { z28.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmla z22.s, p3/M, z5.s, z0.s\n"
+ "fmla z23.s, p3/M, z4.s, z0.s\n"
+ "st1w { z29.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
+ "ld1w { z10.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "st1w { z30.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x16, LSL #2]\n"
+ "incw x16\n"
+ "st1w { z31.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ ".inst 0xa040c100 // ld1w { z0.s-z3.s }, pn8.b/Z, [x8]\n"
+ "addvl x8, x8, #4\n"
+ "st1w { z24.s }, p1, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ ".inst 0xa040c104 // ld1w { z4.s-z7.s }, pn8.b/Z, [x8]\n"
+ "st1w { z25.s }, p1, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "addvl x8, x8, #4\n"
+ "cmp x16, %x[n_channels]\n"
+ "st1w { z26.s }, p1, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ ".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
+ "ld1w { z8.s }, p3/Z, [x8]\n"
+ "st1w { z27.s }, p1, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "addvl x8, x8, #1\n"
+ "st1w { z20.s }, p1, [x23, x13, LSL #2]\n"
+ "st1w { z21.s }, p1, [x22, x13, LSL #2]\n"
+ "st1w { z22.s }, p1, [x21, x13, LSL #2]\n"
+ "st1w { z23.s }, p1, [x20, x13, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z29, z13\n fmla z29.s, p3/M, z4.s, z9.s\n"
+ "movprfx z20, z13\n fmla z20.s, p3/M, z8.s, z9.s\n"
+ "ldr x24, [x17, #0x20]\n"
+ "incw x13\n"
+ "movprfx z30, z13\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z13\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [x17, #0x30]\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z26, z13\n fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ldr x23, [x17, #0x28]\n"
+ "movprfx z21, z13\n fmla z21.s, p3/M, z7.s, z9.s\n"
+ "movprfx z22, z13\n fmla z22.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z5.s, z12.s\n"
+ "ldr x22, [x17, #0x38]\n"
+ "movprfx z28, z13\n fmla z28.s, p3/M, z5.s, z9.s\n"
+ "movprfx z24, z13\n fmla z24.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x40]\n"
+ "fmla z20.s, p3/M, z0.s, z10.s\n"
+ "movprfx z23, z13\n fmla z23.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x48]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z25.s, p3/M, z2.s, z12.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x50]\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "ldr x26, [x17, #0x60]\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "movprfx z16, z13\n fmla z16.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z11.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x68]\n"
+ "fmla z29.s, p3/M, z7.s, z9.s\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ldr x20, [x17, #0x58]\n"
+ "movprfx z31, z13\n fmla z31.s, p3/M, z3.s, z12.s\n"
+ "movprfx z27, z13\n fmla z27.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x24, [x17, #0x70]\n"
+ "movprfx z19, z13\n fmla z19.s, p3/M, z8.s, z17.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x23, [x17, #0x78]\n"
+ "fmla z25.s, p3/M, z4.s, z9.s\n"
+ "fmla z26.s, p3/M, z3.s, z9.s\n"
+ "ldr x22, [x17, #0x80]\n"
+ "movprfx z17, z13\n fmla z17.s, p3/M, z1.s, z9.s\n"
+ "movprfx z18, z13\n fmla z18.s, p3/M, z0.s, z9.s\n"
+ "ldr x21, [x17, #0x88]\n"
+ "fmla z28.s, p3/M, z8.s, z9.s\n"
+ "fmla z24.s, p3/M, z5.s, z9.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "fmla z16.s, p3/M, z2.s, z9.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x27, [x17, #0x90]\n"
+ "fmla z21.s, p3/M, z0.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x98]\n"
+ "fmla z29.s, p3/M, z8.s, z11.s\n"
+ "fmla z23.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x26, [x17, #0xa0]\n"
+ "fmla z30.s, p3/M, z7.s, z11.s\n"
+ "fmla z31.s, p3/M, z6.s, z11.s\n"
+ "ldr x11, [x14, #0x8]\n"
+ "fmla z25.s, p3/M, z5.s, z11.s\n"
+ "fmla z26.s, p3/M, z4.s, z11.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "ldr x9, [x14, #0x18]\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z19.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xa8]\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z9.s\n"
+ "fmla z24.s, p3/M, z6.s, z12.s\n"
+ "fmla z16.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xb0]\n"
+ "fmla z21.s, p3/M, z4.s, z10.s\n"
+ "fmla z22.s, p3/M, z3.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "fmla z23.s, p3/M, z5.s, z13.s\n"
+ "fmla z31.s, p3/M, z2.s, z13.s\n"
+ "fmla z30.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z13.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xb8]\n"
+ "fmla z27.s, p3/M, z8.s, z12.s\n"
+ "fmla z19.s, p3/M, z5.s, z12.s\n"
+ "ld1w { z9.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z20.s, p3/M, z5.s, z10.s\n"
+ "fmla z28.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x21, [x17, #0xc8]\n"
+ "fmla z21.s, p3/M, z5.s, z13.s\n"
+ "fmla z22.s, p3/M, z4.s, z13.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "fmla z23.s, p3/M, z3.s, z13.s\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "fmla z31.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x28, [x17, #0xd8]\n"
+ "fmla z16.s, p3/M, z7.s, z9.s\n"
+ "fmla z17.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "ldr x20, [x17, #0xd0]\n"
+ "fmla z20.s, p3/M, z7.s, z12.s\n"
+ "fmla z21.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z29.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "ldr x27, [x17, #0xe0]\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z18.s, p3/M, z8.s, z11.s\n"
+ "fmla z19.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "ldr x26, [x17, #0xe8]\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z26.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "ldr x25, [x17, #0xf0]\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "fmla z21.s, p3/M, z1.s, z12.s\n"
+ "fmla z22.s, p3/M, z0.s, z12.s\n"
+ "fmla z28.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x24, [x17, #0xf8]\n"
+ "fmla z29.s, p3/M, z6.s, z11.s\n"
+ "fmla z24.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z3.s, z11.s\n"
+ "fmla z16.s, p3/M, z1.s, z11.s\n"
+ "fmla z17.s, p3/M, z0.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z10.s\n"
+ "ldr x23, [x17, #0x100]\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z21.s, p3/M, z2.s, z9.s\n"
+ "fmla z22.s, p3/M, z1.s, z9.s\n"
+ "fmla z23.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ldr x22, [x17, #0x108]\n"
+ "fmla z20.s, p3/M, z6.s, z12.s\n"
+ "fmla z28.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z12.s\n"
+ "fmla z30.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x21, [x17, #0x110]\n"
+ "fmla z31.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z19.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ldr x20, [x17, #0x118]\n"
+ "fmla z16.s, p3/M, z0.s, z12.s\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z18.s, p3/M, z3.s, z9.s\n"
+ "fmla z23.s, p3/M, z8.s, z11.s\n"
+ "fmla z31.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmla z26.s, p3/M, z6.s, z9.s\n"
+ "fmla z16.s, p3/M, z5.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z10.s\n"
+ "fmla z19.s, p3/M, z2.s, z10.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z18.s, p3/M, z6.s, z12.s\n"
+ "fmla z24.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z16.s, p3/M, z8.s, z12.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z7.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z11.s\n"
+ "fmla z17.s, p3/M, z5.s, z11.s\n"
+ "fmla z18.s, p3/M, z4.s, z11.s\n"
+ "fmla z19.s, p3/M, z3.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmla z31.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z20.s, p3/M, z4.s, z9.s\n"
+ "fmla z21.s, p3/M, z3.s, z9.s\n"
+ "fmla z22.s, p3/M, z5.s, z11.s\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "fmla z17.s, p3/M, z8.s, z12.s\n"
+ "fmla z18.s, p3/M, z7.s, z12.s\n"
+ "fmla z19.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z13.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z0.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z30.s, p3/M, z2.s, z11.s\n"
+ "fmla z31.s, p3/M, z1.s, z11.s\n"
+ ".inst 0xc1afc9d4 // fclamp { z20.s-z23.s }, z14.s, z15.s\n"
+ "st1w { z20.s }, p0, [x12, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "fmla z24.s, p3/M, z7.s, z13.s\n"
+ "st1w { z21.s }, p0, [x11, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z25.s, p3/M, z6.s, z13.s\n"
+ "fmla z26.s, p3/M, z8.s, z0.s\n"
+ "st1w { z22.s }, p0, [x10, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z27.s, p3/M, z7.s, z0.s\n"
+ ".inst 0xc1afc9dc // fclamp { z28.s-z31.s }, z14.s, z15.s\n"
+ "st1w { z23.s }, p0, [x9, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z16.s, p3/M, z4.s, z13.s\n"
+ "fmla z17.s, p3/M, z3.s, z13.s\n"
+ "st1w { z28.s }, p0, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmla z18.s, p3/M, z5.s, z0.s\n"
+ "fmla z19.s, p3/M, z4.s, z0.s\n"
+ "st1w { z29.s }, p0, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ ".inst 0xc1afc9d8 // fclamp { z24.s-z27.s }, z14.s, z15.s\n"
+ ".inst 0xc1afc9d0 // fclamp { z16.s-z19.s }, z14.s, z15.s\n"
+ "st1w { z30.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "st1w { z31.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "st1w { z24.s }, p0, [x23, x13, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "st1w { z25.s }, p0, [x22, x13, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "st1w { z26.s }, p0, [x21, x13, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1w { z27.s }, p0, [x20, x13, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "st1w { z16.s }, p0, [x23, x13, LSL #2]\n"
+ "st1w { z17.s }, p0, [x22, x13, LSL #2]\n"
+ "st1w { z18.s }, p0, [x21, x13, LSL #2]\n"
+ "st1w { z19.s }, p0, [x20, x13, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..dcffffeb21
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Raw kernel entry points for the SME2 fp32 NHWC 3x3 stride-2 depthwise
+// strategy; the indirect form consumes an array of input pointers, the direct
+// form walks a tiled input via row/column strides.  Implementations live in
+// the matching generic_indirect.cpp / generic_direct.cpp files.
+void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+// Strategy descriptor binding the SME2 fp32 3x3/stride-2 depthwise kernels
+// (2x2 output tile) into the depthfirst driver framework.
+class sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+
+  public:
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+  // 3x3 filter applied with stride 2 in both dimensions, emitting a 2x2
+  // output tile per invocation.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 2;
+  constexpr static unsigned int stride_cols = 2;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  // The kernels are stateless free functions, so hand their addresses back
+  // directly instead of caching them in data members.
+  Parent::IndirectKernelType get_indirect_kernel() const override { return sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl; }
+  Parent::DirectKernelType get_direct_kernel() const override { return sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..066b935486
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (tiled) entry point for the SME2 fp32 NHWC 3x3 stride-2 depthwise
+// kernel producing a 2x2 output tile per iteration.  The computation is one
+// hand-scheduled SME2 inline-assembly blob; this C++ wrapper only packs the
+// arguments into a struct whose fields the assembly locates via offsetof().
+void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  // Argument block consumed by the assembly.  The asm reads each field
+  // through an offsetof(Args, ...) operand, so this layout and the operand
+  // list below must stay in sync.  tile_i/tile_j are scratch state the asm
+  // both stores and reloads across tile-loop iterations.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+      ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+      ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+      params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  // Enters streaming SVE mode (SMSTART ZA) for the duration of the kernel
+  // and leaves it (SMSTOP) before returning.
+  __asm__ __volatile__(
+    ".inst 0xd503477f // SMSTART ZA\n"
+    "ptrue p3.b\n"
+    ".inst 0x25207810 // ptrue pn8.b\n"
+    "mov x2, #0x0\n"
+    "mov x3, #0x0\n"
+    "1:" // Tile loop
+    "str x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x22, #0x4\n"
+    "str x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "mul x20, x2, x21\n" // offset = tile_i * ld_input_row
+    "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "madd x20, x3, x4, x20\n" // offset += tile_j * ld_input_col
+    "mul x20, x20, x22\n" // offset *= kernel_stride * output_size
+    "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "add x5, x5, x20, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "add x6, x5, x21, LSL #2\n"
+    "add x7, x6, x21, LSL #2\n"
+    "add x8, x4, x4\n"
+    "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x16, x7, x21, LSL #2\n"
+    "add x15, x8, x4\n"
+    "add x14, x16, x21, LSL #2\n"
+    "add x13, x15, x4\n"
+    // One-shot input prefetch (rprfm pldonce) is only issued at the start of
+    // a tile row, i.e. when tile_j (x3) is still zero; otherwise fall through
+    // to label 2.
+    "cbnz x3, 2f\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "sub x21, x20, x3\n"
+    "sub x21, x21, #0x1\n"
+    "lsl x12, %x[n_channels], #0x2\n"
+    "mov x20, #0x10\n"
+    "and x21, x21, #0x3fffff\n"
+    "mul x20, x20, x4\n"
+    "orr x12, x12, x21, LSL #22\n"
+    "orr x12, x12, x20, LSL #38\n"
+    "add x27, x7, x8, LSL #2\n"
+    "add x26, x5, x4, LSL #2\n"
+    "add x25, x5, x15, LSL #2\n"
+    "add x24, x5, x13, LSL #2\n"
+    "add x23, x6, x4, LSL #2\n"
+    "add x22, x5, x8, LSL #2\n"
+    "add x21, x6, x15, LSL #2\n"
+    "add x20, x6, x13, LSL #2\n"
+    "add x11, x6, x8, LSL #2\n"
+    "add x10, x16, x4, LSL #2\n"
+    "add x9, x7, x4, LSL #2\n"
+    "add x28, x16, x15, LSL #2\n"
+    ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+    "add x27, x7, x15, LSL #2\n"
+    ".inst 0xf8ac48ba // rprfm pldonce, x12, [x5]\n"
+    ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+    "add x26, x16, x13, LSL #2\n"
+    ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+    "add x25, x7, x13, LSL #2\n"
+    ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+    "add x24, x14, x4, LSL #2\n"
+    ".inst 0xf8ac48da // rprfm pldonce, x12, [x6]\n"
+    ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+    "add x23, x16, x8, LSL #2\n"
+    ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+    "add x22, x14, x15, LSL #2\n"
+    ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+    "add x21, x14, x8, LSL #2\n"
+    ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+    "add x20, x14, x13, LSL #2\n"
+    ".inst 0xf8ac497a // rprfm pldonce, x12, [x11]\n"
+    ".inst 0xf8ac4a1a // rprfm pldonce, x12, [x16]\n"
+    ".inst 0xf8ac48fa // rprfm pldonce, x12, [x7]\n"
+    ".inst 0xf8ac495a // rprfm pldonce, x12, [x10]\n"
+    ".inst 0xf8ac493a // rprfm pldonce, x12, [x9]\n"
+    ".inst 0xf8ac4b9a // rprfm pldonce, x12, [x28]\n"
+    ".inst 0xf8ac4b7a // rprfm pldonce, x12, [x27]\n"
+    ".inst 0xf8ac4b5a // rprfm pldonce, x12, [x26]\n"
+    ".inst 0xf8ac49da // rprfm pldonce, x12, [x14]\n"
+    ".inst 0xf8ac4b3a // rprfm pldonce, x12, [x25]\n"
+    ".inst 0xf8ac4b1a // rprfm pldonce, x12, [x24]\n"
+    ".inst 0xf8ac4afa // rprfm pldonce, x12, [x23]\n"
+    ".inst 0xf8ac4ada // rprfm pldonce, x12, [x22]\n"
+    ".inst 0xf8ac4aba // rprfm pldonce, x12, [x21]\n"
+    ".inst 0xf8ac4a9a // rprfm pldonce, x12, [x20]\n"
+    "2:" // Tile loop: Prefetch input rows: End
+    // Load bias (z22), the nine 3x3 weights (z0-z8) and the activation
+    // bounds (z26 = min, z24 = max) from the params/min/max fields, then
+    // preload the first set of input vectors.
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "mul x21, x2, x22\n" // offset = tile_i * ld_output_row
+    "mov x20, #0x2\n"
+    "ld1w { z22.s }, p3/Z, [x17]\n"
+    "ldr x25, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "madd x21, x3, x25, x21\n" // offset += tile_j * ld_output_col
+    "addvl x17, x17, #1\n"
+    ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+    "ldr x24, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "mul x21, x21, x20\n" // offset *= output_tile_size
+    "cntw x23\n"
+    "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "addvl x17, x17, #4\n"
+    "add x24, x24, x21, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "addvl x17, x17, #4\n"
+    "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "cmp x23, %x[n_channels]\n"
+    "add x22, x24, x22, LSL #2\n"
+    "ld1w { z8.s }, p3/Z, [x17]\n"
+    "mov x21, #0x0\n"
+    "sub x20, XZR, x23\n"
+    "ld1w { z9.s }, p2/Z, [x7, x8, LSL #2]\n"
+    "ld1w { z10.s }, p2/Z, [x5]\n"
+    "addvl x17, x17, #1\n"
+    "ld1w { z11.s }, p2/Z, [x5, x4, LSL #2]\n"
+    "ld1w { z12.s }, p2/Z, [x5, x15, LSL #2]\n"
+    "ld1w { z13.s }, p2/Z, [x5, x13, LSL #2]\n"
+    "ld1w { z14.s }, p2/Z, [x6]\n"
+    "ld1w { z15.s }, p2/Z, [x6, x4, LSL #2]\n"
+    "ld1w { z16.s }, p2/Z, [x5, x8, LSL #2]\n"
+    // Fewer than one full vector of channels left -> go straight to the tail.
+    "bge 4f\n"
+    "3:" // Tile loop: Channel loop
+    // Main per-channel loop: accumulate the four output accumulators
+    // z28-z31 (one per output pixel of the 2x2 tile) from bias z22.
+    "movprfx z28, z22\n fmla z28.s, p3/M, z8.s, z9.s\n"
+    "movprfx z29, z22\n fmla z29.s, p3/M, z6.s, z9.s\n"
+    "whilelt p1.s, x23, %x[n_channels]\n"
+    "incw x21\n"
+    "fmla z28.s, p3/M, z0.s, z10.s\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z18.s }, p2/Z, [x6, x13, LSL #2]\n"
+    "incw x23\n"
+    "fmla z28.s, p3/M, z1.s, z11.s\n"
+    "fmla z29.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z27.s }, p2/Z, [x6, x15, LSL #2]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z28.s, p3/M, z3.s, z14.s\n"
+    "fmla z29.s, p3/M, z0.s, z16.s\n"
+    "ld1w { z17.s }, p2/Z, [x6, x8, LSL #2]\n"
+    "addvl x5, x5, #1\n"
+    "fmla z28.s, p3/M, z4.s, z15.s\n"
+    "fmla z29.s, p3/M, z4.s, z27.s\n"
+    "ld1w { z25.s }, p2/Z, [x16]\n"
+    "addvl x6, x6, #1\n"
+    "fmla z28.s, p3/M, z2.s, z16.s\n"
+    "fmla z29.s, p3/M, z5.s, z18.s\n"
+    "ld1w { z12.s }, p2/Z, [x7]\n"
+    "incw x20\n"
+    "movprfx z30, z22\n fmla z30.s, p3/M, z2.s, z9.s\n"
+    "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z18.s }, p2/Z, [x7, x15, LSL #2]\n"
+    "fmla z28.s, p3/M, z5.s, z17.s\n"
+    "fmla z29.s, p3/M, z3.s, z17.s\n"
+    "ld1w { z16.s }, p2/Z, [x16, x15, LSL #2]\n"
+    "fmla z30.s, p3/M, z3.s, z25.s\n"
+    "fmla z31.s, p3/M, z4.s, z16.s\n"
+    "ld1w { z10.s }, p2/Z, [x16, x4, LSL #2]\n"
+    "fmla z30.s, p3/M, z0.s, z12.s\n"
+    "fmla z31.s, p3/M, z1.s, z18.s\n"
+    "ld1w { z16.s }, p2/Z, [x16, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z4.s, z10.s\n"
+    "fmla z31.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
+    "fmla z28.s, p3/M, z6.s, z12.s\n"
+    "ld1w { z22.s }, p2/Z, [x7, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z16.s\n"
+    "addvl x7, x7, #1\n"
+    "fmla z31.s, p3/M, z2.s, z22.s\n"
+    "fmla z28.s, p3/M, z7.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x14]\n"
+    "ld1w { z17.s }, p2/Z, [x16, x8, LSL #2]\n"
+    "fmla z30.s, p3/M, z6.s, z16.s\n"
+    "fmla z31.s, p3/M, z3.s, z17.s\n"
+    "addvl x16, x16, #1\n"
+    "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+    "fmla z30.s, p3/M, z7.s, z16.s\n"
+    "fmla z29.s, p3/M, z7.s, z18.s\n"
+    "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+    "fmla z31.s, p3/M, z7.s, z16.s\n"
+    "fmla z30.s, p3/M, z5.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x14, x8, LSL #2]\n"
+    "fmla z31.s, p3/M, z6.s, z17.s\n"
+    "fmla z29.s, p3/M, z8.s, z22.s\n"
+    "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z8.s, z17.s\n"
+    "fmla z31.s, p3/M, z8.s, z16.s\n"
+    "whilelt p2.s, x21, %x[n_channels]\n"
+    "ld1w { z22.s }, p3/Z, [x17]\n"
+    "addvl x17, x17, #1\n"
+    "cmp x23, %x[n_channels]\n"
+    // Activation: clamp all four accumulators to [min, max].
+    ".inst 0xc1b8cb5c // fclamp { z28.s-z31.s }, z26.s, z24.s\n"
+    ".inst 0xa040c220 // ld1w { z0.s-z3.s }, pn8.b/Z, [x17]\n"
+    "addvl x17, x17, #4\n"
+    "addvl x14, x14, #1\n"
+    "st1w { z28.s }, p0, [x24]\n"
+    ".inst 0xa040c224 // ld1w { z4.s-z7.s }, pn8.b/Z, [x17]\n"
+    "addvl x17, x17, #4\n"
+    "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+    "addvl x24, x24, #1\n"
+    "ld1w { z9.s }, p1/Z, [x7, x8, LSL #2]\n"
+    "st1w { z30.s }, p0, [x22]\n"
+    "ld1w { z10.s }, p1/Z, [x5]\n"
+    "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+    "addvl x22, x22, #1\n"
+    "ld1w { z11.s }, p1/Z, [x5, x4, LSL #2]\n"
+    "ld1w { z12.s }, p1/Z, [x5, x15, LSL #2]\n"
+    "ld1w { z13.s }, p1/Z, [x5, x13, LSL #2]\n"
+    "ld1w { z14.s }, p1/Z, [x6]\n"
+    "ld1w { z15.s }, p1/Z, [x6, x4, LSL #2]\n"
+    "ld1w { z16.s }, p1/Z, [x5, x8, LSL #2]\n"
+    "ld1w { z8.s }, p3/Z, [x17]\n"
+    "addvl x17, x17, #1\n"
+    "blt 3b\n"
+    "4:" // Tile loop: Channel tail
+    // Tail: same MLA schedule as the channel loop for the final (possibly
+    // partial) vector, interleaved with advancing tile_i/tile_j for the next
+    // trip round the tile loop (label 1).
+    "movprfx z28, z22\n fmla z28.s, p3/M, z8.s, z9.s\n"
+    "movprfx z29, z22\n fmla z29.s, p3/M, z6.s, z9.s\n"
+    "ldr x3, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "add x3, x3, #0x1\n"
+    "fmla z28.s, p3/M, z0.s, z10.s\n"
+    "fmla z29.s, p3/M, z1.s, z12.s\n"
+    "ld1w { z18.s }, p2/Z, [x6, x13, LSL #2]\n"
+    "ldr x2, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "fmla z28.s, p3/M, z1.s, z11.s\n"
+    "fmla z29.s, p3/M, z2.s, z13.s\n"
+    "ld1w { z17.s }, p2/Z, [x6, x15, LSL #2]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "fmla z28.s, p3/M, z3.s, z14.s\n"
+    "fmla z29.s, p3/M, z0.s, z16.s\n"
+    "ld1w { z20.s }, p2/Z, [x6, x8, LSL #2]\n"
+    "ldr x21, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z28.s, p3/M, z4.s, z15.s\n"
+    "fmla z29.s, p3/M, z4.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x16]\n"
+    "cmp x3, x20\n"
+    "fmla z28.s, p3/M, z2.s, z16.s\n"
+    "fmla z29.s, p3/M, z5.s, z18.s\n"
+    "ld1w { z18.s }, p2/Z, [x7]\n"
+    "add x20, x2, #0x1\n"
+    "movprfx z30, z22\n fmla z30.s, p3/M, z2.s, z9.s\n"
+    "movprfx z31, z22\n fmla z31.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z19.s }, p2/Z, [x7, x15, LSL #2]\n"
+    "csel x2, x2, x20, LT\n"
+    "fmla z28.s, p3/M, z5.s, z20.s\n"
+    "fmla z29.s, p3/M, z3.s, z20.s\n"
+    "ld1w { z16.s }, p2/Z, [x16, x15, LSL #2]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z30.s, p3/M, z3.s, z17.s\n"
+    "fmla z31.s, p3/M, z4.s, z16.s\n"
+    "ld1w { z17.s }, p2/Z, [x16, x4, LSL #2]\n"
+    "csel x3, x3, XZR, LT\n"
+    "fmla z30.s, p3/M, z0.s, z18.s\n"
+    "fmla z31.s, p3/M, z1.s, z19.s\n"
+    "ld1w { z16.s }, p2/Z, [x16, x13, LSL #2]\n"
+    "cmp x2, x21\n"
+    "fmla z30.s, p3/M, z4.s, z17.s\n"
+    "fmla z31.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x7, x4, LSL #2]\n"
+    "fmla z28.s, p3/M, z6.s, z18.s\n"
+    "ld1w { z18.s }, p2/Z, [x7, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z1.s, z16.s\n"
+    "fmla z31.s, p3/M, z2.s, z18.s\n"
+    "fmla z28.s, p3/M, z7.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x14]\n"
+    "ld1w { z17.s }, p2/Z, [x16, x8, LSL #2]\n"
+    "fmla z30.s, p3/M, z6.s, z16.s\n"
+    "fmla z31.s, p3/M, z3.s, z17.s\n"
+    "ld1w { z16.s }, p2/Z, [x14, x4, LSL #2]\n"
+    "fmla z30.s, p3/M, z7.s, z16.s\n"
+    "fmla z29.s, p3/M, z7.s, z19.s\n"
+    "ld1w { z16.s }, p2/Z, [x14, x15, LSL #2]\n"
+    "fmla z31.s, p3/M, z7.s, z16.s\n"
+    "fmla z30.s, p3/M, z5.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x14, x8, LSL #2]\n"
+    "fmla z31.s, p3/M, z6.s, z17.s\n"
+    "fmla z29.s, p3/M, z8.s, z18.s\n"
+    "ld1w { z16.s }, p2/Z, [x14, x13, LSL #2]\n"
+    "fmla z30.s, p3/M, z8.s, z17.s\n"
+    "fmla z31.s, p3/M, z8.s, z16.s\n"
+    // Activation clamp, store the 2x2 tile, then loop while tiles remain.
+    ".inst 0xc1b8cb5c // fclamp { z28.s-z31.s }, z26.s, z24.s\n"
+    "st1w { z28.s }, p0, [x24]\n"
+    "st1w { z29.s }, p0, [x24, x25, LSL #2]\n"
+    "st1w { z30.s }, p0, [x22]\n"
+    "st1w { z31.s }, p0, [x22, x25, LSL #2]\n"
+    "blt 1b\n"
+    ".inst 0xd503467f // SMSTOP\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..dc7a40ff54
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ptrue p3.b\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ ".inst 0x25207810 // ptrue pn8.b\n"
+ "ld1w { z26.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "ldp x14, x13, [x20, #0x0]\n"
+ "cntw x12\n"
+ ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+ "ldp x28, x26, [x16, #0x0]\n"
+ "addvl x15, x15, #4\n"
+ "cmp x12, %x[n_channels]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ld1rw { z24.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "sub x27, XZR, x12\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ld1w { z8.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1w { z9.s }, p2/Z, [x28, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z15.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z28, z26\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z26\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x16, #0x40]\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x16, #0x48]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z22.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x50]\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "fmla z28.s, p3/M, z4.s, z15.s\n"
+ "fmla z29.s, p3/M, z4.s, z22.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z23.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x16, #0x60]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "movprfx z30, z26\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z26\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "fmla z29.s, p3/M, z3.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ldr x20, [x16, #0x88]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z13.s\n"
+ "fmla z31.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z13.s\n"
+ "ld1w { z4.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla z31.s, p3/M, z2.s, z4.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0xa0]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z4.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldp x20, x26, [x16, #0x0]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "ldp x25, x24, [x16, #0x10]\n"
+ "ld1w { z26.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "incw x9\n"
+ "ldp x23, x22, [x16, #0x20]\n"
+ "ld1w { z9.s }, p1/Z, [x20, x12, LSL #2]\n"
+ "incw x27\n"
+ "mov p0.b, p2.b\n"
+ "ldp x21, x20, [x16, #0x30]\n"
+ "ld1w { z10.s }, p1/Z, [x26, x12, LSL #2]\n"
+ "whilelt p2.s, x9, %x[n_channels]\n"
+ ".inst 0xc1b8cb3c // fclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "ld1w { z11.s }, p1/Z, [x25, x12, LSL #2]\n"
+ "st1w { z28.s }, p0, [x14, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x24, x12, LSL #2]\n"
+ "st1w { z29.s }, p0, [x13, x27, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x23, x12, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x22, x12, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x27, LSL #2]\n"
+ "ld1w { z15.s }, p1/Z, [x21, x12, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x12, LSL #2]\n"
+ "incw x12\n"
+ "cmp x12, %x[n_channels]\n"
+ ".inst 0xa040c1e0 // ld1w { z0.s-z3.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0xa040c1e4 // ld1w { z4.s-z7.s }, pn8.b/Z, [x15]\n"
+ "addvl x15, x15, #4\n"
+ "ld1w { z8.s }, p3/Z, [x15]\n"
+ "addvl x15, x15, #1\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z28, z26\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "movprfx z29, z26\n fmla z29.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x16, #0x40]\n"
+ "incw x27\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z12.s\n"
+ "ldr x20, [x16, #0x48]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z1.s, z11.s\n"
+ "fmla z29.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x50]\n"
+ "fmla z28.s, p3/M, z3.s, z14.s\n"
+ "fmla z29.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0x58]\n"
+ "fmla z28.s, p3/M, z4.s, z15.s\n"
+ "fmla z29.s, p3/M, z4.s, z17.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x16, #0x60]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "movprfx z30, z26\n fmla z30.s, p3/M, z2.s, z9.s\n"
+ "movprfx z31, z26\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "ldr x20, [x16, #0x80]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z20.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla z30.s, p3/M, z3.s, z17.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ldr x20, [x16, #0x88]\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z0.s, z18.s\n"
+ "fmla z31.s, p3/M, z1.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x70]\n"
+ "ldr x20, [x16, #0x98]\n"
+ "fmla z30.s, p3/M, z4.s, z17.s\n"
+ "fmla z31.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.s, p3/M, z1.s, z16.s\n"
+ "ldr x20, [x16, #0xa8]\n"
+ "fmla z31.s, p3/M, z2.s, z18.s\n"
+ "fmla z28.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x16, #0xa0]\n"
+ "fmla z30.s, p3/M, z6.s, z16.s\n"
+ "fmla z31.s, p3/M, z3.s, z17.s\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z16.s\n"
+ "fmla z29.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z31.s, p3/M, z7.s, z16.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x16, #0xc0]\n"
+ "fmla z31.s, p3/M, z6.s, z17.s\n"
+ "fmla z29.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z16.s\n"
+ "mov p0.b, p2.b\n"
+ ".inst 0xc1b8cb3c // fclamp { z28.s-z31.s }, z25.s, z24.s\n"
+ "st1w { z28.s }, p0, [x14, x27, LSL #2]\n"
+ "st1w { z29.s }, p0, [x13, x27, LSL #2]\n"
+ "st1w { z30.s }, p0, [x11, x27, LSL #2]\n"
+ "st1w { z31.s }, p0, [x10, x27, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
new file mode 100644
index 0000000000..061b0a1e2e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
+class sme2_fp32_planar_3x3_s1_4rows_mla_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32_planar_3x3_s1_4rows_mla_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32_planar_3x3_s1_4rows_mla_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
new file mode 100644
index 0000000000..a385893146
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s1_4rows_mla_za/generic.cpp
@@ -0,0 +1,455 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_3x3_s1_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x6\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z2.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z24.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z20.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z20.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x14, #0x1\n"
+ "orr x24, x20, %x[ld_in_col], LSL #18\n"
+ "mov z21.d, z20.d\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa0404ae6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x23]\n"
+ "orr x24, x16, x24, LSL #20\n"
+ "mov x22, #0x6\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z10.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "add x21, x17, x7\n"
+ ".inst 0xa1404ae0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x23]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "mov z22.d, z20.d\n"
+ "mov z23.d, z20.d\n"
+ "ld1w { z9.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0xa0404ae4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x23]\n"
+ "lsl x24, x24, #0x2\n"
+ "sub x22, x22, x21\n"
+ "ld1w { z1.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "madd x20, x20, x17, x13\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040e80 // mova za.d[x8, #0], { z20.d-z23.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040e81 // mova za.d[x8, #1], { z20.d-z23.d }\n"
+ "mov x10, #0x2\n"
+ "ldp x9, x28, [x22], #0x10\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x25, x24, [x22], #0x10\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x10\n"
+ "csel x20, x21, x10, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x10, x10, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc1b8c84c // fclamp { z12.s-z15.s }, z2.s, z24.s\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z15.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x17, x7\n"
+ "bne 10f\n"
+ "cbz x10, 8f\n"
+ "cmp x10, #0x1\n"
+ "sub x14, x14, x10\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13619c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z6.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13019e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z0.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ ".inst 0xc1341a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z4.s\n"
+ "7:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z13.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13719a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z7.s\n"
+ ".inst 0xc13619a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z6.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13819c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z8.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0xc13019c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z0.s\n"
+ ".inst 0xc13519e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z5.s\n"
+ ".inst 0xc13419e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z4.s\n"
+ "8:" // Unpadded: 0 priming loads
+ "cbz x14, 16f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "sub x14, x14, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "cmp x14, x11\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x14, x11, LT\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "9:" // Unpadded: Main loop
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c84c // fclamp { z12.s-z15.s }, z2.s, z24.s\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z15.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "bgt 9b\n"
+ "b 15f\n"
+ "10:" // Padded
+ "cbz x10, 13f\n"
+ "cmp x10, #0x1\n"
+ "sub x14, x14, x10\n"
+ "beq 12f\n"
+ "11:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z11.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301980 // fmla za.s[x8, 0], { z12.s-z15.s }, z0.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13419a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z4.s\n"
+ "12:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z11.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371960 // fmla za.s[x8, 0], { z11.s-z14.s }, z7.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1361961 // fmla za.s[x8, 1], { z11.s-z14.s }, z6.s\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1381980 // fmla za.s[x8, 0], { z12.s-z15.s }, z8.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301981 // fmla za.s[x8, 1], { z12.s-z15.s }, z0.s\n"
+ ".inst 0xc13519a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z5.s\n"
+ ".inst 0xc13419a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z4.s\n"
+ "13:" // Padded: 0 priming loads
+ "cbz x14, 16f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "sub x14, x14, #0x1\n"
+ "sub x11, x11, #0x1\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "cmp x14, x11\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "csel x21, x14, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "14:" // Padded: Main loop
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c850 // fclamp { z16.s-z19.s }, z2.s, z24.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z17.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z18.s }, p1, [x25]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 14b\n"
+ "15:" // Main loop tail
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ ".inst 0xc1391b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z9.s\n"
+ ".inst 0xc1371b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z7.s\n"
+ ".inst 0xc1361b22 // fmla za.s[x8, 2], { z25.s-z28.s }, z6.s\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ ".inst 0xc1381b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z8.s\n"
+ ".inst 0xc1301b42 // fmla za.s[x8, 2], { z26.s-z29.s }, z0.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b8c850 // fclamp { z16.s-z19.s }, z2.s, z24.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc1351b61 // fmla za.s[x8, 1], { z27.s-z30.s }, z5.s\n"
+ "st1w { z17.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1341b62 // fmla za.s[x8, 2], { z27.s-z30.s }, z4.s\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z18.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "16:" // Main loop skip tail
+ "cbz x11, 18f\n"
+ "17:" // Right padding loop
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1b8c848 // fclamp { z8.s-z11.s }, z2.s, z24.s\n"
+ "st1w { z8.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040e82 // mova za.d[x8, #2], { z20.d-z23.d }\n"
+ "st1w { z9.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z10.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z11.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 17b\n"
+ "18:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
new file mode 100644
index 0000000000..711f7f479a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
+class sme2_fp32_planar_3x3_s2_4rows_mla_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32_planar_3x3_s2_4rows_mla_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32_planar_3x3_s2_4rows_mla_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
new file mode 100644
index 0000000000..26315101b4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_3x3_s2_4rows_mla_za/generic.cpp
@@ -0,0 +1,650 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_3x3_s2_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x9\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z7.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z9.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z12.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x14, #0x1\n"
+ "orr x24, x20, %x[ld_in_col], LSL #18\n"
+ "mov z13.d, z12.d\n"
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa1404ae2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x23]\n"
+ "orr x24, x16, x24, LSL #20\n"
+ "mov x22, #0x9\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z8.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "add x21, x17, x7\n"
+ ".inst 0xa0404ae0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x23]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "mov z14.d, z12.d\n"
+ "mov z15.d, z12.d\n"
+ "ld1w { z5.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "addvl x23, x23, #3\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0xa1404ae3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x23]\n"
+ "lsl x24, x24, #0x2\n"
+ "sub x22, x22, x21\n"
+ "ld1w { z6.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "madd x20, x20, x17, x13\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b84a9c // rprfm pldstrm, x24, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x26, x25, [x23], #0x10\n"
+ "ldp x24, x23, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
+ ".inst 0xc1a9c8f4 // fclamp { z20.s-z23.s }, z7.s, z9.s\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x11, x11, x21\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z20.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z22.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z23.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x17, x7\n"
+ "bne 10f\n"
+ "cbz x22, 8f\n"
+ "cmp x22, #0x1\n"
+ "sub x14, x14, x22\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1321a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z2.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ ".inst 0xc1331a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z3.s\n"
+ "7:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z10.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z1.s\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ ".inst 0xc13b1b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z11.s\n"
+ "8:" // Unpadded: 0 priming loads
+ "cmp x14, #0x2\n"
+ "blt 16f\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "lsr x20, x14, #0x1\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x11\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "csel x22, x20, x11, LT\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "ld1w { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, x22\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ "cbz x22, 15f\n"
+ "9:" // Unpadded: Main loop
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c14 // mova { z20.d-z23.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a9c8f4 // fclamp { z20.s-z23.s }, z7.s, z9.s\n"
+ "st1w { z20.s }, p1, [x10]\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z10.s\n"
+ "add x9, x9, x27, LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ "st1w { z22.s }, p1, [x26]\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0xc13b1b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z11.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z23.s }, p1, [x25]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "bgt 9b\n"
+ "b 15f\n"
+ "10:" // Padded
+ "cbz x22, 13f\n"
+ "cmp x22, #0x1\n"
+ "sub x14, x14, x22\n"
+ "beq 12f\n"
+ "11:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1321b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z2.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1331b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z3.s\n"
+ "12:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z10.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1311b60 // fmla za.s[x8, 0], { z27.s-z30.s }, z1.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13b1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z11.s\n"
+ "13:" // Padded: 0 priming loads
+ "cmp x14, #0x2\n"
+ "blt 16f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "sub x11, x11, #0x1\n"
+ "lsr x20, x14, #0x1\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "cmp x20, x11\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ "csel x22, x20, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "sub x11, x11, x22\n"
+ "cbz x22, 15f\n"
+ "14:" // Padded: Main loop
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a9c8fc // fclamp { z28.s-z31.s }, z7.s, z9.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1311b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z1.s\n"
+ "mov x12, #0x0\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x13]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "add x9, x9, x27, LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ "add x25, x25, x23, LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "bgt 14b\n"
+ "15:" // Main loop tail
+ ".inst 0xc1381b20 // fmla za.s[x8, 0], { z25.s-z28.s }, z8.s\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1321b21 // fmla za.s[x8, 1], { z25.s-z28.s }, z2.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1351a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z5.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z0.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1361b40 // fmla za.s[x8, 0], { z26.s-z29.s }, z6.s\n"
+ ".inst 0xc1331b41 // fmla za.s[x8, 1], { z26.s-z29.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a9c8fc // fclamp { z28.s-z31.s }, z7.s, z9.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z10.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "add x9, x9, x27, LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "16:" // Main loop skip tail
+ "cbz x14, 17f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z21.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1381aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1351b80 // fmla za.s[x8, 0], { z28.s-z31.s }, z5.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc1321aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z2.s\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0xc1361ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xc1301b81 // fmla za.s[x8, 1], { z28.s-z31.s }, z0.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a9c8f0 // fclamp { z16.s-z19.s }, z7.s, z9.s\n"
+ "st1w { z16.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc1331ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z3.s\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z17.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "st1w { z19.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "17:" // Tail input: End
+ "cbz x11, 19f\n"
+ "18:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1a9c8e0 // fclamp { z0.s-z3.s }, z7.s, z9.s\n"
+ "st1w { z0.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z2.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z3.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "bgt 18b\n"
+ "19:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp
new file mode 100644
index 0000000000..71487e08b6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+);
+
+class sme2_fp32_planar_5x5_s1_4rows_mla_za : public PlanarStrategy<float, float>
+{
+ using Parent = PlanarStrategy<float, float>;
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_fp32_planar_5x5_s1_4rows_mla_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_fp32_planar_5x5_s1_4rows_mla_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
new file mode 100644
index 0000000000..3741b973b4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s1_4rows_mla_za/generic.cpp
@@ -0,0 +1,883 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_5x5_s1_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x8\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z16.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z17.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "mov z29.d, z28.d\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "orr x23, x17, x23, LSL #20\n"
+ "mov x22, #0x8\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x13\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x7, x20, x13\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x10, #0x4\n"
+ "ldp x9, x28, [x22], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "ldp x25, x24, [x22], #0x10\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x10\n"
+ "csel x20, x21, x10, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x10, x10, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "sub x11, x11, x21\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z4.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z5.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z6.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z7.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x10, 10f\n"
+ "cmp x10, #0x1\n"
+ "sub x15, x15, x10\n"
+ "beq 9f\n"
+ "cmp x10, #0x2\n"
+ "beq 8f\n"
+ "cmp x10, #0x3\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 4 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z14.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z10.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1351a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z5.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z12.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z15.s\n"
+ ".inst 0xc13e1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z14.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z11.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1371ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z7.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xa14049c5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z13.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc1351ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z5.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1371b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z7.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1361b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z6.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z4.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1820 // fmla za.s[x8, 0], { z1.s-z4.s }, z12.s\n"
+ ".inst 0xc13f1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z15.s\n"
+ "ld1w { z5.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1822 // fmla za.s[x8, 2], { z1.s-z4.s }, z14.s\n"
+ "ld1w { z6.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381840 // fmla za.s[x8, 0], { z2.s-z5.s }, z8.s\n"
+ ".inst 0xa04049cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13b1841 // fmla za.s[x8, 1], { z2.s-z5.s }, z11.s\n"
+ ".inst 0xa04149ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13a1842 // fmla za.s[x8, 2], { z2.s-z5.s }, z10.s\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1860 // fmla za.s[x8, 0], { z3.s-z6.s }, z14.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13d1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z13.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1862 // fmla za.s[x8, 2], { z3.s-z6.s }, z12.s\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ ".inst 0xc1301880 // fmla za.s[x8, 0], { z4.s-z7.s }, z0.s\n"
+ ".inst 0xa04049c0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc13f1881 // fmla za.s[x8, 1], { z4.s-z7.s }, z15.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1882 // fmla za.s[x8, 2], { z4.s-z7.s }, z14.s\n"
+ ".inst 0xc13c18a0 // fmla za.s[x8, 0], { z5.s-z8.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13118a1 // fmla za.s[x8, 1], { z5.s-z8.s }, z1.s\n"
+ ".inst 0xc13018a2 // fmla za.s[x8, 2], { z5.s-z8.s }, z0.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13d1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s\n"
+ ".inst 0xc13c1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z12.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z15.s\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1391aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z9.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z11.s\n"
+ ".inst 0xa14149c6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13a1aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z13.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xc13c1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z5.s\n"
+ ".inst 0xc1341ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z14.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1361ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z6.s\n"
+ ".inst 0xc1391ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z9.s\n"
+ ".inst 0xc1311ae3 // fmla za.s[x8, 3], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xc13d1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z13.s\n"
+ ".inst 0xc13c1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xc1341b03 // fmla za.s[x8, 3], { z24.s-z27.s }, z4.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "10:" // Unpadded: 0 priming loads
+ "cbz x15, 20f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "cmp x15, x11\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x15, x11, LT\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "cbz x21, 19f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z6.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa04149ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13d1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z13.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z12.s\n"
+ ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z4.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1321ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z2.s\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z15.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc13e1aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z14.s\n"
+ ".inst 0xc1381aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xc1301aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z0.s\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c0c // mova { z12.d-z15.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca0c // fclamp { z12.s-z15.s }, z16.s, z17.s\n"
+ "st1w { z12.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc1371ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z7.s\n"
+ "st1w { z13.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1361ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z6.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "st1w { z14.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc1351ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z5.s\n"
+ "st1w { z15.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc1341ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z4.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "bgt 11b\n"
+ "b 19f\n"
+ "12:" // Padded
+ "cbz x10, 17f\n"
+ "cmp x10, #0x1\n"
+ "sub x15, x15, x10\n"
+ "beq 16f\n"
+ "cmp x10, #0x2\n"
+ "beq 15f\n"
+ "cmp x10, #0x3\n"
+ "beq 14f\n"
+ "13:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z14.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z10.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z1.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1361ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z6.s\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "14:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z0.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z1.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z3.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1800 // fmla za.s[x8, 0], { z0.s-z3.s }, z15.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13e1801 // fmla za.s[x8, 1], { z0.s-z3.s }, z14.s\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13b1820 // fmla za.s[x8, 0], { z1.s-z4.s }, z11.s\n"
+ "ld1w { z5.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1821 // fmla za.s[x8, 1], { z1.s-z4.s }, z10.s\n"
+ ".inst 0xa04049c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1391840 // fmla za.s[x8, 0], { z2.s-z5.s }, z9.s\n"
+ "ld1w { z6.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1381841 // fmla za.s[x8, 1], { z2.s-z5.s }, z8.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13f1860 // fmla za.s[x8, 0], { z3.s-z6.s }, z15.s\n"
+ "ld1w { z7.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1861 // fmla za.s[x8, 1], { z3.s-z6.s }, z14.s\n"
+ ".inst 0xa14049c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1880 // fmla za.s[x8, 0], { z4.s-z7.s }, z11.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1331881 // fmla za.s[x8, 1], { z4.s-z7.s }, z3.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "15:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13c1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z12.s\n"
+ ".inst 0xa04149c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13f1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z15.s\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z14.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z8.s\n"
+ ".inst 0xa14049c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13b1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z11.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13a1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z10.s\n"
+ ".inst 0xa14049c2 // ld1w { z2.s, z10.s }, pn10.b/Z, [x14]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1361aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z6.s\n"
+ ".inst 0xa14149c4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1341ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14149c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xc1321ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z2.s\n"
+ ".inst 0xc1331ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z3.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1371ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xc1361ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z6.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13d1a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z13.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13c1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z12.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z15.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1391a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z9.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1381a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z8.s\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13b1a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13a1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa14049c1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13d1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z13.s\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ ".inst 0xc13c1a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z15.s\n"
+ ".inst 0xc13e1a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z14.s\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1301aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc1391aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z9.s\n"
+ ".inst 0xc1311aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z1.s\n"
+ ".inst 0xc13d1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z13.s\n"
+ ".inst 0xc13c1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z11.s\n"
+ ".inst 0xc13a1ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "17:" // Padded: 0 priming loads
+ "cbz x15, 20f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "sub x11, x11, #0x1\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "cmp x15, x11\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "csel x21, x15, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 19f\n"
+ "18:" // Padded: Main loop
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z0.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1301a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z0.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa04049c6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc13c1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1331a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1321a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z2.s\n"
+ ".inst 0xa14149c3 // ld1w { z3.s, z11.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1351a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z4.s\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc13c1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z12.s\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ldr x14, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ "ld1w { z2.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc1371aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z7.s\n"
+ ".inst 0xc1361aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z6.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "st1w { z4.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc13b1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z11.s\n"
+ "st1w { z5.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc1331ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z3.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "st1w { z6.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc13f1ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z15.s\n"
+ "st1w { z7.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc13e1ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z14.s\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa04049ce // ld1w { z14.s-z15.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xa04149c8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x14, #4, MUL VL]\n"
+ "addvl x14, x14, #5\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "bgt 18b\n"
+ "19:" // Main loop tail
+ ".inst 0xc1321a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z2.s\n"
+ "ld1w { z6.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1331a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z3.s\n"
+ ".inst 0xc13d1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z13.s\n"
+ ".inst 0xc13c1a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z12.s\n"
+ ".inst 0xa04149c4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13f1a43 // fmla za.s[x8, 3], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xc13e1a44 // fmla za.s[x8, 4], { z18.s-z21.s }, z14.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1361a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z6.s\n"
+ "ld1w { z7.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1391a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z9.s\n"
+ ".inst 0xc1381a62 // fmla za.s[x8, 2], { z19.s-z22.s }, z8.s\n"
+ ".inst 0xa14149c0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a63 // fmla za.s[x8, 3], { z19.s-z22.s }, z11.s\n"
+ ".inst 0xc13a1a64 // fmla za.s[x8, 4], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xa04049ca // ld1w { z10.s-z11.s }, pn10.b/Z, [x14]\n"
+ "addvl x14, x14, #5\n"
+ ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
+ "ld1w { z1.s }, p2/Z, [x14, #4, MUL VL]\n"
+ ".inst 0xc1351a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xc1341a82 // fmla za.s[x8, 2], { z20.s-z23.s }, z4.s\n"
+ ".inst 0xa04149cc // ld1w { z12.s-z13.s }, pn10.b/Z, [x14, #0x2, MUL VL]\n"
+ ".inst 0xc1331a83 // fmla za.s[x8, 3], { z20.s-z23.s }, z3.s\n"
+ ".inst 0xc1321a84 // fmla za.s[x8, 4], { z20.s-z23.s }, z2.s\n"
+ ".inst 0xa04049c2 // ld1w { z2.s-z3.s }, pn10.b/Z, [x14]\n"
+ ".inst 0xc1311ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z1.s\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xc1301aa2 // fmla za.s[x8, 2], { z21.s-z24.s }, z0.s\n"
+ ".inst 0xc13b1aa3 // fmla za.s[x8, 3], { z21.s-z24.s }, z11.s\n"
+ ".inst 0xc13a1aa4 // fmla za.s[x8, 4], { z21.s-z24.s }, z10.s\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1b1ca04 // fclamp { z4.s-z7.s }, z16.s, z17.s\n"
+ "st1w { z4.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc13d1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z13.s\n"
+ "st1w { z5.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc13c1ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z12.s\n"
+ "st1w { z6.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc1331ac3 // fmla za.s[x8, 3], { z22.s-z25.s }, z3.s\n"
+ "st1w { z7.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc1321ac4 // fmla za.s[x8, 4], { z22.s-z25.s }, z2.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "20:" // Main loop skip tail
+ "cbz x11, 22f\n"
+ "21:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1b1ca00 // fclamp { z0.s-z3.s }, z16.s, z17.s\n"
+ "st1w { z0.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "st1w { z1.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z2.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z3.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 21b\n"
+ "22:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x16\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
new file mode 100644
index 0000000000..7412c7b57c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(  // SME2 assembly kernel: FP32 planar depthwise conv, 5x5 window, stride 2, 4 output rows per pass (defined in generic.cpp)
+  const float *inptr,  // base pointer of the input tile
+  size_t ld_in_row,  // input row stride, in float elements (kernel scales by LSL #2 to bytes)
+  size_t ld_in_col,  // input column stride, in float elements
+  size_t ld_in_vl,  // input stride between vector-length channel blocks, in elements; advances inptr each channel-loop iteration
+  unsigned int pad_top,  // rows of implicit zero padding above the input
+  unsigned int valid_input_rows,  // readable input rows; kernel derives bottom padding from pad_top + this
+  unsigned int pad_left,  // columns of implicit zero padding to the left
+  unsigned int valid_input_cols,  // readable input columns
+  const float *weights,  // 5x5 per-channel filter weights (packed layout expected by this kernel)
+  const float *bias,  // per-channel bias; may be nullptr, in which case the kernel substitutes zero
+  float **outptrs,  // one output pointer per output row
+  const size_t *outlds,  // per-output-row column strides, in elements
+  const size_t *outvllds,  // per-output-row strides between vector-length blocks, in elements
+  unsigned int output_cols,  // number of output columns to produce
+  unsigned int start_channel,  // index of the first channel handled by this call; used to offset the bias load
+  unsigned int valid_channels,  // total channel count bounding the channel loop (whilelt predicate)
+  float act_min,  // lower activation clamp bound, applied via fclamp before each store
+  float act_max  // upper activation clamp bound
+);
+
+class sme2_fp32_planar_5x5_s2_4rows_mla_za : public PlanarStrategy<float, float>  // strategy wrapper exposing the kernel above to the planar depthwise dispatcher
+{
+  using Parent = PlanarStrategy<float, float>;
+
+  public:
+  using return_type = float;
+  constexpr static auto output_rows = 4u;  // output rows produced per kernel invocation
+  constexpr static auto kernel_rows = 5u, kernel_cols = 5u;  // 5x5 depthwise filter
+  constexpr static auto stride_rows = 2u, stride_cols = 2u;  // stride-2 in both dimensions
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // kernel operates at the SME vector length
+
+  sme2_fp32_planar_5x5_s2_4rows_mla_za(const CPUInfo *)  // CPUInfo unused: shape parameters are fixed at compile time
+  : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_fp32_planar_5x5_s2_4rows_mla_za_impl;  // hand the dispatcher the asm routine declared above
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
new file mode 100644
index 0000000000..81ad8e5833
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32_planar_5x5_s2_4rows_mla_za/generic.cpp
@@ -0,0 +1,1172 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32_planar_5x5_s2_4rows_mla_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0xb\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x5\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z2.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z3.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x16, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x16, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "mov z29.d, z28.d\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "orr x23, x7, x23, LSL #20\n"
+ "mov x22, #0xb\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "mov x8, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x14\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x14, x6, x20, x14\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040f83 // mova za.d[x8, #3], { z28.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
+ ".inst 0xc1a3c850 // fclamp { z16.s-z19.s }, z2.s, z3.s\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x13, x13, x21\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ "st1w { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x6, x5\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x16, x16, x22\n"
+ "beq 9f\n"
+ "cmp x22, #0x2\n"
+ "beq 8f\n"
+ "cmp x22, #0x3\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 4 priming loads
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z9.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z7.s\n"
+ "ld1w { z13.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1341940 // fmla za.s[x8, 0], { z10.s-z13.s }, z4.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1301aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z0.s\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1341960 // fmla za.s[x8, 0], { z11.s-z14.s }, z4.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z15.s\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13f1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z15.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1371900 // fmla za.s[x8, 0], { z8.s-z11.s }, z7.s\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z11.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13a1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z10.s\n"
+ ".inst 0xc1341a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z4.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13019c0 // fmla za.s[x8, 0], { z14.s-z17.s }, z0.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13719c1 // fmla za.s[x8, 1], { z14.s-z17.s }, z7.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13a1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z10.s\n"
+ ".inst 0xc1381a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z8.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13619e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z6.s\n"
+ ".inst 0xa04149e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13819e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z8.s\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z14.s\n"
+ ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1371aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z7.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z7.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b18e0 // fmla za.s[x8, 0], { z7.s-z10.s }, z11.s\n"
+ ".inst 0xc13518e1 // fmla za.s[x8, 1], { z7.s-z10.s }, z5.s\n"
+ "ld1w { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13d1900 // fmla za.s[x8, 0], { z8.s-z11.s }, z13.s\n"
+ ".inst 0xc1311901 // fmla za.s[x8, 1], { z8.s-z11.s }, z1.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13e1a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z14.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13e1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z14.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc13f1921 // fmla za.s[x8, 1], { z9.s-z12.s }, z15.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13f1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z15.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x16, #0x2\n"
+ "blt 20f\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
+ "sub x16, x16, #0x2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, #0x1\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "lsr x20, x16, #0x1\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x13\n"
+ "ld1w { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "csel x23, x20, x13, LT\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "and x16, x16, #0x1\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x13, x13, x23\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "cbz x23, 19f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z13.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "add x22, x14, %x[ld_in_row], LSL #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa14149e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z11.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13d1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z13.s\n"
+ "ld1w { z4.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1311ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1381ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z8.s\n"
+ ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13b1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z11.s\n"
+ "ld1w { z15.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc1301b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa0414aa6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc13c1b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z12.s\n"
+ "ld1w { z17.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xa1404aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "ld1w { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13719e0 // fmla za.s[x8, 0], { z15.s-z18.s }, z7.s\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc13c19e1 // fmla za.s[x8, 1], { z15.s-z18.s }, z12.s\n"
+ ".inst 0xa1404aa7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x21]\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xa1414aa6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z14.s\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ ".inst 0xc13f1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z15.s\n"
+ "ld1w { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa0404aae // ld1w { z14.s-z15.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13f1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z15.s\n"
+ ".inst 0xa1414aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc13c1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z12.s\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xa0404aac // ld1w { z12.s-z13.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13d1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z13.s\n"
+ ".inst 0xa1414aa4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ "addvl x21, x21, #5\n"
+ ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ ".inst 0xa1404aa7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13f1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z15.s\n"
+ ".inst 0xa0414aaa // ld1w { z10.s-z11.s }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc13b1a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z11.s\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z22.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "bgt 11b\n"
+ "b 19f\n"
+ "12:" // Padded
+ "cbz x22, 17f\n"
+ "cmp x22, #0x1\n"
+ "sub x16, x16, x22\n"
+ "beq 16f\n"
+ "cmp x22, #0x2\n"
+ "beq 15f\n"
+ "cmp x22, #0x3\n"
+ "beq 14f\n"
+ "13:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z9.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1341920 // fmla za.s[x8, 0], { z9.s-z12.s }, z4.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1371ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z7.s\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1361940 // fmla za.s[x8, 0], { z10.s-z13.s }, z6.s\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1361b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z6.s\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1361960 // fmla za.s[x8, 0], { z11.s-z14.s }, z6.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ "14:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1351ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z5.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13f1920 // fmla za.s[x8, 0], { z9.s-z12.s }, z15.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13f1940 // fmla za.s[x8, 0], { z10.s-z13.s }, z15.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13f1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z15.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "15:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13a1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z10.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0xc1341a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z0.s\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1371ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xa14149e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1371a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z7.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1301a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z0.s\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13a1b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1351a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z5.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1301a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z0.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z8.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13b1a60 // fmla za.s[x8, 0], { z19.s-z22.s }, z11.s\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ ".inst 0xc1351a61 // fmla za.s[x8, 1], { z19.s-z22.s }, z5.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ ".inst 0xc1311900 // fmla za.s[x8, 0], { z8.s-z11.s }, z1.s\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13f1901 // fmla za.s[x8, 1], { z8.s-z11.s }, z15.s\n"
+ ".inst 0xa14149e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13e1a80 // fmla za.s[x8, 0], { z20.s-z23.s }, z14.s\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1351a81 // fmla za.s[x8, 1], { z20.s-z23.s }, z5.s\n"
+ ".inst 0xa04049e6 // ld1w { z6.s-z7.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa14149e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1381920 // fmla za.s[x8, 0], { z9.s-z12.s }, z8.s\n"
+ ".inst 0xc1371921 // fmla za.s[x8, 1], { z9.s-z12.s }, z7.s\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xa14049e0 // ld1w { z0.s, z8.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1381aa1 // fmla za.s[x8, 1], { z21.s-z24.s }, z8.s\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13d1aa0 // fmla za.s[x8, 0], { z21.s-z24.s }, z13.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "17:" // Padded: 0 priming loads
+ "cmp x16, #0x2\n"
+ "blt 20f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "sub x16, x16, #0x2\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "sub x13, x13, #0x1\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "lsr x20, x16, #0x1\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "cmp x20, x13\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "csel x23, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ "and x16, x16, #0x1\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 19f\n"
+ "18:" // Padded: Main loop
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z15.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x22, x14, %x[ld_in_row], LSL #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14049e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa14049e6 // ld1w { z6.s, z14.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13f1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z15.s\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13a1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1351ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z5.s\n"
+ ".inst 0xa04049ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1311a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z1.s\n"
+ "ld1w { z0.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13c1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z12.s\n"
+ "ld1w { z12.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1361a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z6.s\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z13.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1301b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z0.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13a1b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z10.s\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc13e1b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z14.s\n"
+ "ld1w { z18.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ "add x8, x8, #0x1\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1a3c858 // fclamp { z24.s-z27.s }, z2.s, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
+ ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1311a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z1.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ "st1w { z25.s }, p1, [x10]\n"
+ "ld1w { z15.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1311980 // fmla za.s[x8, 0], { z12.s-z15.s }, z1.s\n"
+ ".inst 0xc1391981 // fmla za.s[x8, 1], { z12.s-z15.s }, z9.s\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0404a8a // ld1w { z10.s-z11.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13b1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z11.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z26.s }, p1, [x27]\n"
+ ".inst 0xa1414a80 // ld1w { z0.s, z8.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "st1w { z27.s }, p1, [x26]\n"
+ ".inst 0xa0404a88 // ld1w { z8.s-z9.s }, pn10.b/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc13919a1 // fmla za.s[x8, 1], { z13.s-z16.s }, z9.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc13919a0 // fmla za.s[x8, 0], { z13.s-z16.s }, z9.s\n"
+ "ld1w { z21.s }, p0/Z, [x22]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "ld1w { z22.s }, p0/Z, [x14]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa0404a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13f1a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z15.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa04049e4 // ld1w { z4.s-z5.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "bgt 18b\n"
+ "19:" // Main loop tail
+ ".inst 0xc1391ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z9.s\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13a1ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z10.s\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "add x21, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1341ac2 // fmla za.s[x8, 2], { z22.s-z25.s }, z4.s\n"
+ ".inst 0xa14049e1 // ld1w { z1.s, z9.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1361a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z6.s\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1301a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z0.s\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1371a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z7.s\n"
+ ".inst 0xa04049ec // ld1w { z12.s-z13.s }, pn10.b/Z, [x15]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1381ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z8.s\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x14, x14, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13a1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z10.s\n"
+ ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc1311ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z1.s\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
+ "ld1w { z1.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z22.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc13c1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z12.s\n"
+ "ld1w { z17.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z23.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1311b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z1.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xa0414a8e // ld1w { z14.s-z15.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1371b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z7.s\n"
+ "ld1w { z18.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z24.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xa0404a84 // ld1w { z4.s-z5.s }, pn10.b/Z, [x20]\n"
+ "add x8, x8, #0x1\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "ld1w { z19.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc13f1a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z15.s\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1351a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z5.s\n"
+ ".inst 0xa1414a80 // ld1w { z0.s, z8.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "ld1w { z25.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1381ac0 // fmla za.s[x8, 0], { z22.s-z25.s }, z8.s\n"
+ ".inst 0xc1311ac1 // fmla za.s[x8, 1], { z22.s-z25.s }, z1.s\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0404a86 // ld1w { z6.s-z7.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1371a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z7.s\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ ".inst 0xa1414a81 // ld1w { z1.s, z9.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc1391a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z9.s\n"
+ "add x27, x27, x25, LSL #2\n"
+ "ld1w { z26.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ ".inst 0xa1404a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13c1ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z12.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xa1414a84 // ld1w { z4.s, z12.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "addvl x20, x20, #5\n"
+ ".inst 0xc13c1ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z12.s\n"
+ "ld1w { z21.s }, p0/Z, [x21]\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ ".inst 0xa0404a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1311a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z1.s\n"
+ ".inst 0xa0414a80 // ld1w { z0.s-z1.s }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "ldr x15, [%x[args], %[offsetof_Args_weights]]\n"
+ ".inst 0xc1311a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z1.s\n"
+ ".inst 0xa14049e4 // ld1w { z4.s, z12.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xa14049e7 // ld1w { z7.s, z15.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x15, #4, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ "20:" // Main loop skip tail
+ "cbz x16, 21f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x14]\n"
+ "add x20, x14, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1391a00 // fmla za.s[x8, 0], { z16.s-z19.s }, z9.s\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1361ae0 // fmla za.s[x8, 0], { z23.s-z26.s }, z6.s\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc13a1a01 // fmla za.s[x8, 1], { z16.s-z19.s }, z10.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1341a02 // fmla za.s[x8, 2], { z16.s-z19.s }, z4.s\n"
+ ".inst 0xa04049ea // ld1w { z10.s-z11.s }, pn10.b/Z, [x15]\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xa04149ee // ld1w { z14.s-z15.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc1381a20 // fmla za.s[x8, 0], { z17.s-z20.s }, z8.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1301ae1 // fmla za.s[x8, 1], { z23.s-z26.s }, z0.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z9.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1371ae2 // fmla za.s[x8, 2], { z23.s-z26.s }, z7.s\n"
+ ".inst 0xa04049e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1391b00 // fmla za.s[x8, 0], { z24.s-z27.s }, z9.s\n"
+ ".inst 0xa14149e5 // ld1w { z5.s, z13.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ "addvl x15, x15, #5\n"
+ ".inst 0xc13e1a21 // fmla za.s[x8, 1], { z17.s-z20.s }, z14.s\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0xc13a1a22 // fmla za.s[x8, 2], { z17.s-z20.s }, z10.s\n"
+ "ld1w { z8.s }, p2/Z, [x15, #4, MUL VL]\n"
+ ".inst 0xc1381a40 // fmla za.s[x8, 0], { z18.s-z21.s }, z8.s\n"
+ ".inst 0xc1351b01 // fmla za.s[x8, 1], { z24.s-z27.s }, z5.s\n"
+ ".inst 0xa04049e8 // ld1w { z8.s-z9.s }, pn10.b/Z, [x15]\n"
+ ".inst 0xc1301b02 // fmla za.s[x8, 2], { z24.s-z27.s }, z0.s\n"
+ ".inst 0xa04149e0 // ld1w { z0.s-z1.s }, pn10.b/Z, [x15, #0x2, MUL VL]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a3c858 // fclamp { z24.s-z27.s }, z2.s, z3.s\n"
+ "st1w { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc1301a41 // fmla za.s[x8, 1], { z18.s-z21.s }, z0.s\n"
+ "st1w { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc1381a42 // fmla za.s[x8, 2], { z18.s-z21.s }, z8.s\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "21:" // Tail input: End
+ "cbz x13, 23f\n"
+ "22:" // Right padding loop
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc1a3c848 // fclamp { z8.s-z11.s }, z2.s, z3.s\n"
+ "st1w { z8.s }, p1, [x11]\n"
+ "add x11, x11, x9, LSL #2\n"
+ ".inst 0xc0040f84 // mova za.d[x8, #4], { z28.d-z31.d }\n"
+ "st1w { z9.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z10.s }, p1, [x27]\n"
+ "add x27, x27, x25, LSL #2\n"
+ "st1w { z11.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "bgt 22b\n"
+ "23:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..50ef6c3815
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(  // SME2 planar depthwise kernel, 3x3, stride 1, 4 output rows/pass; defined in the matching generic.cpp
+ const float *inptr,  // first input element for this tile
+ size_t ld_in_row,  // input stride between rows, in elements
+ size_t ld_in_col,  // input stride between columns, in elements
+ size_t ld_in_vl,  // input stride between vector-length channel blocks, in elements
+ unsigned int pad_top,  // implicit zero-padding rows above the input
+ unsigned int valid_input_rows,  // number of readable input rows
+ unsigned int pad_left,  // implicit zero-padding columns left of the input
+ unsigned int valid_input_cols,  // number of readable input columns
+ const float *weights,  // packed 3x3 kernel weights
+ const float *bias,  // per-channel bias; may be nullptr (kernel tests for null)
+ float **outptrs,  // one output pointer per output row
+ const size_t *outlds,  // per-row output column strides
+ const size_t *outvllds,  // per-row output vector-length strides
+ unsigned int output_cols,  // number of output columns to produce
+ unsigned int start_channel,  // first channel index handled by this call
+ unsigned int valid_channels,  // total channel count (loop bound)
+ float act_min,  // activation clamp lower bound
+ float act_max  // activation clamp upper bound
+);
+
+class sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za : public PlanarStrategy<float, float>  // Strategy wrapper exposing the SME2 s1 kernel to the planar depthwise framework
+{
+ using Parent = PlanarStrategy<float, float>;  // fp32 input, fp32 output (bf16 used internally by the kernel)
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;  // rows produced per kernel invocation
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;  // 3x3 depthwise filter
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;  // unit stride variant
+ constexpr static auto vl_type = arm_gemm::VLType::SME;  // vector length governed by SME streaming mode
+
+ sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za(const CPUInfo *)  // CPUInfo unused: geometry is fixed for this specialisation
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override  // hand the framework the raw assembly entry point
+ {
+ return sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..be82e04613
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,560 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za_impl(  // SME2 kernel: 3x3 stride-1 planar depthwise conv, fp32 I/O, bf16 BFDOT accumulation into ZA, 4 output rows per pass
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args  // argument block; the assembly reads fields via offsetof() constants bound below
+ {
+ const float *inptr;  // updated in-place by the asm epilogue to step to the next channel block
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;  // also advanced by the asm epilogue (incb MUL #9 = 9 kernel taps)
+ const float *bias;  // nullptr => no bias (asm: cbz)
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };  // pad_bottom = of the 6 rows consumed (4 outputs + 2 halo), those covered neither by top padding nor by valid input
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"  // x7 = pad_bottom
+ "mov x20, #0x6\n"
+ ".inst 0xd503477f // SMSTART ZA\n"  // enter streaming mode with ZA enabled
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"  // x17 = pad_top
+ "ptrue p2.b\n"
+ "ld1rw { z25.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"  // z25 = broadcast clamp_min
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"  // p1 = active-channel predicate for this vector block
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"  // z13 = broadcast clamp_max
+ "whilelt p8.s, XZR, x17\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"  // p8 = rows that are valid (not in top/bottom padding)
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z26.s, #0x0\n"
+ "cbz x20, 2f\n"  // skip bias load when bias == nullptr
+ "ld1w { z26.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"  // load and bf16-convert the 3x3 weights into paired z registers for BFDOT
+ "mov x20, x21\n"
+ "fmov z6.s, #0x0\n"
+ "ld1w { z15.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "incb x21\n"
+ "ld1w { z29.s }, p2/Z, [x20]\n"
+ ".inst 0x648aa9e6 // bfcvtnt z6.h, p2/M, z15.s\n"
+ "incb x20, ALL, MUL #3\n"
+ "ld1w { z30.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x658aa9e5 // bfcvt z5.h, p2/M, z15.s\n"
+ "ld1w { z14.s }, p2/Z, [x20]\n"
+ ".inst 0x658aaba8 // bfcvt z8.h, p2/M, z29.s\n"
+ "fmov z11.s, #0x0\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aa9ca // bfcvt z10.h, p2/M, z14.s\n"
+ ".inst 0x648aaba5 // bfcvtnt z5.h, p2/M, z29.s\n"
+ "incb x21\n"
+ "ld1w { z24.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x648aabc8 // bfcvtnt z8.h, p2/M, z30.s\n"
+ ".inst 0x658aabcc // bfcvt z12.h, p2/M, z30.s\n"
+ "ld1w { z28.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"  // NOTE(review): self-move is a no-op; appears to be a code-generator artifact — confirm against the kernel generator
+ ".inst 0x648aa9cb // bfcvtnt z11.h, p2/M, z14.s\n"
+ "ld1w { z20.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ ".inst 0x648aab0a // bfcvtnt z10.h, p2/M, z24.s\n"
+ ".inst 0x658aab09 // bfcvt z9.h, p2/M, z24.s\n"
+ "ld1w { z15.s }, p2/Z, [x21]\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "incb x21, ALL, MUL #3\n"
+ "fmov z14.s, #0x0\n"
+ ".inst 0x658aaa81 // bfcvt z1.h, p2/M, z20.s\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0x658aa9e7 // bfcvt z7.h, p2/M, z15.s\n"
+ ".inst 0x648aab89 // bfcvtnt z9.h, p2/M, z28.s\n"
+ "sub x20, x14, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"  // pack prefetch stream descriptor (count | stride) for RPRFM
+ ".inst 0x658aab84 // bfcvt z4.h, p2/M, z28.s\n"
+ "ld1w { z29.s }, p2/Z, [x21]\n"
+ "orr x23, x16, x23, LSL #20\n"
+ "mov x22, #0x6\n"
+ "add x21, x17, x7\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "mov z27.d, z26.d\n"  // duplicate bias accumulator for the second ZA pair
+ ".inst 0x648aaa8e // bfcvtnt z14.h, p2/M, z20.s\n"
+ ".inst 0x648aa9e1 // bfcvtnt z1.h, p2/M, z15.s\n"
+ ".inst 0x648aaba7 // bfcvtnt z7.h, p2/M, z29.s\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ ".inst 0x658aaba2 // bfcvt z2.h, p2/M, z29.s\n"
+ "lsl x23, x23, #0x2\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x17, x13\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"  // software-programmed streaming prefetch over the input rows
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x22, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x17, x20, x13\n"  // rewind input pointer by pad_top rows
+ ".inst 0xc0040b40 // mova za.d[x8, #0], { z26.d-z27.d }\n"  // seed all six ZA accumulator slices with the bias
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040b41 // mova za.d[x8, #1], { z26.d-z27.d }\n"
+ "mov x10, #0x2\n"
+ "ldp x9, x28, [x22], #0x10\n"  // x9/x28/x25/x24 = the four output row pointers
+ ".inst 0xc0040b42 // mova za.d[x8, #2], { z26.d-z27.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0040b43 // mova za.d[x8, #3], { z26.d-z27.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "ldp x25, x24, [x22], #0x10\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x10\n"
+ "csel x20, x21, x10, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x10, x10, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060814 // mova { z20.d-z21.d }, za.d[x8, #0]\n"  // left padding: outputs are just clamped bias
+ "sub x11, x11, x21\n"
+ ".inst 0xc0060836 // mova { z22.d-z23.d }, za.d[x8, #1]\n"
+ ".inst 0xc1adcb34 // fclamp { z20.s-z23.s }, z25.s, z13.s\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z20.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z22.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "st1w { z21.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "st1w { z23.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x17, x7\n"
+ "bne 10f\n"  // any vertical padding => take the padded path
+ "cbz x10, 8f\n"
+ "cmp x10, #0x1\n"
+ "sub x14, x14, x10\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa3e // bfcvt z30.h, p2/M, z17.s\n"  // bfcvt/bfcvtnt interleave two fp32 rows into one bf16 vector
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab9e // bfcvtnt z30.h, p2/M, z28.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa1f // bfcvt z31.h, p2/M, z16.s\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa9ff // bfcvtnt z31.h, p2/M, z15.s\n"
+ ".inst 0xc12513d0 // bfdot za.s[x8, 0], { z30.h-z31.h }, z5.h\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa00 // bfcvt z0.h, p2/M, z16.s\n"
+ ".inst 0xc12613d1 // bfdot za.s[x8, 1], { z30.h-z31.h }, z6.h\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa9e0 // bfcvtnt z0.h, p2/M, z15.s\n"
+ ".inst 0xc12c13f0 // bfdot za.s[x8, 0], { z31.h-z0.h }, z12.h\n"
+ ".inst 0xc12813f1 // bfdot za.s[x8, 1], { z31.h-z0.h }, z8.h\n"
+ "7:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z31.s }, p1/Z, [x13]\n"
+ ".inst 0x658aabef // bfcvt z15.h, p2/M, z31.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa30 // bfcvtnt z16.h, p2/M, z17.s\n"
+ ".inst 0xc12a11f0 // bfdot za.s[x8, 0], { z15.h-z16.h }, z10.h\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaad1 // bfcvt z17.h, p2/M, z22.s\n"
+ ".inst 0xc12b11f1 // bfdot za.s[x8, 1], { z15.h-z16.h }, z11.h\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa51 // bfcvtnt z17.h, p2/M, z18.s\n"
+ ".inst 0xc12511f2 // bfdot za.s[x8, 2], { z15.h-z16.h }, z5.h\n"
+ ".inst 0xc12611f3 // bfdot za.s[x8, 3], { z15.h-z16.h }, z6.h\n"
+ ".inst 0xc1241210 // bfdot za.s[x8, 0], { z16.h-z17.h }, z4.h\n"
+ ".inst 0xc1291211 // bfdot za.s[x8, 1], { z16.h-z17.h }, z9.h\n"
+ ".inst 0xc12c1212 // bfdot za.s[x8, 2], { z16.h-z17.h }, z12.h\n"
+ ".inst 0xc1281213 // bfdot za.s[x8, 3], { z16.h-z17.h }, z8.h\n"
+ "8:" // Unpadded: 0 priming loads
+ "cbz x14, 16f\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa817 // bfcvt z23.h, p2/M, z0.s\n"
+ "cmp x14, x11\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x21, x14, x11, LT\n"  // x21 = min(remaining input cols, remaining output cols)
+ ".inst 0x648aab17 // bfcvtnt z23.h, p2/M, z24.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa818 // bfcvt z24.h, p2/M, z0.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "9:" // Unpadded: Main loop
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x13]\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ "ld1w { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ ".inst 0x648aaa96 // bfcvtnt z22.h, p2/M, z20.s\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ ".inst 0x658aaa77 // bfcvt z23.h, p2/M, z19.s\n"
+ ".inst 0x658aaa38 // bfcvt z24.h, p2/M, z17.s\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"  // drain two finished ZA slices for this output column
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"  // apply activation clamp [act_min, act_max]
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"  // re-seed the freed slices with bias
+ "st1w { z17.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 9b\n"
+ "b 15f\n"
+ "10:" // Padded
+ "cbz x10, 13f\n"
+ "cmp x10, #0x1\n"
+ "sub x14, x14, x10\n"
+ "beq 12f\n"
+ "11:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"  // p0 = channel predicate masked by row-validity bit w12 of p8
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1251290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z5.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaaf6 // bfcvt z22.h, p2/M, z23.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ ".inst 0xc1261291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z6.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12c12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z12.h\n"
+ ".inst 0xc12812b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z8.h\n"
+ "12:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12a1270 // bfdot za.s[x8, 0], { z19.h-z20.h }, z10.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9f5 // bfcvt z21.h, p2/M, z15.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ ".inst 0xc12b1271 // bfdot za.s[x8, 1], { z19.h-z20.h }, z11.h\n"
+ ".inst 0xc1251272 // bfdot za.s[x8, 2], { z19.h-z20.h }, z5.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1261273 // bfdot za.s[x8, 3], { z19.h-z20.h }, z6.h\n"
+ ".inst 0xc1241290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z4.h\n"
+ ".inst 0xc1291291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z9.h\n"
+ ".inst 0xc12c1292 // bfdot za.s[x8, 2], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc1281293 // bfdot za.s[x8, 3], { z20.h-z21.h }, z8.h\n"
+ "13:" // Padded: 0 priming loads
+ "cbz x14, 16f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "sub x14, x14, #0x1\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "sub x11, x11, #0x1\n"
+ "cmp x14, x11\n"
+ "csel x21, x14, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "14:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z20.s }, p0/Z, [x13]\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0x658aaa96 // bfcvt z22.h, p2/M, z20.s\n"
+ ".inst 0x648aaa76 // bfcvtnt z22.h, p2/M, z19.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ ".inst 0x658aaa37 // bfcvt z23.h, p2/M, z17.s\n"
+ ".inst 0x658aaa18 // bfcvt z24.h, p2/M, z16.s\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z17.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 14b\n"
+ "15:" // Main loop tail
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc12e12d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12212f0 // bfdot za.s[x8, 0], { z23.h-z24.h }, z2.h\n"
+ ".inst 0xc12712f1 // bfdot za.s[x8, 1], { z23.h-z24.h }, z7.h\n"
+ ".inst 0xc12a12d2 // bfdot za.s[x8, 2], { z22.h-z23.h }, z10.h\n"
+ ".inst 0xc12b12d3 // bfdot za.s[x8, 3], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc12512d4 // bfdot za.s[x8, 4], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc12612d5 // bfdot za.s[x8, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xc0060810 // mova { z16.d-z17.d }, za.d[x8, #0]\n"
+ ".inst 0xc0060832 // mova { z18.d-z19.d }, za.d[x8, #1]\n"
+ ".inst 0xc1adcb30 // fclamp { z16.s-z19.s }, z25.s, z13.s\n"
+ "st1w { z16.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc12412f2 // bfdot za.s[x8, 2], { z23.h-z24.h }, z4.h\n"
+ "st1w { z18.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc12912f3 // bfdot za.s[x8, 3], { z23.h-z24.h }, z9.h\n"
+ "st1w { z17.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc12c12f4 // bfdot za.s[x8, 4], { z23.h-z24.h }, z12.h\n"
+ "st1w { z19.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ ".inst 0xc12812f5 // bfdot za.s[x8, 5], { z23.h-z24.h }, z8.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "16:" // Main loop skip tail
+ "cbz x11, 18f\n"
+ "17:" // Right padding loop
+ ".inst 0xc006081c // mova { z28.d-z29.d }, za.d[x8, #0]\n"  // flush remaining accumulated columns past the valid input
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc006083e // mova { z30.d-z31.d }, za.d[x8, #1]\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1adcb3c // fclamp { z28.s-z31.s }, z25.s, z13.s\n"
+ "st1w { z28.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z30.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ ".inst 0xc0040b44 // mova za.d[x8, #4], { z26.d-z27.d }\n"
+ "st1w { z29.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0xc0040b45 // mova za.d[x8, #5], { z26.d-z27.d }\n"
+ "st1w { z31.s }, p1, [x24]\n"
+ "add x24, x24, x22, LSL #2\n"
+ "bgt 17b\n"
+ "18:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"  // epilogue: advance weights/input/output pointers in the Args block for the next channel block
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"  // refresh channel predicate; drives the b.any loop-back below
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"  // loop while any channel remains active
+ ".inst 0xd503467f // SMSTOP\n"  // leave streaming mode / disable ZA
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..e685884762
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(  // SME2 planar depthwise kernel, 3x3, stride 2, 4 output rows/pass; defined in the matching generic.cpp
+ const float *inptr,  // first input element for this tile
+ size_t ld_in_row,  // input stride between rows, in elements
+ size_t ld_in_col,  // input stride between columns, in elements
+ size_t ld_in_vl,  // input stride between vector-length channel blocks, in elements
+ unsigned int pad_top,  // implicit zero-padding rows above the input
+ unsigned int valid_input_rows,  // number of readable input rows
+ unsigned int pad_left,  // implicit zero-padding columns left of the input
+ unsigned int valid_input_cols,  // number of readable input columns
+ const float *weights,  // packed 3x3 kernel weights
+ const float *bias,  // per-channel bias; may be nullptr
+ float **outptrs,  // one output pointer per output row
+ const size_t *outlds,  // per-row output column strides
+ const size_t *outvllds,  // per-row output vector-length strides
+ unsigned int output_cols,  // number of output columns to produce
+ unsigned int start_channel,  // first channel index handled by this call
+ unsigned int valid_channels,  // total channel count (loop bound)
+ float act_min,  // activation clamp lower bound
+ float act_max  // activation clamp upper bound
+);
+
+class sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za : public PlanarStrategy<float, float>  // Strategy wrapper exposing the SME2 s2 kernel to the planar depthwise framework
+{
+ using Parent = PlanarStrategy<float, float>;  // fp32 input, fp32 output (bf16 used internally by the kernel)
+
+ public:
+ using return_type = float;
+ constexpr static auto output_rows = 4u;  // rows produced per kernel invocation
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;  // 3x3 depthwise filter
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;  // stride-2 variant
+ constexpr static auto vl_type = arm_gemm::VLType::SME;  // vector length governed by SME streaming mode
+
+ sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za(const CPUInfo *)  // CPUInfo unused: geometry is fixed for this specialisation
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override  // hand the framework the raw assembly entry point
+ {
+ return sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..a3b9ca402a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x7, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x9\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x7\n"
+ "ldr x17, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ "ld1rw { z4.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x16\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z1.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x17\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ldr x15, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z24.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z24.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "incb x21\n"
+ "ld1w { z23.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aaa4e // bfcvt z14.h, p2/M, z18.s\n"
+ "ld1w { z6.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x648aaaee // bfcvtnt z14.h, p2/M, z23.s\n"
+ "incb x21\n"
+ "ld1w { z28.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ ".inst 0x658aa8c3 // bfcvt z3.h, p2/M, z6.s\n"
+ ".inst 0x658aab88 // bfcvt z8.h, p2/M, z28.s\n"
+ "ld1w { z10.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #3\n"
+ "ldr x14, [%x[args], %[offsetof_Args_input_cols]]\n"
+ ".inst 0x648aa948 // bfcvtnt z8.h, p2/M, z10.s\n"
+ "ld1w { z2.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x658aa847 // bfcvt z7.h, p2/M, z2.s\n"
+ "ldr x13, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z9.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ ".inst 0x658aa920 // bfcvt z0.h, p2/M, z9.s\n"
+ "sub x20, x14, #0x1\n"
+ "ld1w { z6.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #3\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "mov z25.d, z24.d\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "orr x23, x16, x23, LSL #20\n"
+ "mov x22, #0x9\n"
+ "mov z26.d, z24.d\n"
+ "add x21, x17, x7\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "mov z27.d, z24.d\n"
+ ".inst 0x648aa8c0 // bfcvtnt z0.h, p2/M, z6.s\n"
+ ".inst 0x658aaa26 // bfcvt z6.h, p2/M, z17.s\n"
+ "mov x8, #0x0\n"
+ "ldr x11, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x17, x13\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x13, x17, x20, x13\n"
+ ".inst 0xc0040f00 // mova za.d[x8, #0], { z24.d-z27.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f01 // mova za.d[x8, #1], { z24.d-z27.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x26, x25, [x23], #0x10\n"
+ "ldp x24, x23, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
+ ".inst 0xc1a1c890 // fclamp { z16.s-z19.s }, z4.s, z1.s\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x11, x11, x21\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z16.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z17.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z19.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x17, x7\n"
+ "bne 10f\n"
+ "cbz x22, 8f\n"
+ "cmp x22, #0x1\n"
+ "sub x14, x14, x22\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 2 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa53 // bfcvt z19.h, p2/M, z18.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa993 // bfcvtnt z19.h, p2/M, z12.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaaf4 // bfcvt z20.h, p2/M, z23.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa854 // bfcvtnt z20.h, p2/M, z2.s\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9f5 // bfcvt z21.h, p2/M, z15.s\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaad5 // bfcvtnt z21.h, p2/M, z22.s\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabd6 // bfcvt z22.h, p2/M, z30.s\n"
+ "ld1w { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa996 // bfcvtnt z22.h, p2/M, z12.s\n"
+ ".inst 0xc13e1270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z14.h\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ ".inst 0x658aabf7 // bfcvt z23.h, p2/M, z31.s\n"
+ ".inst 0xc1331290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z3.h\n"
+ "7:" // Unpadded: 1 priming loads
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa30 // bfcvt z16.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaad0 // bfcvtnt z16.h, p2/M, z22.s\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab91 // bfcvt z17.h, p2/M, z28.s\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa51 // bfcvtnt z17.h, p2/M, z18.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa852 // bfcvt z18.h, p2/M, z2.s\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
+ "ld1w { z2.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa853 // bfcvt z19.h, p2/M, z2.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaaf3 // bfcvtnt z19.h, p2/M, z23.s\n"
+ ".inst 0xc1381210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z8.h\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ ".inst 0x658aa954 // bfcvt z20.h, p2/M, z10.s\n"
+ ".inst 0xc1371230 // bfdot za.s[x8, 0], { z17.h-z20.h }, z7.h\n"
+ "8:" // Unpadded: 0 priming loads
+ "cmp x14, #0x2\n"
+ "blt 16f\n"
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "lsr x20, x14, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x11\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "csel x22, x20, x11, LT\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "and x14, x14, #0x1\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "sub x11, x11, x22\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "cbz x22, 15f\n"
+ "9:" // Unpadded: Main loop
+ "add x21, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "ld1w { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
+ ".inst 0x658aaa4a // bfcvt z10.h, p2/M, z18.s\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabcb // bfcvt z11.h, p2/M, z30.s\n"
+ ".inst 0x648aa9e9 // bfcvtnt z9.h, p2/M, z15.s\n"
+ "ld1w { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa2a // bfcvtnt z10.h, p2/M, z17.s\n"
+ ".inst 0x648aaa6b // bfcvtnt z11.h, p2/M, z19.s\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "ld1w { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa84c // bfcvtnt z12.h, p2/M, z2.s\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z29.s }, p1/Z, [x13]\n"
+ ".inst 0xc1381130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z8.h\n"
+ ".inst 0x658aaba9 // bfcvt z9.h, p2/M, z29.s\n"
+ "subs x22, x22, #0x1\n"
+ "ld1w { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a1c890 // fclamp { z16.s-z19.s }, z4.s, z1.s\n"
+ "st1w { z16.s }, p1, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0xc1371150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z7.h\n"
+ "add x10, x10, x28, LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab8a // bfcvt z10.h, p2/M, z28.s\n"
+ "st1w { z17.s }, p1, [x9]\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z18.s }, p1, [x26]\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ "add x26, x26, x24, LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z19.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaac9 // bfcvtnt z9.h, p2/M, z22.s\n"
+ ".inst 0x648aabea // bfcvtnt z10.h, p2/M, z31.s\n"
+ "ld1w { z31.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa2b // bfcvtnt z11.h, p2/M, z17.s\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aabed // bfcvt z13.h, p2/M, z31.s\n"
+ "bgt 9b\n"
+ "b 15f\n"
+ "10:" // Padded
+ "cbz x22, 13f\n"
+ "cmp x22, #0x1\n"
+ "sub x14, x14, x22\n"
+ "beq 12f\n"
+ "11:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa49 // bfcvtnt z9.h, p2/M, z18.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ ".inst 0x658aa98a // bfcvt z10.h, p2/M, z12.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa98a // bfcvtnt z10.h, p2/M, z12.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa4b // bfcvt z11.h, p2/M, z18.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ ".inst 0xc13e1130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z14.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1331150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z3.h\n"
+ "12:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa70 // bfcvtnt z16.h, p2/M, z19.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9b1 // bfcvt z17.h, p2/M, z13.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z12.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa991 // bfcvtnt z17.h, p2/M, z12.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa932 // bfcvt z18.h, p2/M, z9.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z11.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aa972 // bfcvtnt z18.h, p2/M, z11.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaab3 // bfcvt z19.h, p2/M, z21.s\n"
+ ".inst 0xc13811f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z8.h\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1371210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z7.h\n"
+ "13:" // Padded: 0 priming loads
+ "cmp x14, #0x2\n"
+ "blt 16f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "sub x14, x14, #0x2\n"
+ "sub x11, x11, #0x1\n"
+ "lsr x20, x14, #0x1\n"
+ "cmp x20, x11\n"
+ "csel x21, x20, x11, LT\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "and x14, x14, #0x1\n"
+ "sub x11, x11, x21\n"
+ "cbz x21, 15f\n"
+ "14:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z18.s }, p0/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa49 // bfcvt z9.h, p2/M, z18.s\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa84b // bfcvt z11.h, p2/M, z2.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa29 // bfcvtnt z9.h, p2/M, z17.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab8c // bfcvt z12.h, p2/M, z28.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa6a // bfcvtnt z10.h, p2/M, z19.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z13.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa9eb // bfcvtnt z11.h, p2/M, z15.s\n"
+ "mov x12, #0x0\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9ad // bfcvt z13.h, p2/M, z13.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z21.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1a1c89c // fclamp { z28.s-z31.s }, z4.s, z1.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1w { z20.s }, p0/Z, [x20]\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "add x8, x8, #0x1\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z8.h\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z18.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1371150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z7.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa2a // bfcvt z10.h, p2/M, z17.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa6b // bfcvt z11.h, p2/M, z19.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "subs x21, x21, #0x1\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "add x9, x9, x27, LSL #2\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0x648aaaa9 // bfcvtnt z9.h, p2/M, z21.s\n"
+ ".inst 0x648aaa8a // bfcvtnt z10.h, p2/M, z20.s\n"
+ "add x25, x25, x23, LSL #2\n"
+ ".inst 0x648aaa4b // bfcvtnt z11.h, p2/M, z18.s\n"
+ ".inst 0x648aaa2c // bfcvtnt z12.h, p2/M, z17.s\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ ".inst 0x658aaa0d // bfcvt z13.h, p2/M, z16.s\n"
+ "bgt 14b\n"
+ "15:" // Main loop tail
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z17.s }, p0/Z, [x13]\n"
+ ".inst 0xc1301130 // bfdot za.s[x8, 0], { z9.h-z12.h }, z0.h\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ ".inst 0xc13e1131 // bfdot za.s[x8, 1], { z9.h-z12.h }, z14.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc1361150 // bfdot za.s[x8, 0], { z10.h-z13.h }, z6.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z23.s }, p0/Z, [x20]\n"
+ ".inst 0xc1331151 // bfdot za.s[x8, 1], { z10.h-z13.h }, z3.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa32 // bfcvt z18.h, p2/M, z17.s\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa852 // bfcvtnt z18.h, p2/M, z2.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaaf3 // bfcvtnt z19.h, p2/M, z23.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0x648aa9f4 // bfcvtnt z20.h, p2/M, z15.s\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ ".inst 0xc1381250 // bfdot za.s[x8, 0], { z18.h-z21.h }, z8.h\n"
+ ".inst 0xc1a1c89c // fclamp { z28.s-z31.s }, z4.s, z1.s\n"
+ "st1w { z28.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z29.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "add x13, x13, %x[ld_in_col], LSL #2\n"
+ "st1w { z30.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc1371270 // bfdot za.s[x8, 0], { z19.h-z22.h }, z7.h\n"
+ "st1w { z31.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "16:" // Main loop skip tail
+ "cbz x14, 17f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x13]\n"
+ ".inst 0x658aaa0f // bfcvt z15.h, p2/M, z16.s\n"
+ "add x20, x13, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa0f // bfcvtnt z15.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa10 // bfcvt z16.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x648aa850 // bfcvtnt z16.h, p2/M, z2.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa951 // bfcvt z17.h, p2/M, z10.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aabd1 // bfcvtnt z17.h, p2/M, z30.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa72 // bfcvt z18.h, p2/M, z19.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0x648aaa72 // bfcvtnt z18.h, p2/M, z19.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa73 // bfcvt z19.h, p2/M, z19.s\n"
+ ".inst 0xc13011f0 // bfdot za.s[x8, 0], { z15.h-z18.h }, z0.h\n"
+ "sub x11, x11, #0x1\n"
+ ".inst 0xc1361210 // bfdot za.s[x8, 0], { z16.h-z19.h }, z6.h\n"
+ ".inst 0xc13e11f1 // bfdot za.s[x8, 1], { z15.h-z18.h }, z14.h\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a1c888 // fclamp { z8.s-z11.s }, z4.s, z1.s\n"
+ "st1w { z8.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc1331211 // bfdot za.s[x8, 1], { z16.h-z19.h }, z3.h\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z10.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "st1w { z11.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "17:" // Tail input: End
+ "cbz x11, 19f\n"
+ "18:" // Right padding loop
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x11, x11, #0x1\n"
+ ".inst 0xc1a1c888 // fclamp { z8.s-z11.s }, z4.s, z1.s\n"
+ "st1w { z8.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc0040f02 // mova za.d[x8, #2], { z24.d-z27.d }\n"
+ "st1w { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z10.s }, p1, [x26]\n"
+ "add x26, x26, x24, LSL #2\n"
+ "st1w { z11.s }, p1, [x25]\n"
+ "add x25, x25, x23, LSL #2\n"
+ "bgt 18b\n"
+ "19:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x15\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x15, x16\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..5215ccaf39
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Forward declaration of the generated SME2 kernel (defined in the
+// accompanying generic.cpp).  Planar depthwise convolution, 5x5 kernel,
+// stride 1, producing 4 output rows per invocation; fp32 in/out with
+// bf16 intermediate dot products (the implementation converts via
+// BFCVT and accumulates with BFDOT into the ZA array).
+void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
+  const float *inptr,            // Base pointer to the input tensor tile
+  size_t ld_in_row,              // Input stride between rows (in elements)
+  size_t ld_in_col,              // Input stride between columns (in elements)
+  size_t ld_in_vl,               // Input stride between vector-length channel blocks
+  unsigned int pad_top,          // Rows of implicit zero padding above the input
+  unsigned int valid_input_rows, // Number of readable input rows
+  unsigned int pad_left,         // Columns of implicit zero padding left of the input
+  unsigned int valid_input_cols, // Number of readable input columns
+  const float *weights,          // Packed 5x5 depthwise weights (fp32; converted to bf16 internally)
+  const float *bias,             // Per-channel bias, or nullptr for no bias
+  float **outptrs,               // One output row pointer per output row produced
+  const size_t *outlds,          // Per-output-row column strides
+  const size_t *outvllds,        // Per-output-row vector-length block strides
+  unsigned int output_cols,      // Number of output columns to compute
+  unsigned int start_channel,    // Index of the first channel handled by this call
+  unsigned int valid_channels,   // Number of valid channels from start_channel
+  float act_min,                 // Activation clamp lower bound
+  float act_max                  // Activation clamp upper bound
+);
+
+// Strategy wrapper exposing the generated SME2 kernel to the planar
+// depthwise framework.  The class only records the kernel's static shape
+// parameters and hands back the implementation function pointer; all real
+// work happens in sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl.
+class sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za : public PlanarStrategy<float, float>
+{
+  using Parent = PlanarStrategy<float, float>;
+
+  public:
+  using return_type = float;
+  // Static kernel geometry: 5x5 window, unit stride, 4 output rows per call.
+  constexpr static auto output_rows = 4u;
+  constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+  constexpr static auto stride_rows = 1u, stride_cols = 1u;
+  // Vector length is determined by the SME streaming vector length at runtime.
+  constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+  // CPUInfo is unused: kernel selection happens before construction.
+  sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za(const CPUInfo *)
+  : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  // Return the generated assembly kernel entry point.
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..b72042558d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,1151 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x4, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0x8\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x4\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z29.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z28.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "addvl SP, SP, #-30\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "1:" // Channel loop
+ "ldr x21, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z30.s, #0x0\n"
+ "cbz x21, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ld1w { z24.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "fmov z11.s, #0x0\n"
+ "incb x21\n"
+ "ld1w { z3.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa99a // bfcvt z26.h, p2/M, z12.s\n"
+ ".inst 0x658aab10 // bfcvt z16.h, p2/M, z24.s\n"
+ "ld1w { z20.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "addvl x24, SP, #30\n"
+ ".inst 0x648aa98b // bfcvtnt z11.h, p2/M, z12.s\n"
+ "ld1w { z25.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ ".inst 0x658aa875 // bfcvt z21.h, p2/M, z3.s\n"
+ "addvl x24, x24, #-6\n"
+ "ld1w { z6.s }, p2/Z, [x20]\n"
+ ".inst 0x658aaa9b // bfcvt z27.h, p2/M, z20.s\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x24]\n"
+ ".inst 0x648aab1a // bfcvtnt z26.h, p2/M, z24.s\n"
+ "ld1w { z14.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "fmov z11.s, #0x0\n"
+ "st1h { z26.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x648aa870 // bfcvtnt z16.h, p2/M, z3.s\n"
+ "ld1w { z19.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa8c9 // bfcvt z9.h, p2/M, z6.s\n"
+ ".inst 0x648aaa95 // bfcvtnt z21.h, p2/M, z20.s\n"
+ "incb x21\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z16.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aab3b // bfcvtnt z27.h, p2/M, z25.s\n"
+ ".inst 0x658aab37 // bfcvt z23.h, p2/M, z25.s\n"
+ "ld1w { z5.s }, p2/Z, [x20]\n"
+ ".inst 0x658aa9c8 // bfcvt z8.h, p2/M, z14.s\n"
+ "mov x23, x21\n"
+ "st1h { z21.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x648aa8cb // bfcvtnt z11.h, p2/M, z6.s\n"
+ ".inst 0x658aaa79 // bfcvt z25.h, p2/M, z19.s\n"
+ "ld1w { z4.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ "st1h { z27.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x648aa9c9 // bfcvtnt z9.h, p2/M, z14.s\n"
+ ".inst 0x658aa991 // bfcvt z17.h, p2/M, z12.s\n"
+ "incb x21\n"
+ "st1h { z23.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ "ld1w { z26.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ "st1h { z11.h }, p2, [x24]\n"
+ "fmov z2.s, #0x0\n"
+ ".inst 0x648aaa68 // bfcvtnt z8.h, p2/M, z19.s\n"
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "st1h { z9.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ ".inst 0x658aa893 // bfcvt z19.h, p2/M, z4.s\n"
+ "st1h { z8.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aa999 // bfcvtnt z25.h, p2/M, z12.s\n"
+ "ld1w { z7.s }, p2/Z, [x23]\n"
+ "incb x23, ALL, MUL #5\n"
+ ".inst 0x658aab4e // bfcvt z14.h, p2/M, z26.s\n"
+ ".inst 0x648aa8b1 // bfcvtnt z17.h, p2/M, z5.s\n"
+ "st1h { z25.h }, p2, [x24, #3, MUL VL]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ ".inst 0x658aa8ab // bfcvt z11.h, p2/M, z5.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "mov x20, x21\n"
+ ".inst 0x648aa882 // bfcvtnt z2.h, p2/M, z4.s\n"
+ ".inst 0x658aab66 // bfcvt z6.h, p2/M, z27.s\n"
+ "ld1w { z15.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z17.h }, p2, [x24, #4, MUL VL]\n"
+ "st1h { z11.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ ".inst 0x648aab53 // bfcvtnt z19.h, p2/M, z26.s\n"
+ ".inst 0x658aa8fa // bfcvt z26.h, p2/M, z7.s\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x24]\n"
+ ".inst 0x648aab6e // bfcvtnt z14.h, p2/M, z27.s\n"
+ "ld1w { z4.s }, p2/Z, [x20]\n"
+ "fmov z21.s, #0x0\n"
+ "st1h { z19.h }, p2, [x24, #1, MUL VL]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa9ea // bfcvt z10.h, p2/M, z15.s\n"
+ "st1h { z14.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x648aa8e6 // bfcvtnt z6.h, p2/M, z7.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa973 // bfcvt z19.h, p2/M, z11.s\n"
+ "st1h { z6.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x648aaa5a // bfcvtnt z26.h, p2/M, z18.s\n"
+ ".inst 0x658aaa45 // bfcvt z5.h, p2/M, z18.s\n"
+ "ld1w { z12.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ ".inst 0x658aa897 // bfcvt z23.h, p2/M, z4.s\n"
+ ".inst 0x648aa9f5 // bfcvtnt z21.h, p2/M, z15.s\n"
+ "ld1w { z24.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aa96a // bfcvtnt z10.h, p2/M, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x648aa893 // bfcvtnt z19.h, p2/M, z4.s\n"
+ ".inst 0x658aaa30 // bfcvt z16.h, p2/M, z17.s\n"
+ "ld1w { z2.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aaa37 // bfcvtnt z23.h, p2/M, z17.s\n"
+ "ld1w { z26.s }, p2/Z, [x21]\n"
+ "st1h { z5.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ "st1h { z21.h }, p2, [x24]\n"
+ ".inst 0x648aa990 // bfcvtnt z16.h, p2/M, z12.s\n"
+ "incb x21, ALL, MUL #5\n"
+ "fmov z8.s, #0x0\n"
+ "st1h { z10.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aab04 // bfcvt z4.h, p2/M, z24.s\n"
+ ".inst 0x658aa985 // bfcvt z5.h, p2/M, z12.s\n"
+ "sub x20, x25, #0x1\n"
+ "st1h { z19.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x658aa871 // bfcvt z17.h, p2/M, z3.s\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "st1h { z23.h }, p2, [x24, #3, MUL VL]\n"
+ ".inst 0x658aa857 // bfcvt z23.h, p2/M, z2.s\n"
+ "orr x23, x7, x23, LSL #20\n"
+ "mov x22, #0x8\n"
+ "st1h { z16.h }, p2, [x24, #4, MUL VL]\n"
+ ".inst 0x658aab4e // bfcvt z14.h, p2/M, z26.s\n"
+ "add x21, x6, x4\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "st1h { z5.h }, p2, [x24, #5, MUL VL]\n"
+ "addvl x24, x24, #-6\n"
+ "mov z31.d, z30.d\n"
+ ".inst 0x648aab08 // bfcvtnt z8.h, p2/M, z24.s\n"
+ "st1h { z8.h }, p2, [x24]\n"
+ ".inst 0x648aa864 // bfcvtnt z4.h, p2/M, z3.s\n"
+ ".inst 0x648aa851 // bfcvtnt z17.h, p2/M, z2.s\n"
+ "mov x11, #0x0\n"
+ "st1h { z4.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x648aab57 // bfcvtnt z23.h, p2/M, z26.s\n"
+ ".inst 0x648aab2e // bfcvtnt z14.h, p2/M, z25.s\n"
+ "mov x8, #0x8\n"
+ "st1h { z17.h }, p2, [x24, #2, MUL VL]\n"
+ ".inst 0x658aab26 // bfcvt z6.h, p2/M, z25.s\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ "st1h { z23.h }, p2, [x24, #3, MUL VL]\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
+ "st1h { z14.h }, p2, [x24, #4, MUL VL]\n"
+ "st1h { z6.h }, p2, [x24, #5, MUL VL]\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046bc0 // mova za.d[x11, #0], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0046bc1 // mova za.d[x11, #1], { z30.d-z31.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046bc2 // mova za.d[x11, #2], { z30.d-z31.d }\n"
+ "ldp x5, x10, [x20], #0x10\n"
+ ".inst 0xc0046bc3 // mova za.d[x11, #3], { z30.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046bc4 // mova za.d[x11, #4], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046bc5 // mova za.d[x11, #5], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0046bc6 // mova za.d[x11, #6], { z30.d-z31.d }\n"
+ ".inst 0xc0046bc7 // mova za.d[x11, #7], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0066804 // mova { z4.d-z5.d }, za.d[x11, #0]\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc0066826 // mova { z6.d-z7.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba4 // fclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z4.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z6.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "st1w { z5.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "st1w { z7.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x6, x4\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 9f\n"
+ "cmp x22, #0x2\n"
+ "beq 8f\n"
+ "cmp x22, #0x3\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 4 priming loads
+ "add x21, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z21.s }, p1/Z, [x16]\n"
+ ".inst 0x658aaab2 // bfcvt z18.h, p2/M, z21.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa972 // bfcvtnt z18.h, p2/M, z11.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa33 // bfcvt z19.h, p2/M, z17.s\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa993 // bfcvtnt z19.h, p2/M, z12.s\n"
+ "ld1w { z7.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8f4 // bfcvt z20.h, p2/M, z7.s\n"
+ "ld1w { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa994 // bfcvtnt z20.h, p2/M, z12.s\n"
+ ".inst 0xa0402a8c // ld1h { z12.h-z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12d7250 // bfdot za.s[x11, 0], { z18.h-z19.h }, z13.h\n"
+ "ld1w { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8d5 // bfcvt z21.h, p2/M, z6.s\n"
+ ".inst 0xc12c7251 // bfdot za.s[x11, 1], { z18.h-z19.h }, z12.h\n"
+ ".inst 0xa0412a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12b7270 // bfdot za.s[x11, 0], { z19.h-z20.h }, z11.h\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ ".inst 0xc12a7271 // bfdot za.s[x11, 1], { z19.h-z20.h }, z10.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12b7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z11.h\n"
+ ".inst 0xc12a7291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z10.h\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z6.s }, p1/Z, [x16]\n"
+ ".inst 0x658aa8d7 // bfcvt z23.h, p2/M, z6.s\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa837 // bfcvtnt z23.h, p2/M, z1.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z15.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9f8 // bfcvt z24.h, p2/M, z15.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa18 // bfcvtnt z24.h, p2/M, z16.s\n"
+ "ld1w { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa839 // bfcvt z25.h, p2/M, z1.s\n"
+ "ld1w { z9.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12972f0 // bfdot za.s[x11, 0], { z23.h-z24.h }, z9.h\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa1a // bfcvt z26.h, p2/M, z16.s\n"
+ ".inst 0xc12172f1 // bfdot za.s[x11, 1], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72f2 // bfdot za.s[x11, 2], { z23.h-z24.h }, z15.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12772f3 // bfdot za.s[x11, 3], { z23.h-z24.h }, z7.h\n"
+ "ld1w { z16.s }, p1/Z, [x22]\n"
+ ".inst 0x648aaa1a // bfcvtnt z26.h, p2/M, z16.s\n"
+ ".inst 0xc1297310 // bfdot za.s[x11, 0], { z24.h-z25.h }, z9.h\n"
+ ".inst 0xc1217311 // bfdot za.s[x11, 1], { z24.h-z25.h }, z1.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7312 // bfdot za.s[x11, 2], { z24.h-z25.h }, z15.h\n"
+ ".inst 0xc1277313 // bfdot za.s[x11, 3], { z24.h-z25.h }, z7.h\n"
+ ".inst 0xc12b7330 // bfdot za.s[x11, 0], { z25.h-z26.h }, z11.h\n"
+ ".inst 0xc1237331 // bfdot za.s[x11, 1], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237332 // bfdot za.s[x11, 2], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1227333 // bfdot za.s[x11, 3], { z25.h-z26.h }, z2.h\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z24.s }, p1/Z, [x16]\n"
+ ".inst 0x658aab02 // bfcvt z2.h, p2/M, z24.s\n"
+ "addvl x22, SP, #12\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa02 // bfcvtnt z2.h, p2/M, z16.s\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa03 // bfcvt z3.h, p2/M, z16.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa03 // bfcvtnt z3.h, p2/M, z16.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa824 // bfcvt z4.h, p2/M, z1.s\n"
+ "ld1w { z19.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa64 // bfcvtnt z4.h, p2/M, z19.s\n"
+ ".inst 0xa1402ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12f7050 // bfdot za.s[x11, 0], { z2.h-z3.h }, z15.h\n"
+ "ld1w { z0.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa805 // bfcvt z5.h, p2/M, z0.s\n"
+ ".inst 0xc1277051 // bfdot za.s[x11, 1], { z2.h-z3.h }, z7.h\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7052 // bfdot za.s[x11, 2], { z2.h-z3.h }, z15.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1277053 // bfdot za.s[x11, 3], { z2.h-z3.h }, z7.h\n"
+ "ld1w { z10.s }, p1/Z, [x23]\n"
+ ".inst 0x648aa945 // bfcvtnt z5.h, p2/M, z10.s\n"
+ ".inst 0xc12e7070 // bfdot za.s[x11, 0], { z3.h-z4.h }, z14.h\n"
+ ".inst 0xa1402a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1267071 // bfdot za.s[x11, 1], { z3.h-z4.h }, z6.h\n"
+ ".inst 0xa0412aac // ld1h { z12.h-z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12f7054 // bfdot za.s[x11, 4], { z2.h-z3.h }, z15.h\n"
+ ".inst 0xa1422ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1277055 // bfdot za.s[x11, 5], { z2.h-z3.h }, z7.h\n"
+ ".inst 0xc12d7072 // bfdot za.s[x11, 2], { z3.h-z4.h }, z13.h\n"
+ ".inst 0xc12c7073 // bfdot za.s[x11, 3], { z3.h-z4.h }, z12.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1287090 // bfdot za.s[x11, 0], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xc1207091 // bfdot za.s[x11, 1], { z4.h-z5.h }, z0.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7074 // bfdot za.s[x11, 4], { z3.h-z4.h }, z15.h\n"
+ ".inst 0xc12e7075 // bfdot za.s[x11, 5], { z3.h-z4.h }, z14.h\n"
+ ".inst 0xc1277092 // bfdot za.s[x11, 2], { z4.h-z5.h }, z7.h\n"
+ ".inst 0xc1267093 // bfdot za.s[x11, 3], { z4.h-z5.h }, z6.h\n"
+ ".inst 0xa1422a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1287094 // bfdot za.s[x11, 4], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xc1207095 // bfdot za.s[x11, 5], { z4.h-z5.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z18.s }, p1/Z, [x16]\n"
+ ".inst 0x658aaa4c // bfcvt z12.h, p2/M, z18.s\n"
+ "addvl x23, SP, #6\n"
+ "ld1w { z7.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa8ec // bfcvtnt z12.h, p2/M, z7.s\n"
+ "addvl x22, SP, #12\n"
+ "ld1w { z20.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa8d // bfcvt z13.h, p2/M, z20.s\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z0.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa80d // bfcvtnt z13.h, p2/M, z0.s\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z10.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa94e // bfcvt z14.h, p2/M, z10.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa80e // bfcvtnt z14.h, p2/M, z0.s\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1217190 // bfdot za.s[x11, 0], { z12.h-z13.h }, z1.h\n"
+ "ld1w { z17.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa2f // bfcvt z15.h, p2/M, z17.s\n"
+ ".inst 0xc1207191 // bfdot za.s[x11, 1], { z12.h-z13.h }, z0.h\n"
+ ".inst 0xa0402aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12b7192 // bfdot za.s[x11, 2], { z12.h-z13.h }, z11.h\n"
+ ".inst 0xa0412ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12a7193 // bfdot za.s[x11, 3], { z12.h-z13.h }, z10.h\n"
+ "ld1w { z18.s }, p1/Z, [x24]\n"
+ ".inst 0x648aaa4f // bfcvtnt z15.h, p2/M, z18.s\n"
+ ".inst 0xc12171b0 // bfdot za.s[x11, 0], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12071b1 // bfdot za.s[x11, 1], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa0412ac6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12a7194 // bfdot za.s[x11, 4], { z12.h-z13.h }, z10.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc1227195 // bfdot za.s[x11, 5], { z12.h-z13.h }, z2.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12771b2 // bfdot za.s[x11, 2], { z13.h-z14.h }, z7.h\n"
+ ".inst 0xc12671b3 // bfdot za.s[x11, 3], { z13.h-z14.h }, z6.h\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12b71d0 // bfdot za.s[x11, 0], { z14.h-z15.h }, z11.h\n"
+ ".inst 0xc12a71d1 // bfdot za.s[x11, 1], { z14.h-z15.h }, z10.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1297196 // bfdot za.s[x11, 6], { z12.h-z13.h }, z9.h\n"
+ ".inst 0xc1287197 // bfdot za.s[x11, 7], { z12.h-z13.h }, z8.h\n"
+ ".inst 0xc12171b4 // bfdot za.s[x11, 4], { z13.h-z14.h }, z1.h\n"
+ ".inst 0xc12071b5 // bfdot za.s[x11, 5], { z13.h-z14.h }, z0.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12a71d2 // bfdot za.s[x11, 2], { z14.h-z15.h }, z10.h\n"
+ ".inst 0xc12271d3 // bfdot za.s[x11, 3], { z14.h-z15.h }, z2.h\n"
+ ".inst 0xa0422aa6 // ld1h { z6.h-z7.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12b71b6 // bfdot za.s[x11, 6], { z13.h-z14.h }, z11.h\n"
+ ".inst 0xc12371b7 // bfdot za.s[x11, 7], { z13.h-z14.h }, z3.h\n"
+ ".inst 0xc12771d4 // bfdot za.s[x11, 4], { z14.h-z15.h }, z7.h\n"
+ ".inst 0xc12671d5 // bfdot za.s[x11, 5], { z14.h-z15.h }, z6.h\n"
+ ".inst 0xa0422a86 // ld1h { z6.h-z7.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12771d6 // bfdot za.s[x11, 6], { z14.h-z15.h }, z7.h\n"
+ ".inst 0xc12671d7 // bfdot za.s[x11, 7], { z14.h-z15.h }, z6.h\n"
+ "10:" // Unpadded: 0 priming loads
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 20f\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x16]\n"
+ ".inst 0x658aa834 // bfcvt z20.h, p2/M, z1.s\n"
+ "sub x25, x25, #0x1\n"
+ "ld1w { z10.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "sub x15, x15, #0x1\n"
+ ".inst 0x648aa954 // bfcvtnt z20.h, p2/M, z10.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "cmp x25, x15\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "csel x25, x25, x15, LT\n"
+ ".inst 0x648aaa75 // bfcvtnt z21.h, p2/M, z19.s\n"
+ "ld1w { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaaf6 // bfcvt z22.h, p2/M, z23.s\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
+ "sub x15, x15, x25\n"
+ "ld1w { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9f7 // bfcvt z23.h, p2/M, z15.s\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "cbz x25, 19f\n"
+ "11:" // Unpadded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "addvl x23, SP, #12\n"
+ "ld1w { z27.s }, p1/Z, [x16]\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402b01 // ld1h { z1.h, z9.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
+ ".inst 0xc1297292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z9.h\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1217293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z1.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ "ld1w { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc12d7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
+ "ld1w { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1257297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12e72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z14.h\n"
+ "ld1w { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc12672b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1412ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12f72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12772d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xa1422ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12e72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z14.h\n"
+ ".inst 0xc12672b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z6.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12f72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12772d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z7.h\n"
+ ".inst 0xa0422ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12f72d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc12e72d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12c1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z12.h\n"
+ ".inst 0xc1241291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z4.h\n"
+ ".inst 0x658aab74 // bfcvt z20.h, p2/M, z27.s\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc12d12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z13.h\n"
+ ".inst 0x648aab54 // bfcvtnt z20.h, p2/M, z26.s\n"
+ ".inst 0xc12512b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0x658aab35 // bfcvt z21.h, p2/M, z25.s\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc12912d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z9.h\n"
+ ".inst 0x648aab15 // bfcvtnt z21.h, p2/M, z24.s\n"
+ ".inst 0xc12112d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x658aaa76 // bfcvt z22.h, p2/M, z19.s\n"
+ ".inst 0x658aaa37 // bfcvt z23.h, p2/M, z17.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0x648aaa56 // bfcvtnt z22.h, p2/M, z18.s\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba8 // fclamp { z8.s-z11.s }, z29.s, z28.s\n"
+ "st1w { z8.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "st1w { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "bgt 11b\n"
+ "b 19f\n"
+ "12:" // Padded
+ "cbz x22, 17f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 16f\n"
+ "cmp x22, #0x2\n"
+ "beq 15f\n"
+ "cmp x22, #0x3\n"
+ "beq 14f\n"
+ "13:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa06 // bfcvt z6.h, p2/M, z16.s\n"
+ "add x21, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa06 // bfcvtnt z6.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x658aaa07 // bfcvt z7.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa07 // bfcvtnt z7.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa08 // bfcvt z8.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x20, SP, #24\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa08 // bfcvtnt z8.h, p2/M, z16.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f70d0 // bfdot za.s[x11, 0], { z6.h-z7.h }, z15.h\n"
+ "ld1w { z9.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aa929 // bfcvt z9.h, p2/M, z9.s\n"
+ ".inst 0xc12e70d1 // bfdot za.s[x11, 1], { z6.h-z7.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x21]\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ ".inst 0xc12f70f0 // bfdot za.s[x11, 0], { z7.h-z8.h }, z15.h\n"
+ ".inst 0xc12e70f1 // bfdot za.s[x11, 1], { z7.h-z8.h }, z14.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237110 // bfdot za.s[x11, 0], { z8.h-z9.h }, z3.h\n"
+ ".inst 0xc1227111 // bfdot za.s[x11, 1], { z8.h-z9.h }, z2.h\n"
+ "14:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x21, SP, #18\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa1402aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z15.h\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ "addvl x20, SP, #24\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1277131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z7.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e7132 // bfdot za.s[x11, 2], { z9.h-z10.h }, z14.h\n"
+ "ld1w { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc1267133 // bfdot za.s[x11, 3], { z9.h-z10.h }, z6.h\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0xc12f7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xa1422aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc1277151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12f7152 // bfdot za.s[x11, 2], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc12e7153 // bfdot za.s[x11, 3], { z10.h-z11.h }, z14.h\n"
+ ".inst 0xc12d7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z13.h\n"
+ ".inst 0xc1257171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z5.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12f7172 // bfdot za.s[x11, 2], { z11.h-z12.h }, z15.h\n"
+ ".inst 0xc12e7173 // bfdot za.s[x11, 3], { z11.h-z12.h }, z14.h\n"
+ "15:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa12 // bfcvt z18.h, p2/M, z16.s\n"
+ "add x23, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0x648aaa12 // bfcvtnt z18.h, p2/M, z16.s\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0x658aaa13 // bfcvt z19.h, p2/M, z16.s\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0x648aaa13 // bfcvtnt z19.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x22, SP, #12\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa1402ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc1297250 // bfdot za.s[x11, 0], { z18.h-z19.h }, z9.h\n"
+ "ld1w { z26.s }, p0/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab55 // bfcvt z21.h, p2/M, z26.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1217251 // bfdot za.s[x11, 1], { z18.h-z19.h }, z1.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12e7252 // bfdot za.s[x11, 2], { z18.h-z19.h }, z14.h\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x23]\n"
+ ".inst 0xc1267253 // bfdot za.s[x11, 3], { z18.h-z19.h }, z6.h\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f7270 // bfdot za.s[x11, 0], { z19.h-z20.h }, z15.h\n"
+ ".inst 0xc1277271 // bfdot za.s[x11, 1], { z19.h-z20.h }, z7.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa1422ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12d7254 // bfdot za.s[x11, 4], { z18.h-z19.h }, z13.h\n"
+ ".inst 0xc1257255 // bfdot za.s[x11, 5], { z18.h-z19.h }, z5.h\n"
+ ".inst 0xc12e7272 // bfdot za.s[x11, 2], { z19.h-z20.h }, z14.h\n"
+ ".inst 0xc1267273 // bfdot za.s[x11, 3], { z19.h-z20.h }, z6.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12f7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z15.h\n"
+ ".inst 0xc1277291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z7.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12d7274 // bfdot za.s[x11, 4], { z19.h-z20.h }, z13.h\n"
+ ".inst 0xc1257275 // bfdot za.s[x11, 5], { z19.h-z20.h }, z5.h\n"
+ ".inst 0xc12f7292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z15.h\n"
+ ".inst 0xc12e7293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1237294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z3.h\n"
+ ".inst 0xc1227295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z2.h\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa09 // bfcvt z9.h, p2/M, z16.s\n"
+ "add x24, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa09 // bfcvtnt z9.h, p2/M, z16.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x658aaa0a // bfcvt z10.h, p2/M, z16.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa0a // bfcvtnt z10.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0b // bfcvt z11.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "addvl x23, SP, #6\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0x648aaa0b // bfcvtnt z11.h, p2/M, z16.s\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa1402ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc12f7130 // bfdot za.s[x11, 0], { z9.h-z10.h }, z15.h\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ "addvl x22, SP, #12\n"
+ "add x24, x24, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa0c // bfcvt z12.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1277131 // bfdot za.s[x11, 1], { z9.h-z10.h }, z7.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12e7132 // bfdot za.s[x11, 2], { z9.h-z10.h }, z14.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "ld1w { z16.s }, p0/Z, [x24]\n"
+ ".inst 0xc1267133 // bfdot za.s[x11, 3], { z9.h-z10.h }, z6.h\n"
+ ".inst 0x648aaa0c // bfcvtnt z12.h, p2/M, z16.s\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f7150 // bfdot za.s[x11, 0], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277151 // bfdot za.s[x11, 1], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12d7134 // bfdot za.s[x11, 4], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc1257135 // bfdot za.s[x11, 5], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f7152 // bfdot za.s[x11, 2], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277153 // bfdot za.s[x11, 3], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e7170 // bfdot za.s[x11, 0], { z11.h-z12.h }, z14.h\n"
+ ".inst 0xc1267171 // bfdot za.s[x11, 1], { z11.h-z12.h }, z6.h\n"
+ ".inst 0xa1422ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12d7136 // bfdot za.s[x11, 6], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc1257137 // bfdot za.s[x11, 7], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xc12f7154 // bfdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277155 // bfdot za.s[x11, 5], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12e7172 // bfdot za.s[x11, 2], { z11.h-z12.h }, z14.h\n"
+ ".inst 0xc1267173 // bfdot za.s[x11, 3], { z11.h-z12.h }, z6.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12f7156 // bfdot za.s[x11, 6], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc1277157 // bfdot za.s[x11, 7], { z10.h-z11.h }, z7.h\n"
+ ".inst 0xc1297174 // bfdot za.s[x11, 4], { z11.h-z12.h }, z9.h\n"
+ ".inst 0xc1217175 // bfdot za.s[x11, 5], { z11.h-z12.h }, z1.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1217176 // bfdot za.s[x11, 6], { z11.h-z12.h }, z1.h\n"
+ ".inst 0xc1207177 // bfdot za.s[x11, 7], { z11.h-z12.h }, z0.h\n"
+ "17:" // Padded: 0 priming loads
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 20f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ "add x20, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa14 // bfcvtnt z20.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x658aaa15 // bfcvt z21.h, p2/M, z16.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x648aaa15 // bfcvtnt z21.h, p2/M, z16.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa16 // bfcvt z22.h, p2/M, z16.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa16 // bfcvtnt z22.h, p2/M, z16.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa17 // bfcvt z23.h, p2/M, z16.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z16.s }, p0/Z, [x20]\n"
+ "sub x25, x25, #0x1\n"
+ ".inst 0x648aaa17 // bfcvtnt z23.h, p2/M, z16.s\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 19f\n"
+ "18:" // Padded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402b02 // ld1h { z2.h-z3.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1237292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z3.h\n"
+ "ld1w { z16.s }, p0/Z, [x16]\n"
+ "add x22, x16, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1227293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z2.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ "ld1w { z19.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412b07 // ld1h { z7.h, z15.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z17.s }, p0/Z, [x22]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z27.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z10.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422b04 // ld1h { z4.h, z12.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc12e7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z14.h\n"
+ "ld1w { z8.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1267297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z15.h\n"
+ "ld1w { z11.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc12772b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z12.h\n"
+ "ld1w { z18.s }, p0/Z, [x22]\n"
+ ".inst 0xc12472d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422ae4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12f72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12e72b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z14.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc12472d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12172d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc12072d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12d1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1251291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z5.h\n"
+ ".inst 0x658aaa14 // bfcvt z20.h, p2/M, z16.s\n"
+ ".inst 0xa1402be6 // ld1h { z6.h, z14.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc12f12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z15.h\n"
+ ".inst 0x648aaa74 // bfcvtnt z20.h, p2/M, z19.s\n"
+ ".inst 0xc12712b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z7.h\n"
+ ".inst 0x658aaa35 // bfcvt z21.h, p2/M, z17.s\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc12112d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ ".inst 0xc12012d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0x658aa956 // bfcvt z22.h, p2/M, z10.s\n"
+ ".inst 0x658aa977 // bfcvt z23.h, p2/M, z11.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
+ ".inst 0xa1422be4 // ld1h { z4.h, z12.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ ".inst 0x648aa916 // bfcvtnt z22.h, p2/M, z8.s\n"
+ ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba0 // fclamp { z0.s-z3.s }, z29.s, z28.s\n"
+ "st1w { z0.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z2.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ ".inst 0x648aaa57 // bfcvtnt z23.h, p2/M, z18.s\n"
+ "st1w { z3.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "bgt 18b\n"
+ "19:" // Main loop tail
+ "addvl x23, SP, #6\n"
+ ".inst 0xc12e7290 // bfdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xc1267291 // bfdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc1217292 // bfdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ ".inst 0xc1207293 // bfdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc12d72b0 // bfdot za.s[x11, 0], { z21.h-z22.h }, z13.h\n"
+ ".inst 0xc12572b1 // bfdot za.s[x11, 1], { z21.h-z22.h }, z5.h\n"
+ ".inst 0xa1412ae7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc12e7294 // bfdot za.s[x11, 4], { z20.h-z21.h }, z14.h\n"
+ ".inst 0xc1267295 // bfdot za.s[x11, 5], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc12f72b2 // bfdot za.s[x11, 2], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b3 // bfdot za.s[x11, 3], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412ac7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc12c72d0 // bfdot za.s[x11, 0], { z22.h-z23.h }, z12.h\n"
+ ".inst 0xc12472d1 // bfdot za.s[x11, 1], { z22.h-z23.h }, z4.h\n"
+ ".inst 0xa1422ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc12d7296 // bfdot za.s[x11, 6], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1257297 // bfdot za.s[x11, 7], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc12f72b4 // bfdot za.s[x11, 4], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b5 // bfdot za.s[x11, 5], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412aa7 // ld1h { z7.h, z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc12e72d2 // bfdot za.s[x11, 2], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12672d3 // bfdot za.s[x11, 3], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1422ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc12f72b6 // bfdot za.s[x11, 6], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12772b7 // bfdot za.s[x11, 7], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xa1412a87 // ld1h { z7.h, z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc12e72d4 // bfdot za.s[x11, 4], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xc12672d5 // bfdot za.s[x11, 5], { z22.h-z23.h }, z6.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc12b72d6 // bfdot za.s[x11, 6], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc12372d7 // bfdot za.s[x11, 7], { z22.h-z23.h }, z3.h\n"
+ ".inst 0xa0422a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc12d1290 // bfdot za.s[x8, 0], { z20.h-z21.h }, z13.h\n"
+ ".inst 0xc1251291 // bfdot za.s[x8, 1], { z20.h-z21.h }, z5.h\n"
+ ".inst 0xc12f12b0 // bfdot za.s[x8, 0], { z21.h-z22.h }, z15.h\n"
+ ".inst 0xc12712b1 // bfdot za.s[x8, 1], { z21.h-z22.h }, z7.h\n"
+ ".inst 0xc12312d0 // bfdot za.s[x8, 0], { z22.h-z23.h }, z3.h\n"
+ ".inst 0xc12212d1 // bfdot za.s[x8, 1], { z22.h-z23.h }, z2.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccbb4 // fclamp { z20.s-z23.s }, z29.s, z28.s\n"
+ "st1w { z20.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z22.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "st1w { z23.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "20:" // Main loop skip tail
+ "cbz x15, 22f\n"
+ "21:" // Right padding loop
+ ".inst 0xc0066800 // mova { z0.d-z1.d }, za.d[x11, #0]\n"
+ "add x8, x8, #0x2\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc0066822 // mova { z2.d-z3.d }, za.d[x11, #1]\n"
+ ".inst 0xc1bccba0 // fclamp { z0.s-z3.s }, z29.s, z28.s\n"
+ "st1w { z0.s }, p1, [x14]\n"
+ "add x14, x14, x5, LSL #2\n"
+ "st1w { z2.s }, p1, [x13]\n"
+ "add x13, x13, x10, LSL #2\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "st1w { z1.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "st1w { z3.s }, p1, [x28]\n"
+ "add x28, x28, x26, LSL #2\n"
+ "bgt 21b\n"
+ "22:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #30\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..53e596418b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Entry point for the SME2 FP32 planar depthwise convolution kernel:
+// 5x5 filter, stride 2, producing 4 output rows per pass.  Inputs are
+// converted to BF16 on the fly (BFCVT) and accumulated with BFDOT into ZA,
+// with FCLAMP applied before the FP32 results are stored (see the inline-asm
+// definition in the accompanying generic.cpp).
+void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
+  const float *inptr,              // base pointer of the input tensor
+  size_t ld_in_row,                // element stride between input rows (scaled by 4 bytes in the kernel)
+  size_t ld_in_col,                // element stride between input columns
+  size_t ld_in_vl,                 // element stride between vector-length channel blocks -- assumed from usage, confirm against caller
+  unsigned int pad_top,            // rows of implicit zero padding above the input
+  unsigned int valid_input_rows,   // number of readable input rows
+  unsigned int pad_left,           // columns of implicit zero padding to the left
+  unsigned int valid_input_cols,   // number of readable input columns
+  const float *weights,            // packed depthwise weights (5x5 per channel, per the kernel name)
+  const float *bias,               // per-channel bias values -- NOTE(review): null handling not visible here, confirm
+  float **outptrs,                 // one output pointer per output row
+  const size_t *outlds,            // per-output-row column strides
+  const size_t *outvllds,          // per-output-row vector-length strides
+  unsigned int output_cols,        // number of output columns to compute
+  unsigned int start_channel,      // index of the first channel handled by this call
+  unsigned int valid_channels,     // total number of channels to process
+  float act_min,                   // lower activation clamp bound (FCLAMP)
+  float act_max                    // upper activation clamp bound (FCLAMP)
+);
+
+// Strategy descriptor exposing the 5x5, stride-2, 4-output-row SME2 planar
+// depthwise kernel to the framework.  Template arguments are the external
+// input/output element types -- both FP32; the BF16 arithmetic is internal
+// to the kernel implementation.
+class sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za : public PlanarStrategy<float, float>
+{
+  using Parent = PlanarStrategy<float, float>;
+
+  public:
+  using return_type = float;
+  // Compile-time shape parameters of this kernel variant; forwarded to the
+  // PlanarStrategy base in the constructor below.
+  constexpr static auto output_rows = 4u;
+  constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+  constexpr static auto stride_rows = 2u, stride_cols = 2u;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // kernel requires the SME vector length
+
+  // CPUInfo is accepted for interface uniformity with other strategies but is
+  // not needed by this fixed-configuration kernel.
+  sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za(const CPUInfo *)
+  : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  // Returns the function pointer of the generated SME2 kernel implementation.
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..3a56e69d26
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,1246 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za_impl(
+ const float *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const float *weights,
+ const float *bias,
+ float **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ float act_min,
+ float act_max
+)
+{
+ struct Args
+ {
+ const float *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const float *weights;
+ const float *bias;
+ long unsigned int input_cols, output_cols;
+ float **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ float clamp_min, clamp_max;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, bias, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels, act_min, act_max };
+
+ __asm__ __volatile__(
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "mov x20, #0xb\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "sub x20, x20, x3\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ptrue p2.b\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ld1rw { z13.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_min]]\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z12.s }, p2/Z, [%x[args], %[offsetof_Args_clamp_max]]\n"
+ "whilelt p8.s, XZR, x4\n"
+ "addvl SP, SP, #-15\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[args], %[offsetof_Args_bias]]\n"
+ "fmov z16.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z16.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x21, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x21\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ld1w { z8.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aabef // bfcvt z15.h, p2/M, z31.s\n"
+ "incb x21\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aaa4e // bfcvt z14.h, p2/M, z18.s\n"
+ "addvl x24, SP, #15\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aa90f // bfcvtnt z15.h, p2/M, z8.s\n"
+ "addvl x24, x24, #-3\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z15.h }, p2, [x24]\n"
+ ".inst 0x648aaa2e // bfcvtnt z14.h, p2/M, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aabb5 // bfcvt z21.h, p2/M, z29.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aaa58 // bfcvt z24.h, p2/M, z18.s\n"
+ "ld1w { z26.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aab41 // bfcvt z1.h, p2/M, z26.s\n"
+ ".inst 0x648aaa35 // bfcvtnt z21.h, p2/M, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "st1h { z24.h }, p2, [x24, #2, MUL VL]\n"
+ "addvl x24, x24, #-3\n"
+ "ld1w { z9.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z21.h }, p2, [x24]\n"
+ ".inst 0x648aaa21 // bfcvtnt z1.h, p2/M, z17.s\n"
+ "ld1w { z3.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "incb x21\n"
+ ".inst 0x658aa864 // bfcvt z4.h, p2/M, z3.s\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa92b // bfcvt z11.h, p2/M, z9.s\n"
+ "st1h { z1.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aaa46 // bfcvt z6.h, p2/M, z18.s\n"
+ "st1h { z11.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "addvl x24, x24, #-3\n"
+ ".inst 0x648aabe4 // bfcvtnt z4.h, p2/M, z31.s\n"
+ "ld1w { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x21\n"
+ "st1h { z4.h }, p2, [x24]\n"
+ ".inst 0x648aa8a6 // bfcvtnt z6.h, p2/M, z5.s\n"
+ "ld1w { z9.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aa938 // bfcvt z24.h, p2/M, z9.s\n"
+ "incb x21\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ "st1h { z6.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z31.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ ".inst 0x648aaa38 // bfcvtnt z24.h, p2/M, z17.s\n"
+ ".inst 0x658aabf9 // bfcvt z25.h, p2/M, z31.s\n"
+ "ld1w { z18.s }, p2/Z, [x20]\n"
+ "incb x20, ALL, MUL #5\n"
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "st1h { z21.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x20]\n"
+ "mov x21, x21\n"
+ "addvl x24, x24, #-3\n"
+ "st1h { z24.h }, p2, [x24]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x648aaa59 // bfcvtnt z25.h, p2/M, z18.s\n"
+ "st1h { z25.h }, p2, [x24, #1, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x658aaa29 // bfcvt z9.h, p2/M, z17.s\n"
+ ".inst 0x658aa976 // bfcvt z22.h, p2/M, z11.s\n"
+ "ld1w { z28.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ ".inst 0x658aab85 // bfcvt z5.h, p2/M, z28.s\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ld1w { z25.s }, p2/Z, [x21]\n"
+ "incb x21, ALL, MUL #5\n"
+ "sub x20, x7, #0x1\n"
+ "st1h { z22.h }, p2, [x24, #2, MUL VL]\n"
+ "ld1w { z11.s }, p2/Z, [x21]\n"
+ "orr x23, x20, %x[ld_in_col], LSL #18\n"
+ "addvl x24, x24, #-3\n"
+ "mov z17.d, z16.d\n"
+ "orr x23, x5, x23, LSL #20\n"
+ "mov x22, #0xb\n"
+ "mov z18.d, z16.d\n"
+ "mov z19.d, z16.d\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ ".inst 0x648aa909 // bfcvtnt z9.h, p2/M, z8.s\n"
+ "st1h { z9.h }, p2, [x24]\n"
+ ".inst 0x648aab25 // bfcvtnt z5.h, p2/M, z25.s\n"
+ "st1h { z5.h }, p2, [x24, #1, MUL VL]\n"
+ ".inst 0x658aa97b // bfcvt z27.h, p2/M, z11.s\n"
+ "mov x8, #0x0\n"
+ "st1h { z27.h }, p2, [x24, #2, MUL VL]\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "lsl x23, x23, #0x2\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
+ "3:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col], LSL #2\n"
+ "bgt 3b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x2\n"
+ "msub x17, x4, x20, x17\n"
+ ".inst 0xc0040e00 // mova za.d[x8, #0], { z16.d-z19.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040e01 // mova za.d[x8, #1], { z16.d-z19.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040e02 // mova za.d[x8, #2], { z16.d-z19.d }\n"
+ "ldp x13, x11, [x20], #0x10\n"
+ ".inst 0xc0040e03 // mova za.d[x8, #3], { z16.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 5f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 5f\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "and x22, x21, #0x1\n"
+ "add x21, x21, #0x1\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "lsr x21, x21, #0x1\n"
+ "sub x16, x16, x21\n"
+ "4:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1w { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ "st1w { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "bgt 4b\n"
+ "5:" // Left padding: End
+ "adds XZR, x4, x3\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 9f\n"
+ "cmp x22, #0x2\n"
+ "beq 8f\n"
+ "cmp x22, #0x3\n"
+ "beq 7f\n"
+ "6:" // Unpadded: 4 priming loads
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z0.s }, p1/Z, [x17]\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ "addvl x20, SP, #12\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa936 // bfcvtnt z22.h, p2/M, z9.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab97 // bfcvt z23.h, p2/M, z28.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa97 // bfcvtnt z23.h, p2/M, z20.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa98 // bfcvt z24.h, p2/M, z20.s\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aabb8 // bfcvtnt z24.h, p2/M, z29.s\n"
+ "ld1w { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabd9 // bfcvt z25.h, p2/M, z30.s\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa939 // bfcvtnt z25.h, p2/M, z9.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13312d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa93a // bfcvtnt z26.h, p2/M, z9.s\n"
+ ".inst 0xc13b12f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z11.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ ".inst 0x658aa93b // bfcvt z27.h, p2/M, z9.s\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1391310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z9.h\n"
+ "7:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab7d // bfcvt z29.h, p2/M, z27.s\n"
+ "addvl x20, SP, #9\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab5d // bfcvtnt z29.h, p2/M, z26.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa93e // bfcvt z30.h, p2/M, z9.s\n"
+ "ld1w { z20.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa9e // bfcvtnt z30.h, p2/M, z20.s\n"
+ "ld1w { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab3f // bfcvt z31.h, p2/M, z25.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab5f // bfcvtnt z31.h, p2/M, z26.s\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab60 // bfcvt z0.h, p2/M, z27.s\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa920 // bfcvtnt z0.h, p2/M, z9.s\n"
+ "ld1w { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaae1 // bfcvt z1.h, p2/M, z23.s\n"
+ ".inst 0xa0402a84 // ld1h { z4.h-z5.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13413b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z4.h\n"
+ "ld1w { z9.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa921 // bfcvtnt z1.h, p2/M, z9.s\n"
+ ".inst 0xc13513d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z5.h\n"
+ "ld1w { z29.s }, p1/Z, [x21]\n"
+ ".inst 0x658aaba2 // bfcvt z2.h, p2/M, z29.s\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc13913f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
+ "addvl x21, SP, #6\n"
+ "ld1w { z21.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaaba // bfcvtnt z26.h, p2/M, z21.s\n"
+ "addvl x20, SP, #12\n"
+ "ld1w { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab3b // bfcvt z27.h, p2/M, z25.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa89b // bfcvtnt z27.h, p2/M, z4.s\n"
+ "ld1w { z10.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa95c // bfcvt z28.h, p2/M, z10.s\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa89c // bfcvtnt z28.h, p2/M, z4.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8bd // bfcvt z29.h, p2/M, z5.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa8bd // bfcvtnt z29.h, p2/M, z5.s\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8be // bfcvt z30.h, p2/M, z5.s\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13e1350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z14.h\n"
+ "ld1w { z5.s }, p1/Z, [x22]\n"
+ ".inst 0x648aa8be // bfcvtnt z30.h, p2/M, z5.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13f1370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z15.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1381351 // bfdot za.s[x8, 1], { z26.h-z29.h }, z8.h\n"
+ "ld1w { z23.s }, p1/Z, [x22]\n"
+ ".inst 0x658aaaff // bfcvt z31.h, p2/M, z23.s\n"
+ ".inst 0xc1391371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1301390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
+ "addvl x21, SP, #3\n"
+ "ld1w { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab17 // bfcvtnt z23.h, p2/M, z24.s\n"
+ "addvl x20, SP, #9\n"
+ "ld1w { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabf8 // bfcvt z24.h, p2/M, z31.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z6.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa8d8 // bfcvtnt z24.h, p2/M, z6.s\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab99 // bfcvt z25.h, p2/M, z28.s\n"
+ "ld1w { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "ld1w { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab9a // bfcvt z26.h, p2/M, z28.s\n"
+ "ld1w { z4.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa89a // bfcvtnt z26.h, p2/M, z4.s\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aaa9b // bfcvt z27.h, p2/M, z20.s\n"
+ ".inst 0xa1402aa0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13012f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z20.s }, p1/Z, [x22]\n"
+ ".inst 0x648aaa9b // bfcvtnt z27.h, p2/M, z20.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc1381310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13212f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1w { z11.s }, p1/Z, [x22]\n"
+ ".inst 0x658aa97c // bfcvt z28.h, p2/M, z11.s\n"
+ ".inst 0xc1331311 // bfdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1341330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z4.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301331 // bfdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 20f\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x17]\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ "sub x7, x7, #0x2\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0x648aab55 // bfcvtnt z21.h, p2/M, z26.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab56 // bfcvt z22.h, p2/M, z26.s\n"
+ "lsr x20, x7, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ "cmp x20, x16\n"
+ ".inst 0x648aab56 // bfcvtnt z22.h, p2/M, z26.s\n"
+ "ld1w { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa917 // bfcvt z23.h, p2/M, z8.s\n"
+ "csel x26, x20, x16, LT\n"
+ "ld1w { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa857 // bfcvtnt z23.h, p2/M, z2.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8d8 // bfcvt z24.h, p2/M, z6.s\n"
+ "and x7, x7, #0x1\n"
+ "ld1w { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
+ "sub x16, x16, x26\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
+ "ld1w { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "ld1w { z27.s }, p1/Z, [x21]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
+ "cbz x26, 19f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ "ld1w { z14.s }, p1/Z, [x17]\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa1402b20 // ld1h { z0.h, z8.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row], LSL #2\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z27.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13812d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z8.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "addvl x21, SP, #9\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z2.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9d5 // bfcvt z21.h, p2/M, z14.s\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "ld1h { z11.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ "subs x26, x26, #0x1\n"
+ "ld1w { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13812d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z8.h\n"
+ ".inst 0x658aa856 // bfcvt z22.h, p2/M, z2.s\n"
+ "ld1w { z7.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b12f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z11.h\n"
+ ".inst 0x648aa9d6 // bfcvtnt z22.h, p2/M, z14.s\n"
+ "ld1w { z31.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0060c08 // mova { z8.d-z11.d }, za.d[x8, #0]\n"
+ ".inst 0xc1acc9a8 // fclamp { z8.s-z11.s }, z13.s, z12.s\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa8f7 // bfcvt z23.h, p2/M, z7.s\n"
+ "add x8, x8, #0x1\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab58 // bfcvt z24.h, p2/M, z26.s\n"
+ ".inst 0x648aabf7 // bfcvtnt z23.h, p2/M, z31.s\n"
+ "ld1w { z2.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa858 // bfcvtnt z24.h, p2/M, z2.s\n"
+ "st1w { z8.s }, p1, [x15]\n"
+ "ld1w { z0.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa819 // bfcvt z25.h, p2/M, z0.s\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc13212b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z2.h\n"
+ "st1w { z9.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "add x23, x23, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13312d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z3.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13112b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z1.h\n"
+ "st1w { z10.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "ld1w { z26.s }, p1/Z, [x23]\n"
+ ".inst 0x658aab5a // bfcvt z26.h, p2/M, z26.s\n"
+ ".inst 0xc13912d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z9.h\n"
+ "ld1w { z31.s }, p1/Z, [x17]\n"
+ ".inst 0x658aabf5 // bfcvt z21.h, p2/M, z31.s\n"
+ "st1w { z11.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "ld1w { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ ".inst 0x648aabd5 // bfcvtnt z21.h, p2/M, z30.s\n"
+ "ld1w { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa816 // bfcvt z22.h, p2/M, z0.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z1.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa836 // bfcvtnt z22.h, p2/M, z1.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1h { z2.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc13212f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1w { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z14.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1w { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13412f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z4.h\n"
+ ".inst 0x658aa977 // bfcvt z23.h, p2/M, z11.s\n"
+ "ld1w { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa9d8 // bfcvt z24.h, p2/M, z14.s\n"
+ ".inst 0x658aabb9 // bfcvt z25.h, p2/M, z29.s\n"
+ "ld1w { z5.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab97 // bfcvtnt z23.h, p2/M, z28.s\n"
+ ".inst 0x648aab78 // bfcvtnt z24.h, p2/M, z27.s\n"
+ "ld1w { z11.s }, p1/Z, [x20]\n"
+ ".inst 0x648aa8b9 // bfcvtnt z25.h, p2/M, z5.s\n"
+ ".inst 0x658aa97a // bfcvt z26.h, p2/M, z11.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "bgt 11b\n"
+ "b 19f\n"
+ "12:" // Padded
+ "cbz x22, 17f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 16f\n"
+ "cmp x22, #0x2\n"
+ "beq 15f\n"
+ "cmp x22, #0x3\n"
+ "beq 14f\n"
+ "13:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z1.s }, p0/Z, [x17]\n"
+ ".inst 0x658aa837 // bfcvt z23.h, p2/M, z1.s\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ ".inst 0x648aabb7 // bfcvtnt z23.h, p2/M, z29.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z30.s }, p0/Z, [x21]\n"
+ ".inst 0x658aabd8 // bfcvt z24.h, p2/M, z30.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z15.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa9f8 // bfcvtnt z24.h, p2/M, z15.s\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z20.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa99 // bfcvtnt z25.h, p2/M, z20.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z10.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa95a // bfcvt z26.h, p2/M, z10.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z8.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa91a // bfcvtnt z26.h, p2/M, z8.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab9b // bfcvt z27.h, p2/M, z28.s\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13112f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab9b // bfcvtnt z27.h, p2/M, z28.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
+ ".inst 0x658aa81c // bfcvt z28.h, p2/M, z0.s\n"
+ ".inst 0xc1391310 // bfdot za.s[x8, 0], { z24.h-z27.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc1301330 // bfdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "14:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z21.s }, p0/Z, [x17]\n"
+ ".inst 0x658aaab4 // bfcvt z20.h, p2/M, z21.s\n"
+ "add x21, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x648aab74 // bfcvtnt z20.h, p2/M, z27.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab75 // bfcvt z21.h, p2/M, z27.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z29.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aabb6 // bfcvt z22.h, p2/M, z29.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z8.s }, p0/Z, [x21]\n"
+ ".inst 0x648aa917 // bfcvtnt z23.h, p2/M, z8.s\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z28.s }, p0/Z, [x21]\n"
+ ".inst 0x658aab98 // bfcvt z24.h, p2/M, z28.s\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1311290 // bfdot za.s[x8, 0], { z20.h-z23.h }, z1.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z0.s }, p0/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa818 // bfcvtnt z24.h, p2/M, z0.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z1.s }, p0/Z, [x21]\n"
+ ".inst 0x658aa839 // bfcvt z25.h, p2/M, z1.s\n"
+ ".inst 0xc13912b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13012d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "15:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z6.s }, p0/Z, [x17]\n"
+ ".inst 0x658aa8da // bfcvt z26.h, p2/M, z6.s\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ ".inst 0x648aabba // bfcvtnt z26.h, p2/M, z29.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z28.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab9b // bfcvt z27.h, p2/M, z28.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa9db // bfcvtnt z27.h, p2/M, z14.s\n"
+ "mov x12, #0x4\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab1c // bfcvt z28.h, p2/M, z24.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa83c // bfcvtnt z28.h, p2/M, z1.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa87d // bfcvt z29.h, p2/M, z3.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa81d // bfcvtnt z29.h, p2/M, z0.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab1e // bfcvt z30.h, p2/M, z24.s\n"
+ "addvl x21, SP, #6\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1311350 // bfdot za.s[x8, 0], { z26.h-z29.h }, z1.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z23.s }, p0/Z, [x22]\n"
+ ".inst 0x648aaafe // bfcvtnt z30.h, p2/M, z23.s\n"
+ "addvl x20, SP, #12\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1391370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z31.s }, p0/Z, [x22]\n"
+ ".inst 0xc1301351 // bfdot za.s[x8, 1], { z26.h-z29.h }, z0.h\n"
+ ".inst 0x658aabff // bfcvt z31.h, p2/M, z31.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1311371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z1.h\n"
+ ".inst 0xc1301390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1301391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "16:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z22.s }, p0/Z, [x17]\n"
+ ".inst 0x658aaad5 // bfcvt z21.h, p2/M, z22.s\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa875 // bfcvtnt z21.h, p2/M, z3.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ ".inst 0x658aaa96 // bfcvt z22.h, p2/M, z20.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab36 // bfcvtnt z22.h, p2/M, z25.s\n"
+ "mov x12, #0x4\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab17 // bfcvt z23.h, p2/M, z24.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z0.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa817 // bfcvtnt z23.h, p2/M, z0.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z7.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa8f8 // bfcvt z24.h, p2/M, z7.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z28.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab98 // bfcvtnt z24.h, p2/M, z28.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z6.s }, p0/Z, [x22]\n"
+ ".inst 0x658aa8d9 // bfcvt z25.h, p2/M, z6.s\n"
+ "addvl x21, SP, #3\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc13112b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z1.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z6.s }, p0/Z, [x22]\n"
+ ".inst 0x648aa8d9 // bfcvtnt z25.h, p2/M, z6.s\n"
+ "addvl x20, SP, #9\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc13912d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z9.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "ld1w { z3.s }, p0/Z, [x22]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x658aa87a // bfcvt z26.h, p2/M, z3.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xc13012f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "17:" // Padded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 20f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x17]\n"
+ ".inst 0x658aab35 // bfcvt z21.h, p2/M, z25.s\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab75 // bfcvtnt z21.h, p2/M, z27.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab76 // bfcvt z22.h, p2/M, z27.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab76 // bfcvtnt z22.h, p2/M, z27.s\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab77 // bfcvt z23.h, p2/M, z27.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab37 // bfcvtnt z23.h, p2/M, z25.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab58 // bfcvt z24.h, p2/M, z26.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab78 // bfcvtnt z24.h, p2/M, z27.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab79 // bfcvt z25.h, p2/M, z27.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z26.s }, p0/Z, [x20]\n"
+ ".inst 0x648aab59 // bfcvtnt z25.h, p2/M, z26.s\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ ".inst 0x658aab7a // bfcvt z26.h, p2/M, z27.s\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "csel x24, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x24\n"
+ "cbz x24, 19f\n"
+ "18:" // Padded: Main loop
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ "addvl x23, SP, #6\n"
+ "addvl x21, SP, #12\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z9.s }, p0/Z, [x17]\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x22, SP, #3\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "mov x12, #0x4\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc13112d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z1.h\n"
+ ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x658aab62 // bfcvt z2.h, p2/M, z27.s\n"
+ "ld1w { z27.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x648aa9c1 // bfcvtnt z1.h, p2/M, z14.s\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa923 // bfcvt z3.h, p2/M, z9.s\n"
+ "addvl x21, SP, #9\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aa924 // bfcvt z4.h, p2/M, z9.s\n"
+ "mov x12, #0x8\n"
+ "ld1w { z24.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa9e2 // bfcvtnt z2.h, p2/M, z15.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aab63 // bfcvtnt z3.h, p2/M, z27.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x648aab04 // bfcvtnt z4.h, p2/M, z24.s\n"
+ ".inst 0x658aa925 // bfcvt z5.h, p2/M, z9.s\n"
+ ".inst 0xa1402ac0 // ld1h { z0.h, z8.h }, pn10.b/Z, [x22]\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x648aabc5 // bfcvtnt z5.h, p2/M, z30.s\n"
+ ".inst 0xc1301030 // bfdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1381050 // bfdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z0.s }, p0/Z, [x17]\n"
+ "add x20, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc1361031 // bfdot za.s[x8, 1], { z1.h-z4.h }, z6.h\n"
+ "ld1w { z10.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x658aaba6 // bfcvt z6.h, p2/M, z29.s\n"
+ "ld1w { z9.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13e1051 // bfdot za.s[x8, 1], { z2.h-z5.h }, z14.h\n"
+ "mov x12, #0x4\n"
+ "ld1w { z29.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa815 // bfcvt z21.h, p2/M, z0.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z31.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa936 // bfcvt z22.h, p2/M, z9.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1301070 // bfdot za.s[x8, 0], { z3.h-z6.h }, z0.h\n"
+ "subs x24, x24, #0x1\n"
+ "ld1w { z15.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1acc9b8 // fclamp { z24.s-z27.s }, z13.s, z12.s\n"
+ "ld1w { z30.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "st1w { z24.s }, p1, [x15]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z14.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ "st1w { z25.s }, p1, [x14]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1301071 // bfdot za.s[x8, 1], { z3.h-z6.h }, z0.h\n"
+ ".inst 0x658aabf7 // bfcvt z23.h, p2/M, z31.s\n"
+ "ld1w { z8.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x658aabd8 // bfcvt z24.h, p2/M, z30.s\n"
+ "ld1w { z4.s }, p0/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x658aa919 // bfcvt z25.h, p2/M, z8.s\n"
+ "ld1w { z5.s }, p0/Z, [x20]\n"
+ "add x15, x15, x13, LSL #2\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0x648aa955 // bfcvtnt z21.h, p2/M, z10.s\n"
+ ".inst 0x648aabb6 // bfcvtnt z22.h, p2/M, z29.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ ".inst 0x648aa9f7 // bfcvtnt z23.h, p2/M, z15.s\n"
+ ".inst 0x648aa9d8 // bfcvtnt z24.h, p2/M, z14.s\n"
+ ".inst 0x648aa899 // bfcvtnt z25.h, p2/M, z4.s\n"
+ ".inst 0x658aa8ba // bfcvt z26.h, p2/M, z5.s\n"
+ "bgt 18b\n"
+ "19:" // Main loop tail
+ ".inst 0xc13312b0 // bfdot za.s[x8, 0], { z21.h-z24.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc13b12d0 // bfdot za.s[x8, 0], { z22.h-z25.h }, z11.h\n"
+ ".inst 0xa0402b00 // ld1h { z0.h-z1.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13012b1 // bfdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z5.s }, p0/Z, [x17]\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xc13112d1 // bfdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #3\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xc13012b2 // bfdot za.s[x8, 2], { z21.h-z24.h }, z0.h\n"
+ "ld1w { z29.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z2.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc13712f0 // bfdot za.s[x8, 0], { z23.h-z26.h }, z7.h\n"
+ "mov x12, #0x4\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0xc13112d2 // bfdot za.s[x8, 2], { z22.h-z25.h }, z1.h\n"
+ ".inst 0x658aa8bb // bfcvt z27.h, p2/M, z5.s\n"
+ "ld1w { z20.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc13012f1 // bfdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x658aa85c // bfcvt z28.h, p2/M, z2.s\n"
+ "ld1w { z14.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0x648aabbb // bfcvtnt z27.h, p2/M, z29.s\n"
+ "ld1h { z0.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0xc13012f2 // bfdot za.s[x8, 2], { z23.h-z26.h }, z0.h\n"
+ ".inst 0x658aa83d // bfcvt z29.h, p2/M, z1.s\n"
+ "add x17, x17, %x[ld_in_col], LSL #2\n"
+ "ld1w { z1.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0x658aa83e // bfcvt z30.h, p2/M, z1.s\n"
+ "mov x12, #0x8\n"
+ "ld1w { z31.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aaa9c // bfcvtnt z28.h, p2/M, z20.s\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa9dd // bfcvtnt z29.h, p2/M, z14.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0x648aabfe // bfcvtnt z30.h, p2/M, z31.s\n"
+ ".inst 0x658aab5f // bfcvt z31.h, p2/M, z26.s\n"
+ ".inst 0xa1402aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x648aa93f // bfcvtnt z31.h, p2/M, z9.s\n"
+ ".inst 0xc1321370 // bfdot za.s[x8, 0], { z27.h-z30.h }, z2.h\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ ".inst 0xc13a1390 // bfdot za.s[x8, 0], { z28.h-z31.h }, z10.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x658aab40 // bfcvt z0.h, p2/M, z26.s\n"
+ ".inst 0xc1321371 // bfdot za.s[x8, 1], { z27.h-z30.h }, z2.h\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ ".inst 0xc13a1391 // bfdot za.s[x8, 1], { z28.h-z31.h }, z10.h\n"
+ "st1w { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc13913b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1w { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ ".inst 0xc13913b1 // bfdot za.s[x8, 1], { z29.h-z0.h }, z9.h\n"
+ "ld1h { z7.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "20:" // Main loop skip tail
+ "cbz x7, 21f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z25.s }, p0/Z, [x17]\n"
+ ".inst 0x658aab3d // bfcvt z29.h, p2/M, z25.s\n"
+ "add x22, x17, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab5d // bfcvtnt z29.h, p2/M, z26.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab3e // bfcvt z30.h, p2/M, z25.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab1e // bfcvtnt z30.h, p2/M, z24.s\n"
+ "mov x12, #0x4\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z26.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aab5f // bfcvt z31.h, p2/M, z26.s\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x648aa93f // bfcvtnt z31.h, p2/M, z9.s\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x658aa920 // bfcvt z0.h, p2/M, z9.s\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x8\n"
+ "ld1w { z24.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab00 // bfcvtnt z0.h, p2/M, z24.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1w { z9.s }, p0/Z, [x22]\n"
+ ".inst 0x658aa921 // bfcvt z1.h, p2/M, z9.s\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x648aab21 // bfcvtnt z1.h, p2/M, z25.s\n"
+ ".inst 0xc13313b0 // bfdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "addvl x21, SP, #6\n"
+ "add x22, x22, %x[ld_in_row], LSL #2\n"
+ ".inst 0xc13b13d0 // bfdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "addvl x20, SP, #12\n"
+ ".inst 0xc13e13b1 // bfdot za.s[x8, 1], { z29.h-z0.h }, z14.h\n"
+ "ld1w { z25.s }, p0/Z, [x22]\n"
+ ".inst 0x658aab22 // bfcvt z2.h, p2/M, z25.s\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0xc13f13d1 // bfdot za.s[x8, 1], { z30.h-z1.h }, z15.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc13e13b2 // bfdot za.s[x8, 2], { z29.h-z0.h }, z14.h\n"
+ ".inst 0xc13713f0 // bfdot za.s[x8, 0], { z31.h-z2.h }, z7.h\n"
+ "ld1h { z4.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc13f13d2 // bfdot za.s[x8, 2], { z30.h-z1.h }, z15.h\n"
+ ".inst 0xc13413f1 // bfdot za.s[x8, 1], { z31.h-z2.h }, z4.h\n"
+ "ld1h { z9.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "st1w { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xc13913f2 // bfdot za.s[x8, 2], { z31.h-z2.h }, z9.h\n"
+ "add x8, x8, #0x1\n"
+ "st1w { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "21:" // Tail input: End
+ "cbz x16, 23f\n"
+ "22:" // Right padding loop
+ ".inst 0xc0060c04 // mova { z4.d-z7.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0xc1acc9a4 // fclamp { z4.s-z7.s }, z13.s, z12.s\n"
+ "st1w { z4.s }, p1, [x15]\n"
+ "add x15, x15, x13, LSL #2\n"
+ ".inst 0xc0040e04 // mova za.d[x8, #4], { z16.d-z19.d }\n"
+ "st1w { z5.s }, p1, [x14]\n"
+ "add x14, x14, x11, LSL #2\n"
+ "st1w { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28, LSL #2\n"
+ "st1w { z7.s }, p1, [x9]\n"
+ "add x9, x9, x27, LSL #2\n"
+ "bgt 22b\n"
+ "23:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incb x20, ALL, MUL #16\n"
+ "incb x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21, LSL #2\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21, LSL #2\n"
+ "add x22, x22, x20, LSL #2\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #15\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_bias] "I" (offsetof(Args, bias)), [offsetof_Args_clamp_max] "I" (offsetof(Args, clamp_max)), [offsetof_Args_clamp_min] "I" (offsetof(Args, clamp_min)), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..de3eadac8a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_s8q_planar_3x3_s1_4rows_dot_za : public PlanarStrategy<int8_t, int8_t>
+{
+ using Parent = PlanarStrategy<int8_t, int8_t>;
+
+ public:
+ using return_type = int8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_s8q_planar_3x3_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_s8q_planar_3x3_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..845f376926
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,664 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_3x3_s1_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const int8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ int8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-12\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z21.h, p2/M, z21.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z30.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
+ "incw x22\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "mov x20, x22\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
+ "addvl x21, SP, #12\n"
+ "incw x22\n"
+ "addvl x21, x21, #-4\n"
+ "mov x20, x22\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
+ "addvl x21, x21, #-4\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #-4\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1sb { z20.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #8\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1sb { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x14, %x[ld_in_row]\n"
+ "ld1sb { z25.s }, p1/Z, [x14]\n"
+ "addvl x21, SP, #4\n"
+ "ld1sb { z6.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ "ld1sb { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1sb { z2.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
+ "10:" // Unpadded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1sb { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z17.h, z9.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x15, x13\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1sb { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z17.h, z7.h\n"
+ "csel x23, x15, x13, LT\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z7.h, z7.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x22, SP, #4\n"
+ "addvl x21, SP, #8\n"
+ "ld1sb { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1sb { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1sb { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
+ "ld1sb { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ "ld1sb { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add z8.h, z8.h, z21.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #8\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
+ "15:" // Padded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "csel x23, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "16:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ "mov x12, #0x4\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
+ "subs x23, x23, #0x1\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1sb { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x21, SP, #4\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "18:" // Main loop skip tail
+ "cbz x13, 20f\n"
+ "19:" // Right padding loop
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 19b\n"
+ "20:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #12\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..56fb127aa0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_s8q_planar_3x3_s2_4rows_dot_za : public PlanarStrategy<int8_t, int8_t>
+{
+ using Parent = PlanarStrategy<int8_t, int8_t>;
+
+ public:
+ using return_type = int8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_s8q_planar_3x3_s2_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_s8q_planar_3x3_s2_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..1d0efc6bc1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_3x3_s2_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const int8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ int8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x9\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-6\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z11.h, p2/M, z11.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "incw x22\n"
+ "mov z24.h, #0x0\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
+ "mov x20, x22\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z2.h, z2.h, z16.h\n"
+ "addvl x21, SP, #6\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z25.h, z25.h, z16.h\n"
+ "incw x22\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "mov x20, x22\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z20.h, z20.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #-2\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x9\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1sb { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #4\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1sb { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #2\n"
+ "ld1sb { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z5.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1sb { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x2\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z8.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x15, #0x1\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z25.h\n"
+ "cmp x20, x13\n"
+ "ld1sb { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x23, x20, x13, LT\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1sb { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1sb { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1sb { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1sb { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1sb { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1sb { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1sb { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1sb { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1sb { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1sb { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z24.h\n"
+ "add x27, x27, x25\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "ld1sb { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1sb { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "addvl x20, SP, #4\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "addvl x20, SP, #2\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ "15:" // Padded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "sub x15, x15, #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
+ "mov z25.d, z3.d\n"
+ "csel x22, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 17f\n"
+ "16:" // Padded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z15.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x8\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #2\n"
+ "ld1sb { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "mov z25.d, z2.d\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1sb { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
+ "add x8, x8, #0x1\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Main loop skip tail
+ "cbz x15, 19f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z5.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "19:" // Tail input: End
+ "cbz x13, 21f\n"
+ "20:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 20b\n"
+ "21:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #6\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..40fa718266
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Entry point for the generated SME2 assembly kernel: planar depthwise
+// convolution, signed 8-bit quantized (s8q), 5x5 kernel, stride 1,
+// producing 4 output rows per invocation via ZA dot-product instructions
+// (kernel geometry per the strategy constants declared below).
+// The definition lives in the matching generic.cpp and is guarded by
+// ARM_COMPUTE_ENABLE_SME2.
+void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
+  const int8_t *inptr,  // base pointer of the input tensor window
+  size_t ld_in_row,  // stride between input rows (units not shown here -- presumably elements; confirm against caller)
+  size_t ld_in_col,  // stride between input columns
+  size_t ld_in_vl,  // stride between vector-length channel blocks of input
+  unsigned int pad_top,  // rows of implicit zero padding above the input
+  unsigned int valid_input_rows,  // number of readable input rows (rest is padding)
+  unsigned int pad_left,  // columns of implicit padding to the left
+  unsigned int valid_input_cols,  // number of readable input columns
+  const int8_t *weights,  // packed/interleaved kernel weights
+  int8_t **outptrs,  // one output row pointer per output row produced
+  const size_t *outlds,  // per-output-row column strides
+  const size_t *outvllds,  // per-output-row vector-length (channel-block) strides
+  unsigned int output_cols,  // number of output columns to compute
+  unsigned int start_channel,  // index of the first channel handled by this call
+  unsigned int valid_channels,  // number of channels to process
+  const arm_gemm::Requantize32 &qp  // requantization parameters (offsets, muls, shifts, clamp bounds)
+);
+
+// Strategy class exposing the SME2 s8q planar 5x5/stride-1/4-row kernel to
+// the depthwise-convolution framework through the PlanarStrategy interface.
+// All geometry is fixed at compile time; instances only forward it to the
+// Parent constructor and hand out the assembly entry point.
+class sme2_s8q_planar_5x5_s1_4rows_dot_za : public PlanarStrategy<int8_t, int8_t>
+{
+  using Parent = PlanarStrategy<int8_t, int8_t>;
+
+  public:
+  using return_type = int8_t;
+  constexpr static auto output_rows = 4u;  // output rows produced per kernel call
+  constexpr static auto kernel_rows = 5u, kernel_cols = 5u;  // 5x5 filter
+  constexpr static auto stride_rows = 1u, stride_cols = 1u;  // unit stride
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // scalable SME vector length
+
+  // CPUInfo is accepted for interface uniformity but unused: nothing here
+  // depends on runtime CPU properties.
+  sme2_s8q_planar_5x5_s1_4rows_dot_za(const CPUInfo *)
+    : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  // Returns the generated assembly kernel implementing this strategy.
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_s8q_planar_5x5_s1_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..bb68733a45
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,1204 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_5x5_s1_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const int8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ int8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "addvl SP, SP, #-30\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z18.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x23\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "incw x23\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
+ "trn1 z11.h, z15.h, z2.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "incw x23\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
+ "incw x23\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "incw x23\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x7, x23, LSL #22\n"
+ "mov x22, #0x8\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x11, #0x0\n"
+ "mov x8, #0x8\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z23.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x6, x5\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1sb { z1.s }, p1/Z, [x16]\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1sb { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1sb { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1sb { z2.s }, p1/Z, [x16]\n"
+ "addvl x21, SP, #18\n"
+ "ld1sb { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1sb { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1sb { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1sb { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1sb { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row]\n"
+ "ld1sb { z2.s }, p1/Z, [x16]\n"
+ "addvl x22, SP, #12\n"
+ "ld1sb { z22.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1sb { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1sb { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1sb { z15.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1sb { z21.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1sb { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row]\n"
+ "ld1sb { z0.s }, p1/Z, [x16]\n"
+ "addvl x23, SP, #6\n"
+ "ld1sb { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1sb { z6.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ "ld1sb { z30.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1sb { z1.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1sb { z25.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1sb { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
+ "12:" // Unpadded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1sb { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1sb { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z26.h, z28.h\n"
+ "sub x15, x15, #0x1\n"
+ "ld1sb { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1sb { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z26.h, z26.h, z17.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1sb { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1sb { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "sub x15, x15, x25\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "cbz x25, 21f\n"
+ "13:" // Unpadded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #12\n"
+ "ld1sb { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1sb { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1sb { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1sb { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #24\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "addvl x23, SP, #6\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
+ "19:" // Padded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x24, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ "ld1sb { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1sb { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1sb { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1sb { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1sb { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "22:" // Main loop skip tail
+ "cbz x15, 24f\n"
+ "23:" // Right padding loop
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "add x8, x8, #0x2\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 23b\n"
+ "24:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #30\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..8bffc05e1f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(  // Planar depthwise conv: s8 quantized, 5x5 filter, stride 2, 4 output rows per pass, SME2 ZA dot-product kernel.
+  const int8_t *inptr,  // base pointer of the input tensor
+  size_t ld_in_row,  // input stride between successive rows, in elements
+  size_t ld_in_col,  // input stride between successive columns, in elements
+  size_t ld_in_vl,  // input stride between vector-length channel blocks
+  unsigned int pad_top,  // implicit zero-padding rows above the input
+  unsigned int valid_input_rows,  // number of input rows that may be read
+  unsigned int pad_left,  // implicit zero-padding columns left of the input
+  unsigned int valid_input_cols,  // number of input columns that may be read
+  const int8_t *weights,  // packed filter weights (layout produced by the matching interleave routine)
+  int8_t **outptrs,  // one output pointer per output row
+  const size_t *outlds,  // per-row output column strides -- presumably matches ld_out_cols in Args; verify against caller
+  const size_t *outvllds,  // per-row output vector-length strides -- presumably matches ld_out_vls in Args; verify against caller
+  unsigned int output_cols,  // number of output columns to compute
+  unsigned int start_channel,  // first channel handled by this invocation
+  unsigned int valid_channels,  // total number of valid channels
+  const arm_gemm::Requantize32 &qp  // requantization parameters (offsets, multipliers, shifts, clamp bounds)
+);
+
+class sme2_s8q_planar_5x5_s2_4rows_dot_za : public PlanarStrategy<int8_t, int8_t>  // Strategy wrapper exposing the SME2 s8q 5x5/s2 planar kernel to the depthwise framework.
+{
+  using Parent = PlanarStrategy<int8_t, int8_t>;  // shorthand for the base strategy type
+
+  public:
+  using return_type = int8_t;  // output element type: signed 8-bit quantized
+  constexpr static auto output_rows = 4u;  // output rows produced per kernel invocation
+  constexpr static auto kernel_rows = 5u, kernel_cols = 5u;  // 5x5 depthwise filter
+  constexpr static auto stride_rows = 2u, stride_cols = 2u;  // stride-2 in both dimensions
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // kernel requires SME vector-length support
+
+  sme2_s8q_planar_5x5_s2_4rows_dot_za(const CPUInfo *)  // CPUInfo is unused: the shape constants above fully configure the base
+    : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  typename Parent::KernelType get_kernel(void) const override  // hand the framework the out-of-line SME2 assembly implementation
+  {
+    return sme2_s8q_planar_5x5_s2_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..3da0d14d74
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_s8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,1354 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_s8q_planar_5x5_s2_4rows_dot_za_impl(
+ const int8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ int8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const int8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ int8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0xb\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x3\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x4\n"
+ "addvl SP, SP, #-15\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z7.h, p2/M, z7.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z12.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "incw x22\n"
+ "mov z26.h, #0x0\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "addvl x21, SP, #15\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "sub z27.h, z27.h, z28.h\n"
+ "incw x22\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z14.h, z14.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
+ "incw x22\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z29.h, z11.h, z26.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x22\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "addvl x21, x21, #-3\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #-3\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x7, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x5, x23, LSL #22\n"
+ "mov x22, #0xb\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x17, x4, x20, x17\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x13, x11, [x20], #0x10\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ "sub x16, x16, x21\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x4, x3\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1sb { z27.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #12\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1sb { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1sb { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1sb { z29.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #9\n"
+ "ld1sb { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1sb { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1sb { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1sb { z16.s }, p1/Z, [x21]\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1sb { z26.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #6\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1sb { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1sb { z29.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #3\n"
+ "ld1sb { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ "ld1sb { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1sb { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1sb { z0.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1sb { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1sb { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1sb { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
+ "12:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
+ "sub x7, x7, #0x2\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z25.h\n"
+ "sub x16, x16, #0x1\n"
+ "ld1sb { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x7, #0x1\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z30.h\n"
+ "cmp x20, x16\n"
+ "ld1sb { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x26, x20, x16, LT\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "and x7, x7, #0x1\n"
+ "ld1sb { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ "sub x16, x16, x26\n"
+ "cbz x26, 21f\n"
+ "13:" // Unpadded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1sb { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z24.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1sb { z18.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z25.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1sb { z8.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1sb { z28.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1sb { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1sb { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1sb { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1sb { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ "add x9, x9, x27\n"
+ "ld1sb { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1sb { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, z25.h, z7.h\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1sb { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1sb { z16.s }, p1/Z, [x20]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1sb { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1sb { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #6\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #12\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #3\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #9\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "19:" // Padded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z18.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
+ "csel x25, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #9\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z30.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "trn1 z27.h, z27.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1sb { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1sb { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "mov x12, #0x4\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
+ "add x22, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1sb { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1sb { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1sb { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1sb { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z17.s }, p0/Z, [x22]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1sb { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "22:" // Main loop skip tail
+ "cbz x7, 23f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1sb { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1sb { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1sb { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1sb { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1sb { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
+ "add x8, x8, #0x1\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "23:" // Tail input: End
+ "cbz x16, 25f\n"
+ "24:" // Right padding loop
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 24b\n"
+ "25:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #15\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..2e40c75d6b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_u8q_planar_3x3_s1_4rows_dot_za : public PlanarStrategy<uint8_t, uint8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, uint8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8q_planar_3x3_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8q_planar_3x3_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..60c3a1e632
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,664 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_3x3_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const uint8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-12\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z21.h, p2/M, z21.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z30.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1b { z10.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
+ "incw x22\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "mov x20, x22\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
+ "addvl x21, SP, #12\n"
+ "incw x22\n"
+ "addvl x21, x21, #-4\n"
+ "mov x20, x22\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1b { z4.s }, p2/Z, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
+ "addvl x21, x21, #-4\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #-4\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x14, %x[ld_in_row]\n"
+ "ld1b { z25.s }, p1/Z, [x14]\n"
+ "addvl x21, SP, #4\n"
+ "ld1b { z6.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
+ "10:" // Unpadded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z17.h, z9.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x15, x13\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1b { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z17.h, z7.h\n"
+ "csel x23, x15, x13, LT\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z7.h, z7.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x22, SP, #4\n"
+ "addvl x21, SP, #8\n"
+ "ld1b { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add z8.h, z8.h, z21.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #8\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
+ "15:" // Padded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "csel x23, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "16:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ "mov x12, #0x4\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
+ "subs x23, x23, #0x1\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1b { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x21, SP, #4\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "18:" // Main loop skip tail
+ "cbz x13, 20f\n"
+ "19:" // Right padding loop
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 19b\n"
+ "20:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #12\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..f852e12de1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl( // forward declaration; kernel body lives in the sibling generic.cpp
+  const uint8_t *inptr, // base pointer to the (quantized u8) input tensor patch
+  size_t ld_in_row, // stride between successive input rows
+  size_t ld_in_col, // stride between successive input columns
+  size_t ld_in_vl, // stride between channel blocks — presumably one SME vector length apart; confirm against caller
+  unsigned int pad_top, // rows of implicit padding above the valid input
+  unsigned int valid_input_rows, // number of input rows that may actually be read
+  unsigned int pad_left, // columns of implicit padding left of the valid input
+  unsigned int valid_input_cols, // number of input columns that may actually be read
+  const uint8_t *weights, // packed 3x3 depthwise filter weights (quantized u8)
+  uint8_t **outptrs, // one output pointer per output row produced
+  const size_t *outlds, // per-output-row column strides
+  const size_t *outvllds, // per-output-row channel-block (vector-length) strides
+  unsigned int output_cols, // number of output columns to compute
+  unsigned int start_channel, // first channel handled by this call
+  unsigned int valid_channels, // total number of channels to process
+  const arm_gemm::Requantize32 &qp // requantization parameters (offsets, per-layer/per-channel muls and shifts, clamp bounds)
+);
+
+class sme2_u8q_planar_3x3_s2_4rows_dot_za : public PlanarStrategy<uint8_t, uint8_t> // strategy wrapper binding the SME2 u8q planar depthwise kernel into the PlanarStrategy interface
+{
+  using Parent = PlanarStrategy<uint8_t, uint8_t>; // u8 in, u8 out (quantized)
+
+  public:
+  using return_type = uint8_t; // outputs are written back as quantized uint8_t
+  constexpr static auto output_rows = 4u; // output rows produced per kernel invocation
+  constexpr static auto kernel_rows = 3u, kernel_cols = 3u; // 3x3 depthwise filter
+  constexpr static auto stride_rows = 2u, stride_cols = 2u; // stride-2 in both dimensions
+  constexpr static auto vl_type = arm_gemm::VLType::SME; // kernel is vector-length agnostic via SME
+
+  sme2_u8q_planar_3x3_s2_4rows_dot_za(const CPUInfo *) // CPUInfo parameter intentionally unnamed/unused here; capability checks presumably happen at strategy selection — confirm
+  : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  typename Parent::KernelType get_kernel(void) const override // hands the raw assembly entry point to the planar driver
+  {
+    return sme2_u8q_planar_3x3_s2_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..e4ce6c74fb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_3x3_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const uint8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x9\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-6\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z11.h, p2/M, z11.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "incw x22\n"
+ "mov z24.h, #0x0\n"
+ "ld1b { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1b { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
+ "mov x20, x22\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z2.h, z2.h, z16.h\n"
+ "addvl x21, SP, #6\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z25.h, z25.h, z16.h\n"
+ "incw x22\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "mov x20, x22\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1b { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z20.h, z20.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #-2\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x9\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #4\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x2\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z8.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x15, #0x1\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z25.h\n"
+ "cmp x20, x13\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x23, x20, x13, LT\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z24.h\n"
+ "add x27, x27, x25\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "addvl x20, SP, #4\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "addvl x20, SP, #2\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ "15:" // Padded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "sub x15, x15, #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
+ "mov z25.d, z3.d\n"
+ "csel x22, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 17f\n"
+ "16:" // Padded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x8\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "mov z25.d, z2.d\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
+ "add x8, x8, #0x1\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Main loop skip tail
+ "cbz x15, 19f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "19:" // Tail input: End
+ "cbz x13, 21f\n"
+ "20:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 20b\n"
+ "21:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #6\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..d8b87dcd55
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_u8q_planar_5x5_s1_4rows_dot_za : public PlanarStrategy<uint8_t, uint8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, uint8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8q_planar_5x5_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8q_planar_5x5_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..d33ef764ef
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,1204 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_5x5_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const uint8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "addvl SP, SP, #-30\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z18.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x23\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "incw x23\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
+ "trn1 z11.h, z15.h, z2.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1b { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1b { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1b { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "incw x23\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
+ "incw x23\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1b { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1b { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1b { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1b { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "incw x23\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1b { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1b { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1b { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1b { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x7, x23, LSL #22\n"
+ "mov x22, #0x8\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x11, #0x0\n"
+ "mov x8, #0x8\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z23.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x6, x5\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x16]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1b { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z22.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row]\n"
+ "ld1b { z0.s }, p1/Z, [x16]\n"
+ "addvl x23, SP, #6\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z6.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z30.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z25.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
+ "12:" // Unpadded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1b { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z26.h, z28.h\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z26.h, z26.h, z17.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "sub x15, x15, x25\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "cbz x25, 21f\n"
+ "13:" // Unpadded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #12\n"
+ "ld1b { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #24\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x23, SP, #6\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
+ "19:" // Padded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x24, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1b { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "22:" // Main loop skip tail
+ "cbz x15, 24f\n"
+ "23:" // Right padding loop
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "add x8, x8, #0x2\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 23b\n"
+ "24:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #30\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..05aad19c09
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(  // SME2 ZA-tile u8-quantized planar depthwise kernel, 5x5 window, stride 2, 4 output rows; implemented in hand-written asm in the sibling generic.cpp — do not hand-edit
+ const uint8_t *inptr,  // base pointer to the (quantized u8) input tensor
+ size_t ld_in_row,  // stride between successive input rows — presumably in bytes (u8 elements); TODO confirm against caller
+ size_t ld_in_col,  // stride between successive input columns — presumably in bytes; TODO confirm
+ size_t ld_in_vl,  // stride to advance the input pointer by one vector-length's worth of channels
+ unsigned int pad_top,  // rows of implicit zero padding above the valid input
+ unsigned int valid_input_rows,  // number of non-padded input rows available
+ unsigned int pad_left,  // columns of implicit zero padding left of the valid input
+ unsigned int valid_input_cols,  // number of non-padded input columns available
+ const uint8_t *weights,  // packed quantized filter weights
+ uint8_t **outptrs,  // array of output row pointers (one per output row)
+ const size_t *outlds,  // per-output-row column strides
+ const size_t *outvllds,  // per-output-row vector-length strides
+ unsigned int output_cols,  // number of output columns to produce
+ unsigned int start_channel,  // first channel index handled by this call (for per-channel quant params)
+ unsigned int valid_channels,  // number of channels to process
+ const arm_gemm::Requantize32 &qp  // requantization parameters (offsets, multipliers, shifts, clamp bounds)
+);
+
+class sme2_u8q_planar_5x5_s2_4rows_dot_za : public PlanarStrategy<uint8_t, uint8_t>  // strategy wrapper registering the SME2 u8-quantized 5x5/stride-2 planar kernel with the depthwise framework
+{
+ using Parent = PlanarStrategy<uint8_t, uint8_t>;  // u8 input / u8 output planar strategy base
+
+ public:
+ using return_type = uint8_t;  // requantized output element type
+ constexpr static auto output_rows = 4u;  // output rows produced per kernel invocation
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;  // 5x5 filter window
+ constexpr static auto stride_rows = 2u, stride_cols = 2u;  // stride-2 convolution in both dimensions
+ constexpr static auto vl_type = arm_gemm::VLType::SME;  // requires an SME-capable (scalable vector-length) CPU
+
+ sme2_u8q_planar_5x5_s2_4rows_dot_za(const CPUInfo *)  // CPUInfo unused: geometry is fixed by the constants above
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override  // expose the asm entry point to the planar driver
+ {
+ return sme2_u8q_planar_5x5_s2_4rows_dot_za_impl;  // defined in sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..6c144afa77
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,1354 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8q_planar_5x5_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const uint8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const uint8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0xb\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x3\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x4\n"
+ "addvl SP, SP, #-15\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z7.h, p2/M, z7.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z12.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "incw x22\n"
+ "mov z26.h, #0x0\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "addvl x21, SP, #15\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1b { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "sub z27.h, z27.h, z28.h\n"
+ "incw x22\n"
+ "ld1b { z14.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z14.h, z14.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "ld1b { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1b { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1b { z30.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1b { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "ld1b { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
+ "incw x22\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1b { z11.s }, p2/Z, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1b { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1b { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1b { z15.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z29.h, z11.h, z26.h\n"
+ "ld1b { z16.s }, p2/Z, [x20]\n"
+ "incw x22\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1b { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1b { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1b { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1b { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1b { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "addvl x21, x21, #-3\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #-3\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x7, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x5, x23, LSL #22\n"
+ "mov x22, #0xb\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x17, x4, x20, x17\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x13, x11, [x20], #0x10\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ "sub x16, x16, x21\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x4, x3\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z27.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #9\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #6\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #3\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
+ "12:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ "sub x7, x7, #0x2\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z25.h\n"
+ "sub x16, x16, #0x1\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x7, #0x1\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z30.h\n"
+ "cmp x20, x16\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x26, x20, x16, LT\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "and x7, x7, #0x1\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ "sub x16, x16, x26\n"
+ "cbz x26, 21f\n"
+ "13:" // Unpadded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1b { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z24.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1b { z18.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z25.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1b { z8.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1b { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ "add x9, x9, x27\n"
+ "ld1b { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, z25.h, z7.h\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #6\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #12\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #3\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #9\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "19:" // Padded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
+ "csel x25, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #9\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "trn1 z27.h, z27.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "mov x12, #0x4\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
+ "add x22, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1b { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1b { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "22:" // Main loop skip tail
+ "cbz x7, 23f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
+ "add x8, x8, #0x1\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "23:" // Tail input: End
+ "cbz x16, 25f\n"
+ "24:" // Right padding loop
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 24b\n"
+ "25:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #15\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..a4345097b5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Planar depthwise convolution kernel: 3x3 window, stride 1, producing four
+// output rows per pass, for u8 input with s8 weights and requantized u8
+// output. Implemented as SME2 inline assembly in the matching generic.cpp.
+void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
+  const uint8_t *inptr,            // base pointer of the input tensor slice
+  size_t ld_in_row,                // byte stride between consecutive input rows
+  size_t ld_in_col,                // byte stride between consecutive input columns
+  size_t ld_in_vl,                 // byte stride added to inptr per channel-block iteration
+  unsigned int pad_top,            // rows of implicit zero padding above the input
+  unsigned int valid_input_rows,   // input rows actually readable (used to derive bottom padding)
+  unsigned int pad_left,           // columns of implicit zero padding left of the input
+  unsigned int valid_input_cols,   // input columns actually readable
+  const int8_t *weights,           // packed signed 8-bit filter weights
+  uint8_t **outptrs,               // array of per-output-row destination pointers
+  const size_t *outlds,            // per-output-row column strides
+  const size_t *outvllds,          // per-output-row strides applied per channel-block iteration
+  unsigned int output_cols,        // number of output columns to produce
+  unsigned int start_channel,      // index of the first channel handled by this call
+  unsigned int valid_channels,     // number of channels to process
+  const arm_gemm::Requantize32 &qp // requantization parameters (offsets, multipliers, shifts, clamps)
+);
+
+// Strategy class exposing the assembly kernel above to the planar depthwise
+// framework: describes the kernel's compile-time shape (3x3 window, stride 1,
+// 4 output rows per pass) and hands back the implementation function.
+// Template arguments: uint8_t input/output element type, int8_t weight type.
+class sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za : public PlanarStrategy<uint8_t, int8_t>
+{
+  using Parent = PlanarStrategy<uint8_t, int8_t>;
+
+  public:
+  using return_type = uint8_t;
+  constexpr static auto output_rows = 4u;  // output rows computed per kernel invocation
+  constexpr static auto kernel_rows = 3u, kernel_cols = 3u;  // filter window size
+  constexpr static auto stride_rows = 1u, stride_cols = 1u;  // convolution stride
+  constexpr static auto vl_type = arm_gemm::VLType::SME;  // vector-length-agnostic SME kernel
+
+  // Forward the compile-time shape parameters to the generic planar strategy;
+  // the CPUInfo argument is unused here (kernel selection happens elsewhere).
+  sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za(const CPUInfo *)
+  : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  // Return the SME2 assembly routine that implements this strategy.
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..612beb342a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,664 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_3x3_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 6u - std::min(6u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x6\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z21.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-12\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z21.h, p2/M, z21.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z30.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z30.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z10.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z31.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z7.h, #0x0\n"
+ "sub z10.h, z10.h, z31.h\n"
+ "incw x22\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "trn1 z20.h, z7.h, z10.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "mov x20, x22\n"
+ "trn1 z19.h, z10.h, z16.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "trn1 z26.h, z16.h, z11.h\n"
+ "trn1 z13.h, z11.h, z7.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z24.h, z24.h, z31.h\n"
+ "sub z11.h, z11.h, z31.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "sub z2.h, z2.h, z31.h\n"
+ "addvl x21, SP, #12\n"
+ "incw x22\n"
+ "addvl x21, x21, #-4\n"
+ "mov x20, x22\n"
+ "st1h { z20.h }, p2, [x21]\n"
+ "trn1 z22.h, z7.h, z24.h\n"
+ "st1h { z19.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z1.h, z24.h, z11.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z26.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z3.h, z11.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z13.h }, p2, [x21, #3, MUL VL]\n"
+ "trn1 z25.h, z2.h, z7.h\n"
+ "ld1sb { z4.s }, p2/Z, [x20]\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z16.h, z16.h, z31.h\n"
+ "sub z0.h, z0.h, z31.h\n"
+ "addvl x21, x21, #-4\n"
+ "st1h { z22.h }, p2, [x21]\n"
+ "sub z4.h, z4.h, z31.h\n"
+ "st1h { z1.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z31.d, z30.d\n"
+ "st1h { z3.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z24.h, z7.h, z16.h\n"
+ "trn1 z18.h, z16.h, z0.h\n"
+ "st1h { z25.h }, p2, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #-4\n"
+ "trn1 z0.h, z0.h, z4.h\n"
+ "trn1 z1.h, z4.h, z7.h\n"
+ "st1h { z24.h }, p2, [x21]\n"
+ "st1h { z18.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z1.h }, p2, [x21, #3, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z14.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x6\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040bc0 // mova za.d[x8, #0], { z30.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040bc1 // mova za.d[x8, #1], { z30.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040bc2 // mova za.d[x8, #2], { z30.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ ".inst 0xc0040bc3 // mova za.d[x8, #3], { z30.d-z31.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z20.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z20.h, z16.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z5.h, z23.h, z22.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "trn1 z6.h, z17.h, z16.h\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b1488 // sdot za.s[x8, 0], { z4.h-z5.h }, z11.h\n"
+ ".inst 0xc1631489 // sdot za.s[x8, 1], { z4.h-z5.h }, z3.h\n"
+ ".inst 0xa1412a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16814a8 // sdot za.s[x8, 0], { z5.h-z6.h }, z8.h\n"
+ ".inst 0xc16014a9 // sdot za.s[x8, 1], { z5.h-z6.h }, z0.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x22, x14, %x[ld_in_row]\n"
+ "ld1b { z25.s }, p1/Z, [x14]\n"
+ "addvl x21, SP, #4\n"
+ "ld1b { z6.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z3.h, z25.h, z6.h\n"
+ "add z3.h, z3.h, z21.h\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #8\n"
+ "ld1b { z26.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z4.h, z18.h, z26.h\n"
+ "add z4.h, z4.h, z21.h\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z5.s }, p1/Z, [x22]\n"
+ "trn1 z5.h, z2.h, z5.h\n"
+ "add z5.h, z5.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1611468 // sdot za.s[x8, 0], { z3.h-z4.h }, z1.h\n"
+ ".inst 0xc1601469 // sdot za.s[x8, 1], { z3.h-z4.h }, z0.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xa0412aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a146a // sdot za.s[x8, 2], { z3.h-z4.h }, z10.h\n"
+ ".inst 0xc162146b // sdot za.s[x8, 3], { z3.h-z4.h }, z2.h\n"
+ ".inst 0xc1691488 // sdot za.s[x8, 0], { z4.h-z5.h }, z9.h\n"
+ ".inst 0xc1681489 // sdot za.s[x8, 1], { z4.h-z5.h }, z8.h\n"
+ ".inst 0xa1412a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a148a // sdot za.s[x8, 2], { z4.h-z5.h }, z10.h\n"
+ ".inst 0xc162148b // sdot za.s[x8, 3], { z4.h-z5.h }, z2.h\n"
+ "10:" // Unpadded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z9.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z17.h, z9.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x15, x13\n"
+ "add z6.h, z6.h, z21.h\n"
+ "ld1b { z7.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z17.h, z7.h\n"
+ "csel x23, x15, x13, LT\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z7.h, z7.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z1.s }, p1/Z, [x20]\n"
+ "trn1 z8.h, z17.h, z1.h\n"
+ "add z8.h, z8.h, z21.h\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x22, SP, #4\n"
+ "addvl x21, SP, #8\n"
+ "ld1b { z2.s }, p1/Z, [x14]\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "ld1b { z19.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412ac3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16d14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z13.h\n"
+ "ld1b { z18.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16514cb // sdot za.s[x8, 3], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ "ld1b { z17.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ "trn1 z6.h, z2.h, z19.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16314eb // sdot za.s[x8, 3], { z7.h-z8.h }, z3.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16914ec // sdot za.s[x8, 4], { z7.h-z8.h }, z9.h\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "add z6.h, z6.h, z21.h\n"
+ ".inst 0xc16114ed // sdot za.s[x8, 5], { z7.h-z8.h }, z1.h\n"
+ "trn1 z7.h, z23.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "add z7.h, z7.h, z21.h\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add z8.h, z8.h, z21.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z7.h, z19.h, z18.h\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #8\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z9.h, z17.h, z16.h\n"
+ ".inst 0xc16a14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z10.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16214e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16d1508 // sdot za.s[x8, 0], { z8.h-z9.h }, z13.h\n"
+ ".inst 0xc1651509 // sdot za.s[x8, 1], { z8.h-z9.h }, z5.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z19.h, z18.h\n"
+ "trn1 z23.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ "trn1 z24.h, z17.h, z16.h\n"
+ ".inst 0xc16116c8 // sdot za.s[x8, 0], { z22.h-z23.h }, z1.h\n"
+ ".inst 0xc16016c9 // sdot za.s[x8, 1], { z22.h-z23.h }, z0.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0412aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d16ca // sdot za.s[x8, 2], { z22.h-z23.h }, z13.h\n"
+ ".inst 0xc16516cb // sdot za.s[x8, 3], { z22.h-z23.h }, z5.h\n"
+ ".inst 0xc16116e8 // sdot za.s[x8, 0], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016e9 // sdot za.s[x8, 1], { z23.h-z24.h }, z0.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16116ea // sdot za.s[x8, 2], { z23.h-z24.h }, z1.h\n"
+ ".inst 0xc16016eb // sdot za.s[x8, 3], { z23.h-z24.h }, z0.h\n"
+ "15:" // Padded: 0 priming loads
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "cbz x15, 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z19.s }, p0/Z, [x14]\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z6.h, z19.h, z18.h\n"
+ "trn1 z7.h, z17.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ "sub x15, x15, #0x1\n"
+ "sub x13, x13, #0x1\n"
+ "cmp x15, x13\n"
+ "trn1 z8.h, z17.h, z16.h\n"
+ "csel x23, x15, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "16:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z9.s }, p0/Z, [x14]\n"
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ "add z9.h, p0/M, z9.h, z21.h\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x22]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ "add z19.h, p0/M, z19.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z18.s }, p0/Z, [x22]\n"
+ "add z18.h, p0/M, z18.h, z21.h\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x22]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ "mov x12, #0x4\n"
+ "addvl x21, SP, #4\n"
+ "add z16.h, p0/M, z16.h, z21.h\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16b14ca // sdot za.s[x8, 2], { z6.h-z7.h }, z11.h\n"
+ "subs x23, x23, #0x1\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ ".inst 0xc16314cb // sdot za.s[x8, 3], { z6.h-z7.h }, z3.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z21.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ ".inst 0xa0412aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16d14cc // sdot za.s[x8, 4], { z6.h-z7.h }, z13.h\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ "ld1b { z2.s }, p0/Z, [x22]\n"
+ ".inst 0xc16514cd // sdot za.s[x8, 5], { z6.h-z7.h }, z5.h\n"
+ "add z2.h, p0/M, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc16b14ea // sdot za.s[x8, 2], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xa1402be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ ".inst 0xc16a14eb // sdot za.s[x8, 3], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa1412a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16b14ec // sdot za.s[x8, 4], { z7.h-z8.h }, z11.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z6.h, z9.h, z19.h\n"
+ ".inst 0xc16314ed // sdot za.s[x8, 5], { z7.h-z8.h }, z3.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0412bea // ld1h { z10.h-z11.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "trn1 z7.h, z18.h, z16.h\n"
+ "trn1 z8.h, z17.h, z2.h\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc16d14c8 // sdot za.s[x8, 0], { z6.h-z7.h }, z13.h\n"
+ "addvl x21, SP, #4\n"
+ "addvl x20, SP, #8\n"
+ ".inst 0xc16514c9 // sdot za.s[x8, 1], { z6.h-z7.h }, z5.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b14e8 // sdot za.s[x8, 0], { z7.h-z8.h }, z11.h\n"
+ ".inst 0xc16a14e9 // sdot za.s[x8, 1], { z7.h-z8.h }, z10.h\n"
+ ".inst 0xa0412aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc0060818 // mova { z24.d-z25.d }, za.d[x8, #0]\n"
+ ".inst 0xc006083a // mova { z26.d-z27.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z14.s\n"
+ ".inst 0xc16114ca // sdot za.s[x8, 2], { z6.h-z7.h }, z1.h\n"
+ ".inst 0xc1acaa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z12.s\n"
+ ".inst 0xc16014cb // sdot za.s[x8, 3], { z6.h-z7.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1afab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z15.s\n"
+ ".inst 0xc16914cc // sdot za.s[x8, 4], { z6.h-z7.h }, z9.h\n"
+ ".inst 0xc1bccfb8 // sclamp { z24.s-z27.s }, z29.s, z28.s\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc16114cd // sdot za.s[x8, 5], { z6.h-z7.h }, z1.h\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc16314ea // sdot za.s[x8, 2], { z7.h-z8.h }, z3.h\n"
+ "st1b { z25.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ ".inst 0xc16214eb // sdot za.s[x8, 3], { z7.h-z8.h }, z2.h\n"
+ ".inst 0xa0412a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ ".inst 0xc16114ec // sdot za.s[x8, 4], { z7.h-z8.h }, z1.h\n"
+ ".inst 0xc16014ed // sdot za.s[x8, 5], { z7.h-z8.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ "18:" // Main loop skip tail
+ "cbz x13, 20f\n"
+ "19:" // Right padding loop
+ ".inst 0xc0060804 // mova { z4.d-z5.d }, za.d[x8, #0]\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0060826 // mova { z6.d-z7.d }, za.d[x8, #1]\n"
+ ".inst 0xc1aeac04 // sqdmulh { z4.s-z7.s }, { z4.s-z7.s }, z14.s\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc1acaa24 // srshl { z4.s-z7.s }, { z4.s-z7.s }, z12.s\n"
+ ".inst 0xc0040bc4 // mova za.d[x8, #4], { z30.d-z31.d }\n"
+ ".inst 0xc1afab04 // add { z4.s-z7.s }, { z4.s-z7.s }, z15.s\n"
+ ".inst 0xc0040bc5 // mova za.d[x8, #5], { z30.d-z31.d }\n"
+ ".inst 0xc1bccfa4 // sclamp { z4.s-z7.s }, z29.s, z28.s\n"
+ "st1b { z4.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z6.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z5.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z7.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 19b\n"
+ "20:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #12\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..104c11fc9d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Forward declaration of the generated SME2 kernel (body lives in
+// sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp). Computes a planar
+// depthwise 3x3 stride-2 convolution producing four output rows per pass,
+// on uint8_t activations with int8_t weights, requantizing back to uint8_t
+// via the Requantize32 parameters.
+void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
+  const uint8_t *inptr,            // pointer to the first valid input element
+  size_t ld_in_row,                // input stride between rows (elements)
+  size_t ld_in_col,                // input stride between columns (elements)
+  size_t ld_in_vl,                 // input stride between vector-length channel blocks
+  unsigned int pad_top,            // rows of implicit zero padding above the input
+  unsigned int valid_input_rows,
+  unsigned int pad_left,           // columns of implicit zero padding left of the input
+  unsigned int valid_input_cols,
+  const int8_t *weights,           // packed signed weights for this kernel
+  uint8_t **outptrs,               // one output pointer per output row
+  const size_t *outlds,            // per-output-row column strides
+  const size_t *outvllds,          // per-output-row vector-length-block strides
+  unsigned int output_cols,
+  unsigned int start_channel,      // first channel handled by this call
+  unsigned int valid_channels,
+  const arm_gemm::Requantize32 &qp // quantization params (offsets, muls, shifts, clamp)
+);
+
+// Strategy wrapper that registers the generated SME2 kernel above with the
+// planar depthwise framework. It reports the kernel's fixed geometry
+// (3x3 filter, stride 2, four output rows per pass, SME vector length) to
+// the PlanarStrategy base and hands back the kernel entry point.
+// NOTE: this file is machine-generated glue; keep it in sync with the
+// kernel generator rather than editing by hand.
+class sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za : public PlanarStrategy<uint8_t, int8_t>
+{
+  using Parent = PlanarStrategy<uint8_t, int8_t>;
+
+  public:
+  using return_type = uint8_t;
+  // Fixed kernel geometry consumed by the depthwise framework.
+  constexpr static auto output_rows = 4u;
+  constexpr static auto kernel_rows = 3u, kernel_cols = 3u;
+  constexpr static auto stride_rows = 2u, stride_cols = 2u;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+  // CPUInfo is unused: the kernel's shape parameters are compile-time fixed.
+  sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za(const CPUInfo *)
+  : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  // Return the generated SME2 assembly kernel implementing this strategy.
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..8ce04fb8c2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,881 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_3x3_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 9u - std::min(9u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x9\n"
+ "ldr x7, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z11.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x6\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x17, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x17\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x7\n"
+ "addvl SP, SP, #-6\n"
+ "ldr x16, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z11.h, p2/M, z11.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z28.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z28.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1rh { z16.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "incw x22\n"
+ "mov z24.h, #0x0\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z3.h, z3.h, z16.h\n"
+ "trn1 z31.h, z26.h, z3.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "sub z21.h, z21.h, z16.h\n"
+ "mov x20, x22\n"
+ "trn1 z14.h, z21.h, z24.h\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z2.h, z2.h, z16.h\n"
+ "addvl x21, SP, #6\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "sub z25.h, z25.h, z16.h\n"
+ "incw x22\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "sub z27.h, z27.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "mov x20, x22\n"
+ "st1h { z31.h }, p2, [x21]\n"
+ "trn1 z4.h, z2.h, z25.h\n"
+ "ld1sb { z26.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #3\n"
+ "st1h { z14.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z12.h, z27.h, z24.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "sub z26.h, z26.h, z16.h\n"
+ "sub z23.h, z23.h, z16.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "sub z20.h, z20.h, z16.h\n"
+ "addvl x21, x21, #-2\n"
+ "st1h { z4.h }, p2, [x21]\n"
+ "mov z29.d, z28.d\n"
+ "st1h { z12.h }, p2, [x21, #1, MUL VL]\n"
+ "addvl x21, x21, #-2\n"
+ "mov z30.d, z28.d\n"
+ "mov z31.d, z28.d\n"
+ "trn1 z25.h, z26.h, z23.h\n"
+ "st1h { z25.h }, p2, [x21]\n"
+ "trn1 z3.h, z20.h, z24.h\n"
+ "st1h { z3.h }, p2, [x21, #1, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z9.s }, p1/Z, [x20, x16, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x15, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x15, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x14, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x17, x23, LSL #22\n"
+ "mov x22, #0x9\n"
+ "add x21, x7, x6\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x7, x14\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x14, x7, x20, x14\n"
+ ".inst 0xc0040f80 // mova za.d[x8, #0], { z28.d-z31.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040f81 // mova za.d[x8, #1], { z28.d-z31.d }\n"
+ "mov x22, #0x2\n"
+ "ldp x11, x10, [x23], #0x10\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ "ldp x27, x26, [x23], #0x10\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a9aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z9.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1adab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z13.s\n"
+ "sub x13, x13, x21\n"
+ ".inst 0xc1a7cd58 // sclamp { z24.s-z27.s }, z10.s, z7.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z24.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z25.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z26.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z27.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x7, x6\n"
+ "bne 12f\n"
+ "cbz x22, 10f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 2 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #4\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z15.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z15.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z21.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z19.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z5.d, z8.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "9:" // Unpadded: 1 priming loads
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x14]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z21.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z21.h\n"
+ "add z1.h, z1.h, z11.h\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z12.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z2.h, z2.h, z12.h\n"
+ "add z2.h, z2.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z3.h, z3.h, z8.h\n"
+ "add z3.h, z3.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z4.h, z4.h, z5.h\n"
+ "add z4.h, z4.h, z11.h\n"
+ "ld1b { z5.s }, p1/Z, [x21]\n"
+ "mov z5.d, z5.d\n"
+ "add z5.h, z5.h, z11.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701428 // sdot za.s[x8, 0], { z1.h-z4.h }, z0.h\n"
+ ".inst 0xc1781448 // sdot za.s[x8, 0], { z2.h-z5.h }, z8.h\n"
+ "10:" // Unpadded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ "sub x15, x15, #0x2\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z8.h\n"
+ "sub x13, x13, #0x1\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x15, #0x1\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z25.h\n"
+ "cmp x20, x13\n"
+ "ld1b { z23.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x23, x20, x13, LT\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z18.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z18.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "ld1b { z19.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z19.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "mov z25.d, z8.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x23\n"
+ "cbz x23, 17f\n"
+ "11:" // Unpadded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "add x22, x14, %x[ld_in_row]\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "addvl x21, SP, #2\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ "ld1b { z18.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z21.h, z21.h, z11.h\n"
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z8.h\n"
+ "add z22.h, z22.h, z11.h\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "add z23.h, z23.h, z11.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z8.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "ld1b { z4.s }, p1/Z, [x22]\n"
+ "mov z25.d, z4.d\n"
+ "add z25.h, z25.h, z11.h\n"
+ ".inst 0xa1402aa4 // ld1h { z4.h, z12.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17416a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z4.h\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "ld1b { z21.s }, p1/Z, [x14]\n"
+ ".inst 0xc17c16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z12.h\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ "ld1b { z12.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z12.h\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z20.h\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "ld1b { z23.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z24.h\n"
+ "add x27, x27, x25\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z3.h\n"
+ "add z21.h, z21.h, z11.h\n"
+ "ld1b { z3.s }, p1/Z, [x20]\n"
+ "mov z25.d, z3.d\n"
+ "add z22.h, z22.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "add z23.h, z23.h, z11.h\n"
+ "add z24.h, z24.h, z11.h\n"
+ "add z25.h, z25.h, z11.h\n"
+ "bgt 11b\n"
+ "b 17f\n"
+ "12:" // Padded
+ "cbz x22, 15f\n"
+ "cmp x22, #0x1\n"
+ "sub x15, x15, x22\n"
+ "beq 14f\n"
+ "13:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z4.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "addvl x20, SP, #4\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z1.d\n"
+ ".inst 0xc17416c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z4.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17c16e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z12.h\n"
+ "14:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x14]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z22.h, z22.h, z17.h\n"
+ "trn1 z23.h, z23.h, z5.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z15.s }, p0/Z, [x20]\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "addvl x20, SP, #2\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov z26.d, z15.d\n"
+ ".inst 0xc17016c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z0.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17116e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z1.h\n"
+ "15:" // Padded: 0 priming loads
+ "cmp x15, #0x2\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "blt 18f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z18.h\n"
+ "trn1 z22.h, z22.h, z3.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "sub x15, x15, #0x2\n"
+ "sub x13, x13, #0x1\n"
+ "trn1 z23.h, z23.h, z19.h\n"
+ "trn1 z24.h, z24.h, z20.h\n"
+ "lsr x20, x15, #0x1\n"
+ "cmp x20, x13\n"
+ "mov z25.d, z3.d\n"
+ "csel x22, x20, x13, LT\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "and x15, x15, #0x1\n"
+ "sub x13, x13, x22\n"
+ "cbz x22, 17f\n"
+ "16:" // Padded: Main loop
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa1402a84 // ld1h { z4.h, z12.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x21, x14, %x[ld_in_row]\n"
+ ".inst 0xc17416a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z4.h\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z14.s }, p0/Z, [x21]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17c16c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z12.h\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z15.s }, p0/Z, [x21]\n"
+ "mov x12, #0x4\n"
+ "add z15.h, p0/M, z15.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x21]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x21]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x21]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "mov x12, #0x8\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z14.h\n"
+ "trn1 z22.h, z22.h, z15.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "addvl x20, SP, #2\n"
+ "ld1b { z2.s }, p0/Z, [x21]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z4.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ ".inst 0xc17016a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "mov z25.d, z2.d\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z11.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "subs x22, x22, #0x1\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "trn1 z21.h, z21.h, z20.h\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z22.h, z22.h, z4.h\n"
+ "trn1 z23.h, z23.h, z27.h\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "trn1 z24.h, z24.h, z12.h\n"
+ "mov z25.d, z8.d\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "bgt 16b\n"
+ "17:" // Main loop tail
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "addvl x20, SP, #4\n"
+ "mov x12, #0x0\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ "ld1b { z0.s }, p0/Z, [x14]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z14.s }, p0/Z, [x20]\n"
+ "add z14.h, p0/M, z14.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z12.s }, p0/Z, [x20]\n"
+ "mov x12, #0x4\n"
+ "add z12.h, p0/M, z12.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z2.s }, p0/Z, [x20]\n"
+ "add z2.h, p0/M, z2.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add z3.h, p0/M, z3.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, p0/M, z25.h, z11.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #2\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ "trn1 z0.h, z0.h, z14.h\n"
+ "add x8, x8, #0x1\n"
+ "add z27.h, p0/M, z27.h, z11.h\n"
+ "trn1 z1.h, z1.h, z12.h\n"
+ "trn1 z2.h, z2.h, z21.h\n"
+ "add x14, x14, %x[ld_in_col]\n"
+ "trn1 z3.h, z3.h, z25.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ "mov z4.d, z27.d\n"
+ ".inst 0xc17e1408 // sdot za.s[x8, 0], { z0.h-z3.h }, z14.h\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17f1428 // sdot za.s[x8, 0], { z1.h-z4.h }, z15.h\n"
+ ".inst 0xa0402bee // ld1h { z14.h-z15.h }, pn10.b/Z, [SP]\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "18:" // Main loop skip tail
+ "cbz x15, 19f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z21.s }, p0/Z, [x14]\n"
+ "add z21.h, p0/M, z21.h, z11.h\n"
+ "add x20, x14, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z11.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z21.h, z21.h, z17.h\n"
+ "trn1 z22.h, z22.h, z0.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x20]\n"
+ "add z23.h, p0/M, z23.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z5.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z5.h, p0/M, z5.h, z11.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z4.s }, p0/Z, [x20]\n"
+ "add z4.h, p0/M, z4.h, z11.h\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z5.h\n"
+ "mov z25.d, z4.d\n"
+ "addvl x20, SP, #4\n"
+ ".inst 0xc17e16a8 // sdot za.s[x8, 0], { z21.h-z24.h }, z14.h\n"
+ "sub x13, x13, #0x1\n"
+ ".inst 0xc17f16c8 // sdot za.s[x8, 0], { z22.h-z25.h }, z15.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1a9aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z9.s\n"
+ ".inst 0xc17016a9 // sdot za.s[x8, 1], { z21.h-z24.h }, z0.h\n"
+ ".inst 0xc1adab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z13.s\n"
+ ".inst 0xc17116c9 // sdot za.s[x8, 1], { z22.h-z25.h }, z1.h\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a7cd50 // sclamp { z16.s-z19.s }, z10.s, z7.s\n"
+ "st1b { z16.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ "st1b { z17.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z18.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z19.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "19:" // Tail input: End
+ "cbz x13, 21f\n"
+ "20:" // Right padding loop
+ ".inst 0xc0060c00 // mova { z0.d-z3.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac00 // sqdmulh { z0.s-z3.s }, { z0.s-z3.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a9aa20 // srshl { z0.s-z3.s }, { z0.s-z3.s }, z9.s\n"
+ "subs x13, x13, #0x1\n"
+ ".inst 0xc0040f82 // mova za.d[x8, #2], { z28.d-z31.d }\n"
+ ".inst 0xc1adab00 // add { z0.s-z3.s }, { z0.s-z3.s }, z13.s\n"
+ ".inst 0xc1a7cd40 // sclamp { z0.s-z3.s }, z10.s, z7.s\n"
+ "st1b { z0.s }, p1, [x11]\n"
+ "add x11, x11, x9\n"
+ "st1b { z1.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z2.s }, p1, [x27]\n"
+ "add x27, x27, x25\n"
+ "st1b { z3.s }, p1, [x26]\n"
+ "add x26, x26, x24\n"
+ "bgt 20b\n"
+ "21:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x16\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "whilelt p1.s, x16, x17\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #6\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp
new file mode 100644
index 0000000000..52173b8551
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+);
+
+class sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za : public PlanarStrategy<uint8_t, int8_t>
+{
+ using Parent = PlanarStrategy<uint8_t, int8_t>;
+
+ public:
+ using return_type = uint8_t;
+ constexpr static auto output_rows = 4u;
+ constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+ constexpr static auto stride_rows = 1u, stride_cols = 1u;
+ constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+ sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za(const CPUInfo *)
+ : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+ {
+ }
+
+ typename Parent::KernelType get_kernel(void) const override
+ {
+ return sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl;
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..64023eeaff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za/generic.cpp
@@ -0,0 +1,1204 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_5x5_s1_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 8u - std::min(8u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x5, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0x8\n"
+ "ldr x6, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z17.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x5\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x7, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x7\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x6\n"
+ "addvl SP, SP, #-30\n"
+ "ldr x17, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z17.h, p2/M, z17.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z24.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z18.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z18.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x23, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x23\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z3.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z15.h, #0x0\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "incw x23\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z13.h, z13.h, z3.h\n"
+ "trn1 z11.h, z15.h, z2.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "trn1 z0.h, z2.h, z13.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "trn1 z26.h, z13.h, z27.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "mov x20, x23\n"
+ "trn1 z10.h, z27.h, z19.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z19.h, z19.h, z14.h\n"
+ "trn1 z1.h, z14.h, z15.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "addvl x22, SP, #30\n"
+ "ld1sb { z2.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "incw x23\n"
+ "sub z2.h, z2.h, z3.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "trn1 z20.h, z15.h, z9.h\n"
+ "incw x23\n"
+ "ldr x21, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "st1h { z0.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z22.h, z9.h, z5.h\n"
+ "ld1sb { z25.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z9.h, z5.h, z29.h\n"
+ "ld1sb { z21.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z10.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z26.h, z29.h, z2.h\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z28.h, z2.h, z23.h\n"
+ "ld1sb { z19.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z1.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z23.h, z15.h\n"
+ "sub z25.h, z25.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z21.h, z21.h, z3.h\n"
+ "ld1sb { z6.s }, p2/Z, [x20]\n"
+ "sub z0.h, z0.h, z3.h\n"
+ "mov x20, x23\n"
+ "sub z19.h, z19.h, z3.h\n"
+ "sub z6.h, z6.h, z3.h\n"
+ "st1h { z20.h }, p2, [x22]\n"
+ "incw x23\n"
+ "st1h { z22.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z11.h, z15.h, z25.h\n"
+ "trn1 z10.h, z25.h, z21.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z9.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z14.h, z21.h, z0.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z26.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z21.h, z0.h, z19.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z28.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z19.h, z19.h, z6.h\n"
+ "ld1sb { z29.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z13.h, z6.h, z15.h\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z23.h, z23.h, z3.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "addvl x22, x22, #-6\n"
+ "sub z27.h, z27.h, z3.h\n"
+ "sub z29.h, z29.h, z3.h\n"
+ "mov x20, x23\n"
+ "st1h { z11.h }, p2, [x22]\n"
+ "sub z1.h, z1.h, z3.h\n"
+ "st1h { z10.h }, p2, [x22, #1, MUL VL]\n"
+ "trn1 z30.h, z15.h, z5.h\n"
+ "trn1 z26.h, z5.h, z23.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z14.h }, p2, [x22, #2, MUL VL]\n"
+ "trn1 z22.h, z23.h, z27.h\n"
+ "ld1sb { z5.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z21.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z28.h, z27.h, z29.h\n"
+ "ld1sb { z8.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z19.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z27.h, z29.h, z1.h\n"
+ "ld1sb { z9.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z13.h }, p2, [x22, #5, MUL VL]\n"
+ "trn1 z2.h, z1.h, z15.h\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "sub z11.h, z11.h, z3.h\n"
+ "addvl x22, x22, #-6\n"
+ "sub z5.h, z5.h, z3.h\n"
+ "sub z8.h, z8.h, z3.h\n"
+ "st1h { z30.h }, p2, [x22]\n"
+ "sub z9.h, z9.h, z3.h\n"
+ "sub z14.h, z14.h, z3.h\n"
+ "st1h { z26.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x22, #2, MUL VL]\n"
+ "mov z19.d, z18.d\n"
+ "trn1 z22.h, z15.h, z11.h\n"
+ "st1h { z28.h }, p2, [x22, #3, MUL VL]\n"
+ "trn1 z1.h, z11.h, z5.h\n"
+ "trn1 z31.h, z5.h, z8.h\n"
+ "st1h { z27.h }, p2, [x22, #4, MUL VL]\n"
+ "trn1 z8.h, z8.h, z9.h\n"
+ "trn1 z21.h, z9.h, z14.h\n"
+ "st1h { z2.h }, p2, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #-6\n"
+ "trn1 z15.h, z14.h, z15.h\n"
+ "st1h { z22.h }, p2, [x22]\n"
+ "st1h { z1.h }, p2, [x22, #1, MUL VL]\n"
+ "st1h { z31.h }, p2, [x22, #2, MUL VL]\n"
+ "st1h { z8.h }, p2, [x22, #3, MUL VL]\n"
+ "st1h { z21.h }, p2, [x22, #4, MUL VL]\n"
+ "st1h { z15.h }, p2, [x22, #5, MUL VL]\n"
+ "cbz x21, 3f\n"
+ "ld1w { z7.s }, p1/Z, [x21, x17, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x17, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x25, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x25, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x16, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x7, x23, LSL #22\n"
+ "mov x22, #0x8\n"
+ "add x21, x6, x5\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x15, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x11, #0x0\n"
+ "mov x8, #0x8\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x6, x16\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x16, x6, x20, x16\n"
+ ".inst 0xc0046a40 // mova za.d[x11, #0], { z18.d-z19.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0046a41 // mova za.d[x11, #1], { z18.d-z19.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x14, x13, [x23], #0x10\n"
+ ".inst 0xc0046a42 // mova za.d[x11, #2], { z18.d-z19.d }\n"
+ "ldp x4, x10, [x20], #0x10\n"
+ ".inst 0xc0046a43 // mova za.d[x11, #3], { z18.d-z19.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0046a44 // mova za.d[x11, #4], { z18.d-z19.d }\n"
+ "ldp x9, x28, [x23], #0x10\n"
+ ".inst 0xc0046a45 // mova za.d[x11, #5], { z18.d-z19.d }\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ ".inst 0xc0046a46 // mova za.d[x11, #6], { z18.d-z19.d }\n"
+ ".inst 0xc0046a47 // mova za.d[x11, #7], { z18.d-z19.d }\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0066814 // mova { z20.d-z21.d }, za.d[x11, #0]\n"
+ "sub x15, x15, x21\n"
+ ".inst 0xc0066836 // mova { z22.d-z23.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac14 // sqdmulh { z20.s-z23.s }, { z20.s-z23.s }, z7.s\n"
+ ".inst 0xc1a4aa34 // srshl { z20.s-z23.s }, { z20.s-z23.s }, z4.s\n"
+ ".inst 0xc1acab14 // add { z20.s-z23.s }, { z20.s-z23.s }, z12.s\n"
+ ".inst 0xc1b0cf14 // sclamp { z20.s-z23.s }, z24.s, z16.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z20.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z22.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z21.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z23.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x6, x5\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x16, %x[ld_in_row]\n"
+ "ld1b { z1.s }, p1/Z, [x16]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z1.h, z28.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z2.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z1.h, z2.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z13.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z6.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z13.h, z6.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16a7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z10.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "add z30.h, z30.h, z17.h\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1617789 // sdot za.s[x11, 1], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z11.h\n"
+ ".inst 0xc16a77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z10.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x22, x16, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z20.h, z2.h, z28.h\n"
+ "add z20.h, z20.h, z17.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z11.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z21.h, z31.h, z11.h\n"
+ "add z21.h, z21.h, z17.h\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z22.h, z25.h, z8.h\n"
+ "add z22.h, z22.h, z17.h\n"
+ "ld1b { z8.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16e7688 // sdot za.s[x11, 0], { z20.h-z21.h }, z14.h\n"
+ "ld1b { z3.s }, p1/Z, [x22]\n"
+ "trn1 z23.h, z8.h, z3.h\n"
+ ".inst 0xc1667689 // sdot za.s[x11, 1], { z20.h-z21.h }, z6.h\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc161768a // sdot za.s[x11, 2], { z20.h-z21.h }, z1.h\n"
+ "add z23.h, z23.h, z17.h\n"
+ ".inst 0xa1412aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc160768b // sdot za.s[x11, 3], { z20.h-z21.h }, z0.h\n"
+ ".inst 0xc16976a8 // sdot za.s[x11, 0], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xa0422aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16176a9 // sdot za.s[x11, 1], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16976aa // sdot za.s[x11, 2], { z21.h-z22.h }, z9.h\n"
+ ".inst 0xc16176ab // sdot za.s[x11, 3], { z21.h-z22.h }, z1.h\n"
+ ".inst 0xc16f76c8 // sdot za.s[x11, 0], { z22.h-z23.h }, z15.h\n"
+ ".inst 0xc16e76c9 // sdot za.s[x11, 1], { z22.h-z23.h }, z14.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16b76ca // sdot za.s[x11, 2], { z22.h-z23.h }, z11.h\n"
+ ".inst 0xc16a76cb // sdot za.s[x11, 3], { z22.h-z23.h }, z10.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x23, x16, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x16]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z22.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z0.h, z2.h, z22.h\n"
+ "add z0.h, z0.h, z17.h\n"
+ "ld1b { z14.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z1.h, z14.h, z6.h\n"
+ "add z1.h, z1.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z6.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z2.h, z15.h, z6.h\n"
+ "add z2.h, z2.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16f7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x23]\n"
+ "trn1 z3.h, z21.h, z30.h\n"
+ ".inst 0xc16e7409 // sdot za.s[x11, 1], { z0.h-z1.h }, z14.h\n"
+ ".inst 0xa1402aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d740a // sdot za.s[x11, 2], { z0.h-z1.h }, z13.h\n"
+ "add z3.h, z3.h, z17.h\n"
+ ".inst 0xa0412ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc165740b // sdot za.s[x11, 3], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xa0402a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16f7428 // sdot za.s[x11, 0], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e7429 // sdot za.s[x11, 1], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xa0422ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16b740c // sdot za.s[x11, 4], { z0.h-z1.h }, z11.h\n"
+ ".inst 0xc16a740d // sdot za.s[x11, 5], { z0.h-z1.h }, z10.h\n"
+ ".inst 0xc16f742a // sdot za.s[x11, 2], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742b // sdot za.s[x11, 3], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xa0412a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1697448 // sdot za.s[x11, 0], { z2.h-z3.h }, z9.h\n"
+ ".inst 0xc1687449 // sdot za.s[x11, 1], { z2.h-z3.h }, z8.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f742c // sdot za.s[x11, 4], { z1.h-z2.h }, z15.h\n"
+ ".inst 0xc16e742d // sdot za.s[x11, 5], { z1.h-z2.h }, z14.h\n"
+ ".inst 0xc16b744a // sdot za.s[x11, 2], { z2.h-z3.h }, z11.h\n"
+ ".inst 0xc16a744b // sdot za.s[x11, 3], { z2.h-z3.h }, z10.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc161744c // sdot za.s[x11, 4], { z2.h-z3.h }, z1.h\n"
+ ".inst 0xc160744d // sdot za.s[x11, 5], { z2.h-z3.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x24, x16, %x[ld_in_row]\n"
+ "ld1b { z0.s }, p1/Z, [x16]\n"
+ "addvl x23, SP, #6\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z28.h, z0.h, z3.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "ld1b { z6.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x22, SP, #12\n"
+ "ld1b { z30.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z29.h, z6.h, z30.h\n"
+ "add z29.h, z29.h, z17.h\n"
+ "ld1b { z1.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x21, SP, #18\n"
+ "ld1b { z25.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "trn1 z30.h, z1.h, z25.h\n"
+ "add z30.h, z30.h, z17.h\n"
+ "ld1b { z3.s }, p1/Z, [x24]\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z5.s }, p1/Z, [x24]\n"
+ "trn1 z31.h, z3.h, z5.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16e778a // sdot za.s[x11, 2], { z28.h-z29.h }, z14.h\n"
+ "add z31.h, z31.h, z17.h\n"
+ ".inst 0xa1412ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc166778b // sdot za.s[x11, 3], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16a77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z10.h\n"
+ ".inst 0xc16277a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z2.h\n"
+ ".inst 0xa0412ac8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16f778c // sdot za.s[x11, 4], { z28.h-z29.h }, z15.h\n"
+ ".inst 0xc16e778d // sdot za.s[x11, 5], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xa1402a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16877ab // sdot za.s[x11, 3], { z29.h-z30.h }, z8.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e778e // sdot za.s[x11, 6], { z28.h-z29.h }, z14.h\n"
+ ".inst 0xc166778f // sdot za.s[x11, 7], { z28.h-z29.h }, z6.h\n"
+ ".inst 0xc16d77ac // sdot za.s[x11, 4], { z29.h-z30.h }, z13.h\n"
+ ".inst 0xc16577ad // sdot za.s[x11, 5], { z29.h-z30.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z10.h\n"
+ ".inst 0xc16277cb // sdot za.s[x11, 3], { z30.h-z31.h }, z2.h\n"
+ ".inst 0xa0422aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ae // sdot za.s[x11, 6], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xc16677af // sdot za.s[x11, 7], { z29.h-z30.h }, z6.h\n"
+ ".inst 0xc16977cc // sdot za.s[x11, 4], { z30.h-z31.h }, z9.h\n"
+ ".inst 0xc16877cd // sdot za.s[x11, 5], { z30.h-z31.h }, z8.h\n"
+ ".inst 0xa1422a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16e77ce // sdot za.s[x11, 6], { z30.h-z31.h }, z14.h\n"
+ ".inst 0xc16677cf // sdot za.s[x11, 7], { z30.h-z31.h }, z6.h\n"
+ "12:" // Unpadded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x16]\n"
+ "sub x25, x25, #0x1\n"
+ "ld1b { z28.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z26.h, z28.h\n"
+ "sub x15, x15, #0x1\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "cmp x25, x15\n"
+ "add z25.h, z25.h, z17.h\n"
+ "ld1b { z15.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z31.h, z15.h\n"
+ "csel x25, x25, x15, LT\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z26.h, z26.h, z17.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "ld1b { z8.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z22.h, z8.h\n"
+ "add z27.h, z27.h, z17.h\n"
+ "ld1b { z21.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "sub x15, x15, x25\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "trn1 z28.h, z21.h, z20.h\n"
+ "add z28.h, z28.h, z17.h\n"
+ "cbz x25, 21f\n"
+ "13:" // Unpadded: Main loop
+ "addvl x24, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #12\n"
+ "ld1b { z21.s }, p1/Z, [x16]\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402b0e // ld1h { z14.h-z15.h }, pn10.b/Z, [x24]\n"
+ "addvl x22, SP, #18\n"
+ "addvl x21, SP, #24\n"
+ ".inst 0xc16f772a // sdot za.s[x11, 2], { z25.h-z26.h }, z15.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ "ld1b { z0.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772b // sdot za.s[x11, 3], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa1402ae6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x23]\n"
+ "subs x25, x25, #0x1\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z20.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412b05 // ld1h { z5.h, z13.h }, pn10.b/Z, [x24, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z29.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z22.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422b02 // ld1h { z2.h, z10.h }, pn10.b/Z, [x24, #0x4, MUL VL]\n"
+ ".inst 0xc16f772e // sdot za.s[x11, 6], { z25.h-z26.h }, z15.h\n"
+ "ld1b { z30.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc16e772f // sdot za.s[x11, 7], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xa0402aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16d774c // sdot za.s[x11, 4], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z6.s }, p1/Z, [x20]\n"
+ ".inst 0xc165774d // sdot za.s[x11, 5], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16a776a // sdot za.s[x11, 2], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776b // sdot za.s[x11, 3], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16a776c // sdot za.s[x11, 4], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422aaa // ld1h { z10.h-z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ "trn1 z25.h, z21.h, z0.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xc16d1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z13.h\n"
+ "add z25.h, z25.h, z17.h\n"
+ ".inst 0xc1651749 // sdot za.s[x8, 1], { z26.h-z27.h }, z5.h\n"
+ "trn1 z26.h, z20.h, z31.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ "add z26.h, z26.h, z17.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "trn1 z27.h, z29.h, z22.h\n"
+ "trn1 z28.h, z30.h, z6.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "add z27.h, z27.h, z17.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "add z28.h, z28.h, z17.h\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x25, x25, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z9.s }, p0/Z, [x16]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x21, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z9.h, z22.h\n"
+ "trn1 z0.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x21]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa1402a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20]\n"
+ "trn1 z1.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16a77e8 // sdot za.s[x11, 0], { z31.h-z0.h }, z10.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc16277e9 // sdot za.s[x11, 1], { z31.h-z0.h }, z2.h\n"
+ ".inst 0xa1412a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ "trn1 z2.h, z21.h, z20.h\n"
+ ".inst 0xc16d7408 // sdot za.s[x11, 0], { z0.h-z1.h }, z13.h\n"
+ ".inst 0xa0422a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1657409 // sdot za.s[x11, 1], { z0.h-z1.h }, z5.h\n"
+ ".inst 0xc1697428 // sdot za.s[x11, 0], { z1.h-z2.h }, z9.h\n"
+ ".inst 0xc1687429 // sdot za.s[x11, 1], { z1.h-z2.h }, z8.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z5.s }, p0/Z, [x16]\n"
+ "add z5.h, p0/M, z5.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z28.h, z5.h, z22.h\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ "trn1 z30.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #24\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc1617788 // sdot za.s[x11, 0], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc1607789 // sdot za.s[x11, 1], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z21.h, z20.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xa0412aae // ld1h { z14.h-z15.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169778a // sdot za.s[x11, 2], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc161778b // sdot za.s[x11, 3], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xa1422aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16f77a8 // sdot za.s[x11, 0], { z29.h-z30.h }, z15.h\n"
+ ".inst 0xc16e77a9 // sdot za.s[x11, 1], { z29.h-z30.h }, z14.h\n"
+ ".inst 0xa1412a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16977aa // sdot za.s[x11, 2], { z29.h-z30.h }, z9.h\n"
+ ".inst 0xc16177ab // sdot za.s[x11, 3], { z29.h-z30.h }, z1.h\n"
+ ".inst 0xc16b77c8 // sdot za.s[x11, 0], { z30.h-z31.h }, z11.h\n"
+ ".inst 0xc16377c9 // sdot za.s[x11, 1], { z30.h-z31.h }, z3.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f77ca // sdot za.s[x11, 2], { z30.h-z31.h }, z15.h\n"
+ ".inst 0xc16e77cb // sdot za.s[x11, 3], { z30.h-z31.h }, z14.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x16]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z8.h, z29.h, z22.h\n"
+ "trn1 z9.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402ace // ld1h { z14.h-z15.h }, pn10.b/Z, [x22]\n"
+ "trn1 z10.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x21, SP, #18\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7508 // sdot za.s[x11, 0], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e7509 // sdot za.s[x11, 1], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ "addvl x20, SP, #24\n"
+ "trn1 z11.h, z21.h, z20.h\n"
+ ".inst 0xa1412ac5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e750a // sdot za.s[x11, 2], { z8.h-z9.h }, z14.h\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc166750b // sdot za.s[x11, 3], { z8.h-z9.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16d7528 // sdot za.s[x11, 0], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc1657529 // sdot za.s[x11, 1], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16f750c // sdot za.s[x11, 4], { z8.h-z9.h }, z15.h\n"
+ ".inst 0xc16e750d // sdot za.s[x11, 5], { z8.h-z9.h }, z14.h\n"
+ ".inst 0xc16d752a // sdot za.s[x11, 2], { z9.h-z10.h }, z13.h\n"
+ ".inst 0xc165752b // sdot za.s[x11, 3], { z9.h-z10.h }, z5.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc1617548 // sdot za.s[x11, 0], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc1607549 // sdot za.s[x11, 1], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e752c // sdot za.s[x11, 4], { z9.h-z10.h }, z14.h\n"
+ ".inst 0xc166752d // sdot za.s[x11, 5], { z9.h-z10.h }, z6.h\n"
+ ".inst 0xc161754a // sdot za.s[x11, 2], { z10.h-z11.h }, z1.h\n"
+ ".inst 0xc160754b // sdot za.s[x11, 3], { z10.h-z11.h }, z0.h\n"
+ ".inst 0xa0422a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f754c // sdot za.s[x11, 4], { z10.h-z11.h }, z15.h\n"
+ ".inst 0xc16e754d // sdot za.s[x11, 5], { z10.h-z11.h }, z14.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x16]\n"
+ "add z1.h, p0/M, z1.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z1.h, z22.h\n"
+ "trn1 z27.h, z21.h, z20.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "addvl x23, SP, #6\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xa0402aee // ld1h { z14.h-z15.h }, pn10.b/Z, [x23]\n"
+ "trn1 z28.h, z22.h, z20.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "addvl x22, SP, #12\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ ".inst 0xc16f7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z15.h\n"
+ ".inst 0xc16e7749 // sdot za.s[x11, 1], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xa0402ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "trn1 z29.h, z21.h, z20.h\n"
+ ".inst 0xa0412aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc161774a // sdot za.s[x11, 2], { z26.h-z27.h }, z1.h\n"
+ "addvl x20, SP, #24\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc160774b // sdot za.s[x11, 3], { z26.h-z27.h }, z0.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc16b7768 // sdot za.s[x11, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xa0422ae8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16a7769 // sdot za.s[x11, 1], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc16e774c // sdot za.s[x11, 4], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774d // sdot za.s[x11, 5], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1402a85 // ld1h { z5.h, z13.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc1697788 // sdot za.s[x11, 0], { z28.h-z29.h }, z9.h\n"
+ ".inst 0xc1687789 // sdot za.s[x11, 1], { z28.h-z29.h }, z8.h\n"
+ ".inst 0xa1422ac2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xc16e776c // sdot za.s[x11, 4], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776d // sdot za.s[x11, 5], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc16a778a // sdot za.s[x11, 2], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778b // sdot za.s[x11, 3], { z28.h-z29.h }, z2.h\n"
+ ".inst 0xa0422aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16e776e // sdot za.s[x11, 6], { z27.h-z28.h }, z14.h\n"
+ ".inst 0xc166776f // sdot za.s[x11, 7], { z27.h-z28.h }, z6.h\n"
+ ".inst 0xc161778c // sdot za.s[x11, 4], { z28.h-z29.h }, z1.h\n"
+ ".inst 0xc160778d // sdot za.s[x11, 5], { z28.h-z29.h }, z0.h\n"
+ ".inst 0xa1422a82 // ld1h { z2.h, z10.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16a778e // sdot za.s[x11, 6], { z28.h-z29.h }, z10.h\n"
+ ".inst 0xc162778f // sdot za.s[x11, 7], { z28.h-z29.h }, z2.h\n"
+ "19:" // Padded: 0 priming loads
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "cbz x25, 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z6.s }, p0/Z, [x16]\n"
+ "add z6.h, p0/M, z6.h, z17.h\n"
+ "add x20, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z17.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z6.h, z30.h\n"
+ "trn1 z26.h, z27.h, z26.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z9.s }, p0/Z, [x20]\n"
+ "add z9.h, p0/M, z9.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z21.s }, p0/Z, [x20]\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ "sub x25, x25, #0x1\n"
+ "sub x15, x15, #0x1\n"
+ "cmp x25, x15\n"
+ "trn1 z27.h, z8.h, z9.h\n"
+ "trn1 z28.h, z21.h, z29.h\n"
+ "csel x25, x25, x15, LT\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ "sub x15, x15, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z8.s }, p0/Z, [x16]\n"
+ "add z8.h, p0/M, z8.h, z17.h\n"
+ "add x24, x16, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z21.s }, p0/Z, [x24]\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x22, SP, #12\n"
+ "add z21.h, p0/M, z21.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ "ld1b { z29.s }, p0/Z, [x24]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ "add z29.h, p0/M, z29.h, z17.h\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ "mov x12, #0x4\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z30.s }, p0/Z, [x24]\n"
+ "add z30.h, p0/M, z30.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc16d774a // sdot za.s[x11, 2], { z26.h-z27.h }, z13.h\n"
+ "ld1b { z15.s }, p0/Z, [x24]\n"
+ "add z15.h, p0/M, z15.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc165774b // sdot za.s[x11, 3], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412aca // ld1h { z10.h-z11.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "add x16, x16, %x[ld_in_col]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ "ld1b { z20.s }, p0/Z, [x24]\n"
+ "add z20.h, p0/M, z20.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ "ld1b { z31.s }, p0/Z, [x24]\n"
+ "add z31.h, p0/M, z31.h, z17.h\n"
+ "add x24, x24, %x[ld_in_row]\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ ".inst 0xc16b774c // sdot za.s[x11, 4], { z26.h-z27.h }, z11.h\n"
+ "ld1b { z22.s }, p0/Z, [x24]\n"
+ "add z22.h, p0/M, z22.h, z17.h\n"
+ ".inst 0xc16a774d // sdot za.s[x11, 5], { z26.h-z27.h }, z10.h\n"
+ ".inst 0xa1412aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc169776a // sdot za.s[x11, 2], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776b // sdot za.s[x11, 3], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422ac0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16e774e // sdot za.s[x11, 6], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc166774f // sdot za.s[x11, 7], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412a86 // ld1h { z6.h, z14.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc161776c // sdot za.s[x11, 4], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc160776d // sdot za.s[x11, 5], { z27.h-z28.h }, z0.h\n"
+ ".inst 0xa1422aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc169776e // sdot za.s[x11, 6], { z27.h-z28.h }, z9.h\n"
+ ".inst 0xc161776f // sdot za.s[x11, 7], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xa0422a8a // ld1h { z10.h-z11.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc1631728 // sdot za.s[x8, 0], { z25.h-z26.h }, z3.h\n"
+ ".inst 0xc1621729 // sdot za.s[x8, 1], { z25.h-z26.h }, z2.h\n"
+ ".inst 0xa0402be0 // ld1h { z0.h-z1.h }, pn10.b/Z, [SP]\n"
+ "trn1 z25.h, z8.h, z21.h\n"
+ ".inst 0xc16e1748 // sdot za.s[x8, 0], { z26.h-z27.h }, z14.h\n"
+ ".inst 0xc1661749 // sdot za.s[x8, 1], { z26.h-z27.h }, z6.h\n"
+ ".inst 0xa1412be5 // ld1h { z5.h, z13.h }, pn10.b/Z, [SP, #0x2, MUL VL]\n"
+ "trn1 z26.h, z29.h, z30.h\n"
+ ".inst 0xc16b1768 // sdot za.s[x8, 0], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a1769 // sdot za.s[x8, 1], { z27.h-z28.h }, z10.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xa0422be2 // ld1h { z2.h-z3.h }, pn10.b/Z, [SP, #0x4, MUL VL]\n"
+ "trn1 z27.h, z15.h, z20.h\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "trn1 z28.h, z31.h, z22.h\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ "addvl x23, SP, #6\n"
+ ".inst 0xc1617728 // sdot za.s[x11, 0], { z25.h-z26.h }, z1.h\n"
+ "addvl x22, SP, #12\n"
+ ".inst 0xc1607729 // sdot za.s[x11, 1], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa0402ae0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #18\n"
+ "addvl x20, SP, #24\n"
+ ".inst 0xc161772a // sdot za.s[x11, 2], { z25.h-z26.h }, z1.h\n"
+ ".inst 0xc160772b // sdot za.s[x11, 3], { z25.h-z26.h }, z0.h\n"
+ ".inst 0xa1402ac6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc16d7748 // sdot za.s[x11, 0], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc1657749 // sdot za.s[x11, 1], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa1412ae1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x23, #0x2, MUL VL]\n"
+ ".inst 0xc16e772c // sdot za.s[x11, 4], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772d // sdot za.s[x11, 5], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa1402aa6 // ld1h { z6.h, z14.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc169774a // sdot za.s[x11, 2], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774b // sdot za.s[x11, 3], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412ac1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x22, #0x2, MUL VL]\n"
+ ".inst 0xc1637768 // sdot za.s[x11, 0], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc1627769 // sdot za.s[x11, 1], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422aea // ld1h { z10.h-z11.h }, pn10.b/Z, [x23, #0x4, MUL VL]\n"
+ ".inst 0xc16e772e // sdot za.s[x11, 6], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc166772f // sdot za.s[x11, 7], { z25.h-z26.h }, z6.h\n"
+ ".inst 0xa0402a8e // ld1h { z14.h-z15.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc169774c // sdot za.s[x11, 4], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc161774d // sdot za.s[x11, 5], { z26.h-z27.h }, z1.h\n"
+ ".inst 0xa1412aa5 // ld1h { z5.h, z13.h }, pn10.b/Z, [x21, #0x2, MUL VL]\n"
+ ".inst 0xc16b776a // sdot za.s[x11, 2], { z27.h-z28.h }, z11.h\n"
+ ".inst 0xc16a776b // sdot za.s[x11, 3], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xa0422ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22, #0x4, MUL VL]\n"
+ ".inst 0xc16d774e // sdot za.s[x11, 6], { z26.h-z27.h }, z13.h\n"
+ ".inst 0xc165774f // sdot za.s[x11, 7], { z26.h-z27.h }, z5.h\n"
+ ".inst 0xa0412a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20, #0x2, MUL VL]\n"
+ ".inst 0xc163776c // sdot za.s[x11, 4], { z27.h-z28.h }, z3.h\n"
+ ".inst 0xc162776d // sdot za.s[x11, 5], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa1422aa2 // ld1h { z2.h, z10.h }, pn10.b/Z, [x21, #0x4, MUL VL]\n"
+ ".inst 0xc16a776e // sdot za.s[x11, 6], { z27.h-z28.h }, z10.h\n"
+ ".inst 0xc162776f // sdot za.s[x11, 7], { z27.h-z28.h }, z2.h\n"
+ ".inst 0xa0422a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20, #0x4, MUL VL]\n"
+ ".inst 0xc16f1728 // sdot za.s[x8, 0], { z25.h-z26.h }, z15.h\n"
+ ".inst 0xc16e1729 // sdot za.s[x8, 1], { z25.h-z26.h }, z14.h\n"
+ ".inst 0xc1691748 // sdot za.s[x8, 0], { z26.h-z27.h }, z9.h\n"
+ ".inst 0xc1681749 // sdot za.s[x8, 1], { z26.h-z27.h }, z8.h\n"
+ ".inst 0xc1611768 // sdot za.s[x8, 0], { z27.h-z28.h }, z1.h\n"
+ ".inst 0xc1601769 // sdot za.s[x8, 1], { z27.h-z28.h }, z0.h\n"
+ "add x8, x8, #0x2\n"
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "22:" // Main loop skip tail
+ "cbz x15, 24f\n"
+ "23:" // Right padding loop
+ ".inst 0xc0066808 // mova { z8.d-z9.d }, za.d[x11, #0]\n"
+ "add x8, x8, #0x2\n"
+ "subs x15, x15, #0x1\n"
+ ".inst 0xc006682a // mova { z10.d-z11.d }, za.d[x11, #1]\n"
+ ".inst 0xc1a7ac08 // sqdmulh { z8.s-z11.s }, { z8.s-z11.s }, z7.s\n"
+ "add x11, x11, #0x2\n"
+ ".inst 0xc1a4aa28 // srshl { z8.s-z11.s }, { z8.s-z11.s }, z4.s\n"
+ ".inst 0xc0040a40 // mova za.d[x8, #0], { z18.d-z19.d }\n"
+ ".inst 0xc1acab08 // add { z8.s-z11.s }, { z8.s-z11.s }, z12.s\n"
+ ".inst 0xc0040a41 // mova za.d[x8, #1], { z18.d-z19.d }\n"
+ ".inst 0xc1b0cf08 // sclamp { z8.s-z11.s }, z24.s, z16.s\n"
+ "st1b { z8.s }, p1, [x14]\n"
+ "add x14, x14, x4\n"
+ "st1b { z10.s }, p1, [x13]\n"
+ "add x13, x13, x10\n"
+ "st1b { z9.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "st1b { z11.s }, p1, [x28]\n"
+ "add x28, x28, x26\n"
+ "bgt 23b\n"
+ "24:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x17\n"
+ "whilelt p1.s, x17, x7\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #30\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp
new file mode 100644
index 0000000000..ad82070912
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+// Kernel entry point (defined in the matching generic.cpp): planar depthwise
+// convolution with a 5x5 kernel and stride 2, producing 4 output rows per
+// call, implemented with SME2 ZA-tile dot-product instructions.
+// Takes u8 input, s8 weights, and writes u8 output requantized via `qp`.
+void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
+  const uint8_t *inptr,  // base pointer of the input tensor
+  size_t ld_in_row,  // input stride between successive rows
+  size_t ld_in_col,  // input stride between successive columns
+  size_t ld_in_vl,  // input stride between vector-length channel blocks
+  unsigned int pad_top,  // rows of implicit zero padding above the input
+  unsigned int valid_input_rows,  // with pad_top, determines bottom padding
+  unsigned int pad_left,  // columns of implicit zero padding left of the input
+  unsigned int valid_input_cols,
+  const int8_t *weights,  // packed signed 8-bit weights
+  uint8_t **outptrs,  // one output pointer per output row
+  const size_t *outlds,  // per-output-row column strides
+  const size_t *outvllds,  // per-output-row vector-length strides
+  unsigned int output_cols,
+  unsigned int start_channel,  // first channel this call processes
+  unsigned int valid_channels,  // number of channels to process
+  const arm_gemm::Requantize32 &qp  // offsets, multipliers/shifts, clamp bounds
+);
+
+// Strategy wrapper exposing the assembly kernel declared above to the planar
+// depthwise framework.  The static constants record the kernel geometry
+// (5x5 window, 2x2 stride, 4 output rows per invocation) and the SME
+// vector-length class, which the framework uses for implementation selection.
+class sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za : public PlanarStrategy<uint8_t, int8_t>
+{
+  using Parent = PlanarStrategy<uint8_t, int8_t>;
+
+  public:
+  using return_type = uint8_t;
+  constexpr static auto output_rows = 4u;
+  constexpr static auto kernel_rows = 5u, kernel_cols = 5u;
+  constexpr static auto stride_rows = 2u, stride_cols = 2u;
+  constexpr static auto vl_type = arm_gemm::VLType::SME;
+
+  // Forwards the geometry constants to the generic PlanarStrategy base;
+  // the CPUInfo argument is unused by this strategy.
+  sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za(const CPUInfo *)
+    : Parent(kernel_rows, kernel_cols, stride_rows, stride_cols, output_rows, vl_type)
+  {
+  }
+
+  // Returns the hand-written SME2 assembly kernel implementing this strategy.
+  typename Parent::KernelType get_kernel(void) const override
+  {
+    return sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl;
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
new file mode 100644
index 0000000000..d8dc69127e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za/generic.cpp
@@ -0,0 +1,1354 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SME2)
+
+#include <algorithm>
+#include <cstddef>
+#include "arm_gemm.hpp"
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+void sme2_u8s8u8q_planar_5x5_s2_4rows_dot_za_impl(
+ const uint8_t *inptr,
+ size_t ld_in_row,
+ size_t ld_in_col,
+ size_t ld_in_vl,
+ unsigned int pad_top,
+ unsigned int valid_input_rows,
+ unsigned int pad_left,
+ unsigned int valid_input_cols,
+ const int8_t *weights,
+ uint8_t **outptrs,
+ const size_t *outlds,
+ const size_t *outvllds,
+ unsigned int output_cols,
+ unsigned int start_channel,
+ unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp
+)
+{
+ struct Args
+ {
+ const uint8_t *inptr;
+ size_t ld_in_vl;
+ long unsigned int pad_top, pad_bottom, pad_left;
+ const int8_t *weights;
+ long unsigned int input_cols, output_cols;
+ uint8_t **outptrs;
+ const size_t *ld_out_cols;
+ const size_t *ld_out_vls;
+ long unsigned int current_channel, n_channels;
+ };
+
+ Args args = { inptr, ld_in_vl, pad_top, 11u - std::min(11u, pad_top + valid_input_rows), pad_left, weights, valid_input_cols, output_cols, outptrs, outlds, outvllds, start_channel, valid_channels };
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "ldr x3, [%x[args], %[offsetof_Args_pad_bottom]]\n"
+ "ptrue p2.b\n"
+ "mov x20, #0xb\n"
+ "ldr x4, [%x[args], %[offsetof_Args_pad_top]]\n"
+ "ld1rh { z7.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_a_offset]]\n"
+ "sub x20, x20, x3\n"
+ ".inst 0x25207812 // ptrue pn10.b\n"
+ "ldr x5, [%x[args], %[offsetof_Args_n_channels]]\n"
+ "whilelt p1.s, XZR, x5\n"
+ "whilelt p9.s, XZR, x20\n"
+ "ld1rw { z10.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "whilelt p8.s, XZR, x4\n"
+ "addvl SP, SP, #-15\n"
+ "ldr x6, [%x[args], %[offsetof_Args_current_channel]]\n"
+ "neg z7.h, p2/M, z7.h\n"
+ "eor p8.b, p2/Z, p8.b, p9.b\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_mul]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_per_layer_right_shift]]\n"
+ "ld1rw { z5.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z21.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "1:" // Channel loop
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_bias]]\n"
+ "mov z12.s, #0x0\n"
+ "cbz x20, 2f\n"
+ "ld1w { z12.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "2:" // Load bias: Done
+ "ldr x22, [%x[args], %[offsetof_Args_weights]]\n"
+ "mov x20, x22\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "ld1rh { z28.h }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "incw x22\n"
+ "mov z26.h, #0x0\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "trn1 z17.h, z13.h, z22.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "addvl x21, SP, #15\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "trn1 z29.h, z20.h, z1.h\n"
+ "ld1sb { z27.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "sub z27.h, z27.h, z28.h\n"
+ "incw x22\n"
+ "ld1sb { z14.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z14.h, z14.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "trn1 z22.h, z27.h, z26.h\n"
+ "ld1sb { z23.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z23.h, z23.h, z28.h\n"
+ "st1h { z17.h }, p2, [x21]\n"
+ "ld1sb { z30.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z30.h, z30.h, z28.h\n"
+ "trn1 z8.h, z14.h, z18.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "mov x20, x22\n"
+ "st1h { z29.h }, p2, [x21, #1, MUL VL]\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "ld1sb { z20.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z23.h, z23.h, z30.h\n"
+ "sub z20.h, z20.h, z28.h\n"
+ "ld1sb { z24.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z24.h, z24.h, z28.h\n"
+ "st1h { z22.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z0.h, z15.h, z26.h\n"
+ "incw x22\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "ld1sb { z11.s }, p2/Z, [x20]\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "trn1 z27.h, z20.h, z24.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z11.h, z11.h, z28.h\n"
+ "ld1sb { z3.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z23.h }, p2, [x21, #1, MUL VL]\n"
+ "trn1 z20.h, z16.h, z13.h\n"
+ "ld1sb { z13.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z3.h, z3.h, z28.h\n"
+ "ld1sb { z15.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z0.h }, p2, [x21, #2, MUL VL]\n"
+ "trn1 z29.h, z11.h, z26.h\n"
+ "ld1sb { z16.s }, p2/Z, [x20]\n"
+ "incw x22\n"
+ "sub z13.h, z13.h, z28.h\n"
+ "sub z15.h, z15.h, z28.h\n"
+ "addvl x21, x21, #-3\n"
+ "mov x20, x22\n"
+ "st1h { z27.h }, p2, [x21]\n"
+ "sub z16.h, z16.h, z28.h\n"
+ "trn1 z19.h, z22.h, z3.h\n"
+ "ld1sb { z17.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "st1h { z20.h }, p2, [x21, #1, MUL VL]\n"
+ "ld1sb { z0.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z31.h, z13.h, z15.h\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "ld1sb { z18.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "trn1 z16.h, z16.h, z26.h\n"
+ "sub z17.h, z17.h, z28.h\n"
+ "ld1sb { z22.s }, p2/Z, [x20]\n"
+ "incw x20, ALL, MUL #5\n"
+ "sub z0.h, z0.h, z28.h\n"
+ "sub z18.h, z18.h, z28.h\n"
+ "ld1sb { z1.s }, p2/Z, [x20]\n"
+ "sub z22.h, z22.h, z28.h\n"
+ "sub z1.h, z1.h, z28.h\n"
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_muls]]\n"
+ "addvl x21, x21, #-3\n"
+ "st1h { z19.h }, p2, [x21]\n"
+ "mov z13.d, z12.d\n"
+ "mov z14.d, z12.d\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "mov z15.d, z12.d\n"
+ "trn1 z8.h, z17.h, z0.h\n"
+ "st1h { z16.h }, p2, [x21, #2, MUL VL]\n"
+ "addvl x21, x21, #-3\n"
+ "trn1 z31.h, z18.h, z22.h\n"
+ "trn1 z29.h, z1.h, z26.h\n"
+ "st1h { z8.h }, p2, [x21]\n"
+ "st1h { z31.h }, p2, [x21, #1, MUL VL]\n"
+ "st1h { z29.h }, p2, [x21, #2, MUL VL]\n"
+ "cbz x20, 3f\n"
+ "ld1w { z6.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "3:" // Load mul: End
+ "ldr x20, [%x[qp], %[offsetof_Requantize32_per_channel_right_shifts]]\n"
+ "cbz x20, 4f\n"
+ "ld1w { z4.s }, p1/Z, [x20, x6, LSL #2]\n"
+ "4:" // Load right_shift: End
+ "ldr x7, [%x[args], %[offsetof_Args_input_cols]]\n"
+ "sub x20, x7, #0x1\n"
+ "orr x23, x20, %x[ld_in_col], LSL #16\n"
+ "ldr x17, [%x[args], %[offsetof_Args_inptr]]\n"
+ "orr x23, x5, x23, LSL #22\n"
+ "mov x22, #0xb\n"
+ "add x21, x4, x3\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "ldr x16, [%x[args], %[offsetof_Args_output_cols]]\n"
+ "mov x8, #0x0\n"
+ "lsl x23, x23, #0x0\n"
+ "sub x22, x22, x21\n"
+ "madd x20, x20, x4, x17\n"
+ "5:" // Issue prefetches
+ "subs x22, x22, #0x1\n"
+ ".inst 0xf8b74a9c // rprfm pldstrm, x23, [x20]\n"
+ "add x20, x20, %x[ld_in_col]\n"
+ "bgt 5b\n"
+ "ldr x23, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "lsl x20, %x[ld_in_row], #0x0\n"
+ "msub x17, x4, x20, x17\n"
+ ".inst 0xc0040d80 // mova za.d[x8, #0], { z12.d-z15.d }\n"
+ "ldr x20, [%x[args], %[offsetof_Args_ld_out_cols]]\n"
+ ".inst 0xc0040d81 // mova za.d[x8, #1], { z12.d-z15.d }\n"
+ "mov x22, #0x4\n"
+ "ldp x15, x14, [x23], #0x10\n"
+ ".inst 0xc0040d82 // mova za.d[x8, #2], { z12.d-z15.d }\n"
+ "ldp x13, x11, [x20], #0x10\n"
+ ".inst 0xc0040d83 // mova za.d[x8, #3], { z12.d-z15.d }\n"
+ "ldr x21, [%x[args], %[offsetof_Args_pad_left]]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "ldp x10, x9, [x23], #0x10\n"
+ "ldp x28, x27, [x20], #0x10\n"
+ "cbz x21, 7f\n"
+ "cmp x21, x22\n"
+ "csel x20, x21, x22, LT\n"
+ "sub x21, x21, x20\n"
+ "sub x22, x22, x20\n"
+ "cbz x21, 7f\n"
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "and x22, x21, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "add x21, x21, #0x1\n"
+ "lsr x21, x21, #0x1\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ "sub x16, x16, x21\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "6:" // Left padding
+ "subs x21, x21, #0x1\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 6b\n"
+ "7:" // Left padding: End
+ "adds XZR, x4, x3\n"
+ "bne 14f\n"
+ "cbz x22, 12f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 11f\n"
+ "cmp x22, #0x2\n"
+ "beq 10f\n"
+ "cmp x22, #0x3\n"
+ "beq 9f\n"
+ "8:" // Unpadded: 4 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z27.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z0.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z11.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z11.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z8.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z26.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ ".inst 0xa1402a80 // ld1h { z0.h, z8.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p1/Z, [x21]\n"
+ "mov z0.d, z20.d\n"
+ "add z0.h, z0.h, z7.h\n"
+ ".inst 0xc1781788 // sdot za.s[x8, 0], { z28.h-z31.h }, z8.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z8.h\n"
+ "9:" // Unpadded: 3 priming loads
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
+ "addvl x20, SP, #9\n"
+ "ld1b { z17.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z17.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z0.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z1.h, z1.h, z16.h\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z16.s }, p1/Z, [x21]\n"
+ "mov z2.d, z16.d\n"
+ "add z2.h, z2.h, z7.h\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17817e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z8.h\n"
+ "10:" // Unpadded: 2 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z26.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #6\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #12\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z28.h, z28.h, z29.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1b { z29.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z19.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z19.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p1/Z, [x22]\n"
+ "trn1 z30.h, z30.h, z23.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z30.h, z30.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "mov z31.d, z22.d\n"
+ ".inst 0xc1731768 // sdot za.s[x8, 0], { z27.h-z30.h }, z3.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b1769 // sdot za.s[x8, 1], { z27.h-z30.h }, z11.h\n"
+ ".inst 0xc1731788 // sdot za.s[x8, 0], { z28.h-z31.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701789 // sdot za.s[x8, 1], { z28.h-z31.h }, z0.h\n"
+ "11:" // Unpadded: 1 priming loads
+ "add x22, x17, %x[ld_in_row]\n"
+ "ld1b { z29.s }, p1/Z, [x17]\n"
+ "addvl x21, SP, #3\n"
+ "ld1b { z22.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z22.h\n"
+ "add z29.h, z29.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "addvl x20, SP, #9\n"
+ "ld1b { z25.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z30.h, z30.h, z25.h\n"
+ "add z30.h, z30.h, z7.h\n"
+ "ld1b { z31.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ "add z31.h, z31.h, z7.h\n"
+ "ld1b { z0.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "trn1 z0.h, z0.h, z16.h\n"
+ "add z0.h, z0.h, z7.h\n"
+ "ld1b { z1.s }, p1/Z, [x22]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1b { z2.s }, p1/Z, [x22]\n"
+ "trn1 z1.h, z1.h, z2.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "add z1.h, z1.h, z7.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17217a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z2.h\n"
+ "ld1b { z24.s }, p1/Z, [x22]\n"
+ "mov z2.d, z24.d\n"
+ ".inst 0xc17317c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z3.h\n"
+ ".inst 0xa0402a88 // ld1h { z8.h-z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc17817a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z8.h\n"
+ "add z2.h, z2.h, z7.h\n"
+ "ld1h { z3.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17917c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z9.h\n"
+ ".inst 0xc17317e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z3.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17317e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z3.h\n"
+ "12:" // Unpadded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ "sub x7, x7, #0x2\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z25.h\n"
+ "sub x16, x16, #0x1\n"
+ "ld1b { z24.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "lsr x20, x7, #0x1\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z30.h\n"
+ "cmp x20, x16\n"
+ "ld1b { z25.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "csel x26, x20, x16, LT\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z22.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z22.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z22.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "and x7, x7, #0x1\n"
+ "ld1b { z30.s }, p1/Z, [x21]\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z30.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z28.s }, p1/Z, [x21]\n"
+ "mov z28.d, z28.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ "sub x16, x16, x26\n"
+ "cbz x26, 21f\n"
+ "13:" // Unpadded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x25, SP, #6\n"
+ "addvl x24, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b20 // ld1h { z0.h-z1.h }, pn10.b/Z, [x25]\n"
+ "add x23, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "addvl x21, SP, #9\n"
+ "subs x26, x26, #0x1\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ ".inst 0xc17816ea // sdot za.s[x8, 2], { z23.h-z26.h }, z8.h\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x25, #2, MUL VL]\n"
+ ".inst 0xc179170a // sdot za.s[x8, 2], { z24.h-z27.h }, z9.h\n"
+ "ld1b { z16.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z9.h }, p2/Z, [x24, #2, MUL VL]\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z24.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ "ld1b { z18.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z18.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z25.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "ld1b { z8.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z8.h\n"
+ "add z25.h, z25.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z28.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x23]\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "ld1b { z28.s }, p1/Z, [x23]\n"
+ "trn1 z27.h, z27.h, z28.h\n"
+ "add x23, x23, %x[ld_in_row]\n"
+ "add z27.h, z27.h, z7.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ "ld1b { z20.s }, p1/Z, [x23]\n"
+ "mov z28.d, z20.d\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa0 // ld1h { z0.h-z1.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add z28.h, z28.h, z7.h\n"
+ "ld1h { z0.h }, p2/Z, [x22, #2, MUL VL]\n"
+ ".inst 0xc1711709 // sdot za.s[x8, 1], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ ".inst 0xc1701728 // sdot za.s[x8, 0], { z25.h-z28.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1b { z23.s }, p1/Z, [x17]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "ld1b { z24.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ "add x9, x9, x27\n"
+ "ld1b { z25.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z23.h, z23.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ "add z24.h, z24.h, z7.h\n"
+ "ld1b { z26.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "add z25.h, z25.h, z7.h\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ "add z26.h, z26.h, z7.h\n"
+ "ld1b { z27.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "add z27.h, z27.h, z7.h\n"
+ "ld1b { z16.s }, p1/Z, [x20]\n"
+ "mov z28.d, z16.d\n"
+ "add z28.h, z28.h, z7.h\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "bgt 13b\n"
+ "b 21f\n"
+ "14:" // Padded
+ "cbz x22, 19f\n"
+ "cmp x22, #0x1\n"
+ "sub x7, x7, x22\n"
+ "beq 18f\n"
+ "cmp x22, #0x2\n"
+ "beq 17f\n"
+ "cmp x22, #0x3\n"
+ "beq 16f\n"
+ "15:" // Padded: 4 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x17]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x21]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z30.s }, p0/Z, [x21]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z31.s }, p0/Z, [x21]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #12\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z29.h, z29.h, z18.h\n"
+ "trn1 z30.h, z30.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a80 // ld1h { z0.h-z1.h }, pn10.b/Z, [x20]\n"
+ "trn1 z31.h, z31.h, z16.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1b { z20.s }, p0/Z, [x21]\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "mov z0.d, z20.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1711788 // sdot za.s[x8, 0], { z28.h-z31.h }, z1.h\n"
+ "ld1h { z1.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc17117a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z1.h\n"
+ "16:" // Padded: 3 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x21, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x21]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z26.s }, p0/Z, [x21]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x21]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x21]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x21]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z28.s }, p0/Z, [x21]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x21]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x20, SP, #9\n"
+ "add x21, x21, %x[ld_in_row]\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ "trn1 z28.h, z28.h, z16.h\n"
+ ".inst 0xc1721708 // sdot za.s[x8, 0], { z24.h-z27.h }, z2.h\n"
+ "ld1b { z11.s }, p0/Z, [x21]\n"
+ "add z11.h, p0/M, z11.h, z7.h\n"
+ "mov z29.d, z11.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701748 // sdot za.s[x8, 0], { z26.h-z29.h }, z0.h\n"
+ "17:" // Padded: 2 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #6\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa1 // ld1h { z1.h, z9.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #12\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1791748 // sdot za.s[x8, 0], { z26.h-z29.h }, z9.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z1.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1791768 // sdot za.s[x8, 0], { z27.h-z30.h }, z9.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "18:" // Padded: 1 priming loads
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x17]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z25.h, z25.h, z17.h\n"
+ "trn1 z26.h, z26.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "addvl x21, SP, #3\n"
+ "trn1 z27.h, z27.h, z18.h\n"
+ "trn1 z28.h, z28.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z29.h, z29.h, z16.h\n"
+ ".inst 0xc1731728 // sdot za.s[x8, 0], { z25.h-z28.h }, z3.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "addvl x20, SP, #9\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ ".inst 0xc17b1748 // sdot za.s[x8, 0], { z26.h-z29.h }, z11.h\n"
+ ".inst 0xa0402a82 // ld1h { z2.h-z3.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1721729 // sdot za.s[x8, 1], { z25.h-z28.h }, z2.h\n"
+ "mov z30.d, z0.d\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "ld1h { z0.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1731749 // sdot za.s[x8, 1], { z26.h-z29.h }, z3.h\n"
+ ".inst 0xc1701768 // sdot za.s[x8, 0], { z27.h-z30.h }, z0.h\n"
+ "ld1h { z0.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1701769 // sdot za.s[x8, 1], { z27.h-z30.h }, z0.h\n"
+ "19:" // Padded: 0 priming loads
+ "cmp x7, #0x2\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "blt 22f\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z23.h, z23.h, z17.h\n"
+ "trn1 z24.h, z24.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z19.s }, p0/Z, [x20]\n"
+ "add z19.h, p0/M, z19.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z18.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z18.h, p0/M, z18.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "sub x7, x7, #0x2\n"
+ "sub x16, x16, #0x1\n"
+ "trn1 z25.h, z25.h, z19.h\n"
+ "trn1 z26.h, z26.h, z18.h\n"
+ "lsr x20, x7, #0x1\n"
+ "cmp x20, x16\n"
+ "trn1 z27.h, z27.h, z17.h\n"
+ "mov z28.d, z16.d\n"
+ "csel x25, x20, x16, LT\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ "and x7, x7, #0x1\n"
+ "sub x16, x16, x25\n"
+ "cbz x25, 21f\n"
+ "20:" // Padded: Main loop
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa1402b00 // ld1h { z0.h, z8.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17016e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z0.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ "addvl x22, SP, #3\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x21, SP, #9\n"
+ "subs x25, x25, #0x1\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "ld1h { z3.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc173172a // sdot za.s[x8, 2], { z25.h-z28.h }, z3.h\n"
+ "trn1 z23.h, z23.h, z16.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z3.s }, p0/Z, [x20]\n"
+ "add z3.h, p0/M, z3.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z30.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z29.s }, p0/Z, [x20]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ "trn1 z24.h, z24.h, z1.h\n"
+ "trn1 z25.h, z25.h, z3.h\n"
+ "trn1 z26.h, z26.h, z30.h\n"
+ ".inst 0xa0402ac2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x22]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ "trn1 z27.h, z27.h, z29.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17216e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x0\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ ".inst 0xa0402aa2 // ld1h { z2.h-z3.h }, pn10.b/Z, [x21]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17216e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z2.h\n"
+ "ld1b { z23.s }, p0/Z, [x17]\n"
+ "add z23.h, p0/M, z23.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z8.s }, p0/Z, [x20]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc1731709 // sdot za.s[x8, 1], { z24.h-z27.h }, z3.h\n"
+ "ld1b { z24.s }, p0/Z, [x20]\n"
+ "mov z28.d, z20.d\n"
+ "ld1h { z1.h }, p2/Z, [x22, #2, MUL VL]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z22.s }, p0/Z, [x20]\n"
+ ".inst 0xc1711728 // sdot za.s[x8, 0], { z25.h-z28.h }, z1.h\n"
+ "mov x12, #0x4\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ "ld1h { z1.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc1711729 // sdot za.s[x8, 1], { z25.h-z28.h }, z1.h\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z20.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z1.s }, p0/Z, [x20]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "trn1 z23.h, z23.h, z8.h\n"
+ "trn1 z24.h, z24.h, z22.h\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "trn1 z25.h, z25.h, z28.h\n"
+ "trn1 z26.h, z26.h, z20.h\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "trn1 z27.h, z27.h, z31.h\n"
+ "mov z28.d, z1.d\n"
+ "bgt 20b\n"
+ "21:" // Main loop tail
+ ".inst 0xc17316e8 // sdot za.s[x8, 0], { z23.h-z26.h }, z3.h\n"
+ "addvl x24, SP, #6\n"
+ "addvl x23, SP, #12\n"
+ ".inst 0xc17b1708 // sdot za.s[x8, 0], { z24.h-z27.h }, z11.h\n"
+ ".inst 0xa0402b08 // ld1h { z8.h-z9.h }, pn10.b/Z, [x24]\n"
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc17816e9 // sdot za.s[x8, 1], { z23.h-z26.h }, z8.h\n"
+ "add x22, x17, %x[ld_in_row]\n"
+ "addvl x21, SP, #3\n"
+ ".inst 0xc1791709 // sdot za.s[x8, 1], { z24.h-z27.h }, z9.h\n"
+ ".inst 0xa1402ae3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x23]\n"
+ "addvl x20, SP, #9\n"
+ ".inst 0xc17316ea // sdot za.s[x8, 2], { z23.h-z26.h }, z3.h\n"
+ "ld1b { z29.s }, p0/Z, [x17]\n"
+ "add z29.h, p0/M, z29.h, z7.h\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z8.s }, p0/Z, [x22]\n"
+ "add z8.h, p0/M, z8.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ ".inst 0xc17b170a // sdot za.s[x8, 2], { z24.h-z27.h }, z11.h\n"
+ "ld1b { z30.s }, p0/Z, [x22]\n"
+ "add z30.h, p0/M, z30.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc1721728 // sdot za.s[x8, 0], { z25.h-z28.h }, z2.h\n"
+ "ld1h { z0.h }, p2/Z, [x24, #2, MUL VL]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "mov x12, #0x4\n"
+ "ld1b { z20.s }, p0/Z, [x22]\n"
+ ".inst 0xc1701729 // sdot za.s[x8, 1], { z25.h-z28.h }, z0.h\n"
+ "add z20.h, p0/M, z20.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ "ld1h { z2.h }, p2/Z, [x23, #2, MUL VL]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ ".inst 0xc172172a // sdot za.s[x8, 2], { z25.h-z28.h }, z2.h\n"
+ "trn1 z29.h, z29.h, z8.h\n"
+ "ld1b { z31.s }, p0/Z, [x22]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z25.s }, p0/Z, [x22]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z0.s }, p0/Z, [x22]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z17.s }, p0/Z, [x22]\n"
+ "mov x12, #0x8\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z1.s }, p0/Z, [x22]\n"
+ "add z1.h, p0/M, z1.h, z7.h\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z28.s }, p0/Z, [x22]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "trn1 z30.h, z30.h, z20.h\n"
+ "trn1 z31.h, z31.h, z25.h\n"
+ "trn1 z0.h, z0.h, z17.h\n"
+ ".inst 0xa1402aa3 // ld1h { z3.h, z11.h }, pn10.b/Z, [x21]\n"
+ "add x22, x22, %x[ld_in_row]\n"
+ ".inst 0xc0060c18 // mova { z24.d-z27.d }, za.d[x8, #0]\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z1.h, z1.h, z28.h\n"
+ ".inst 0xc17317a8 // sdot za.s[x8, 0], { z29.h-z0.h }, z3.h\n"
+ "ld1b { z22.s }, p0/Z, [x22]\n"
+ ".inst 0xc1a6ac18 // sqdmulh { z24.s-z27.s }, { z24.s-z27.s }, z6.s\n"
+ "add z22.h, p0/M, z22.h, z7.h\n"
+ ".inst 0xc17b17c8 // sdot za.s[x8, 0], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xa1402a83 // ld1h { z3.h, z11.h }, pn10.b/Z, [x20]\n"
+ "add x17, x17, %x[ld_in_col]\n"
+ ".inst 0xc1a4aa38 // srshl { z24.s-z27.s }, { z24.s-z27.s }, z4.s\n"
+ ".inst 0xc17317a9 // sdot za.s[x8, 1], { z29.h-z0.h }, z3.h\n"
+ "mov z2.d, z22.d\n"
+ "ld1h { z9.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc17b17c9 // sdot za.s[x8, 1], { z30.h-z1.h }, z11.h\n"
+ ".inst 0xc1aaab18 // add { z24.s-z27.s }, { z24.s-z27.s }, z10.s\n"
+ ".inst 0xc17917e8 // sdot za.s[x8, 0], { z31.h-z2.h }, z9.h\n"
+ "ld1h { z8.h }, p2/Z, [x20, #2, MUL VL]\n"
+ ".inst 0xc1b5ccb8 // sclamp { z24.s-z27.s }, z5.s, z21.s\n"
+ "st1b { z24.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z25.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xa1402be3 // ld1h { z3.h, z11.h }, pn10.b/Z, [SP]\n"
+ "st1b { z26.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ ".inst 0xc17817e9 // sdot za.s[x8, 1], { z31.h-z2.h }, z8.h\n"
+ "ld1h { z2.h }, p2/Z, [SP, #2, MUL VL]\n"
+ "st1b { z27.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "22:" // Main loop skip tail
+ "cbz x7, 23f\n" // Skip remainder inputs
+ "mov x12, #0x0\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z24.s }, p0/Z, [x17]\n"
+ "add z24.h, p0/M, z24.h, z7.h\n"
+ "add x20, x17, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z25.s }, p0/Z, [x20]\n"
+ "add z25.h, p0/M, z25.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "mov x12, #0x4\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ "trn1 z24.h, z24.h, z17.h\n"
+ "trn1 z25.h, z25.h, z16.h\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z26.s }, p0/Z, [x20]\n"
+ "add z26.h, p0/M, z26.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z17.s }, p0/Z, [x20]\n"
+ "add z17.h, p0/M, z17.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "ld1b { z27.s }, p0/Z, [x20]\n"
+ "add z27.h, p0/M, z27.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25f04500 // psel p0.s, p1.s/Z, p8.s[w12, #3]\n"
+ "ld1b { z16.s }, p0/Z, [x20]\n"
+ "mov x12, #0x8\n"
+ "add z16.h, p0/M, z16.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25304500 // psel p0.s, p1.s/Z, p8.s[w12]\n"
+ "ld1b { z28.s }, p0/Z, [x20]\n"
+ "add z28.h, p0/M, z28.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25704500 // psel p0.s, p1.s/Z, p8.s[w12, #1]\n"
+ "ld1b { z31.s }, p0/Z, [x20]\n"
+ "add z31.h, p0/M, z31.h, z7.h\n"
+ "add x20, x20, %x[ld_in_row]\n"
+ ".inst 0x25b04500 // psel p0.s, p1.s/Z, p8.s[w12, #2]\n"
+ "trn1 z26.h, z26.h, z17.h\n"
+ "trn1 z27.h, z27.h, z16.h\n"
+ "ld1b { z0.s }, p0/Z, [x20]\n"
+ "add z0.h, p0/M, z0.h, z7.h\n"
+ "trn1 z28.h, z28.h, z31.h\n"
+ "addvl x21, SP, #6\n"
+ ".inst 0xc1731708 // sdot za.s[x8, 0], { z24.h-z27.h }, z3.h\n"
+ "mov z29.d, z0.d\n"
+ "addvl x20, SP, #12\n"
+ "sub x16, x16, #0x1\n"
+ ".inst 0xc17b1728 // sdot za.s[x8, 0], { z25.h-z28.h }, z11.h\n"
+ ".inst 0xa0402aa8 // ld1h { z8.h-z9.h }, pn10.b/Z, [x21]\n"
+ ".inst 0xc1721748 // sdot za.s[x8, 0], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z2.h }, p2/Z, [x21, #2, MUL VL]\n"
+ ".inst 0xc1781709 // sdot za.s[x8, 1], { z24.h-z27.h }, z8.h\n"
+ ".inst 0xc0060c10 // mova { z16.d-z19.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac10 // sqdmulh { z16.s-z19.s }, { z16.s-z19.s }, z6.s\n"
+ ".inst 0xc1791729 // sdot za.s[x8, 1], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xa1402a81 // ld1h { z1.h, z9.h }, pn10.b/Z, [x20]\n"
+ ".inst 0xc1a4aa30 // srshl { z16.s-z19.s }, { z16.s-z19.s }, z4.s\n"
+ ".inst 0xc171170a // sdot za.s[x8, 2], { z24.h-z27.h }, z1.h\n"
+ ".inst 0xc1aaab10 // add { z16.s-z19.s }, { z16.s-z19.s }, z10.s\n"
+ ".inst 0xc179172a // sdot za.s[x8, 2], { z25.h-z28.h }, z9.h\n"
+ ".inst 0xc1b5ccb0 // sclamp { z16.s-z19.s }, z5.s, z21.s\n"
+ "st1b { z16.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ ".inst 0xc1721749 // sdot za.s[x8, 1], { z26.h-z29.h }, z2.h\n"
+ "ld1h { z3.h }, p2/Z, [x20, #2, MUL VL]\n"
+ "st1b { z17.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ ".inst 0xc173174a // sdot za.s[x8, 2], { z26.h-z29.h }, z3.h\n"
+ "add x8, x8, #0x1\n"
+ "st1b { z18.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z19.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ "23:" // Tail input: End
+ "cbz x16, 25f\n"
+ "24:" // Right padding loop
+ ".inst 0xc0060c1c // mova { z28.d-z31.d }, za.d[x8, #0]\n"
+ ".inst 0xc1a6ac1c // sqdmulh { z28.s-z31.s }, { z28.s-z31.s }, z6.s\n"
+ "add x8, x8, #0x1\n"
+ ".inst 0xc1a4aa3c // srshl { z28.s-z31.s }, { z28.s-z31.s }, z4.s\n"
+ "subs x16, x16, #0x1\n"
+ ".inst 0xc0040d84 // mova za.d[x8, #4], { z12.d-z15.d }\n"
+ ".inst 0xc1aaab1c // add { z28.s-z31.s }, { z28.s-z31.s }, z10.s\n"
+ ".inst 0xc1b5ccbc // sclamp { z28.s-z31.s }, z5.s, z21.s\n"
+ "st1b { z28.s }, p1, [x15]\n"
+ "add x15, x15, x13\n"
+ "st1b { z29.s }, p1, [x14]\n"
+ "add x14, x14, x11\n"
+ "st1b { z30.s }, p1, [x10]\n"
+ "add x10, x10, x28\n"
+ "st1b { z31.s }, p1, [x9]\n"
+ "add x9, x9, x27\n"
+ "bgt 24b\n"
+ "25:" // End
+ "ldr x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "incw x20, ALL, MUL #16\n"
+ "incw x20, ALL, MUL #9\n"
+ "str x20, [%x[args], %[offsetof_Args_weights]]\n"
+ "ldr x21, [%x[args], %[offsetof_Args_ld_in_vl]]\n"
+ "incw x6\n"
+ "whilelt p1.s, x6, x5\n"
+ "ldr x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "add x20, x20, x21\n"
+ "str x20, [%x[args], %[offsetof_Args_inptr]]\n"
+ "ldr x25, [%x[args], %[offsetof_Args_outptrs]]\n"
+ "ldr x24, [%x[args], %[offsetof_Args_ld_out_vls]]\n"
+ "ldp x23, x22, [x25, #0x0]\n"
+ "ldp x21, x20, [x24, #0x0]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x0]\n"
+ "ldp x23, x22, [x25, #0x10]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x23, x23, x21\n"
+ "add x22, x22, x20\n"
+ "stp x23, x22, [x25, #0x10]\n"
+ "b.any 1b\n"
+ "addvl SP, SP, #15\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [ld_in_col] "r" (ld_in_col), [ld_in_row] "r" (ld_in_row), [offsetof_Args_current_channel] "I" (offsetof(Args, current_channel)), [offsetof_Args_inptr] "I" (offsetof(Args, inptr)), [offsetof_Args_input_cols] "I" (offsetof(Args, input_cols)), [offsetof_Args_ld_in_vl] "I" (offsetof(Args, ld_in_vl)), [offsetof_Args_ld_out_cols] "I" (offsetof(Args, ld_out_cols)), [offsetof_Args_ld_out_vls] "I" (offsetof(Args, ld_out_vls)), [offsetof_Args_n_channels] "I" (offsetof(Args, n_channels)), [offsetof_Args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_Args_output_cols] "I" (offsetof(Args, output_cols)), [offsetof_Args_pad_bottom] "I" (offsetof(Args, pad_bottom)), [offsetof_Args_pad_left] "I" (offsetof(Args, pad_left)), [offsetof_Args_pad_top] "I" (offsetof(Args, pad_top)), [offsetof_Args_weights] "I" (offsetof(Args, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_bias] "I" (offsetof(arm_gemm::Requantize32, bias)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [offsetof_Requantize32_per_channel_muls] "I" (offsetof(arm_gemm::Requantize32, per_channel_muls)), [offsetof_Requantize32_per_channel_right_shifts] "I" (offsetof(arm_gemm::Requantize32, per_channel_right_shifts)), [offsetof_Requantize32_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_Requantize32_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME2)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..edee21e941
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;  // reads inputs through an array of row/col pointers
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;  // walks a dense input tensor via ld_input_row/ld_input_col strides
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;  // vector-length-agnostic SVE kernel
+
+ constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;  // unit stride in both spatial dimensions
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;  // each kernel invocation produces a 2x2 output tile
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)  // CPUInfo unused: kernel shape is fixed at compile time
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..d807856ccb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args  // argument block marshalled for the inline asm; fields are read via offsetof() byte offsets
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;  // activation clamp bounds applied via fmax/fmin below
+
+ uint64_t tile_i = 0, tile_j = 0;  // current tile coordinates; updated by the asm each iteration of the tile loop
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,  // NOTE(review): float parameters narrowing into __fp16 members — matches the upstream generator's output; confirm intentional
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(  // SVE kernel: per tile, 3x3 fp16 MLA accumulated into a 2x2 output tile, then clamped to [min, max]
+ "ptrue p3.b\n" // p3: all-true predicate used for weight/parameter loads
+ "mov x10, #0x0\n" // tile_i = 0
+ "mov x14, #0x0\n" // tile_j = 0
+ "1:" // Tile loop
+ "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x10, x23\n" // offset = tile_i * ld_input_row
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x10, x22\n" // offset = tile_i * ld_output_row
+ "cnth x11\n" // x11 = number of fp16 lanes per vector
+ "madd x21, x14, x13, x21\n" // offset += tile_j * ld_input_col
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n" // p2: active lanes for the current channel chunk
+ "madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
+ "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "ld1h { z27.h }, p3/Z, [x10]\n"
+ "add x27, x13, x13\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x9, x9, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "add x26, x9, x23, LSL #1\n"
+ "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "add x25, x26, x23, LSL #1\n"
+ "add x24, x27, x13\n"
+ "ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "add x28, x28, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cmp x11, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #1\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "add x22, x28, x22, LSL #1\n"
+ "mov x21, #0x0\n"
+ "ld1h { z8.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x26, x13, LSL #1]\n"
+ "sub x20, XZR, x11\n"
+ "ld1h { z10.h }, p2/Z, [x9]\n"
+ "ld1h { z11.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "addvl x10, x10, #-6\n"
+ "ld1h { z12.h }, p2/Z, [x26, x27, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x25, x13, LSL #1]\n"
+ "bge 3f\n" // fewer channels than one full vector -> straight to the predicated tail
+ "2:" // Tile loop: Channel loop
+ "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "whilelt p1.h, x11, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "inch x11\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "mov p0.b, p2.b\n"
+ "ld1h { z27.h }, p3/Z, [x10]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, x13, LSL #1]\n"
+ "inch x20\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x24, LSL #1]\n"
+ "addvl x26, x26, #1\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25]\n"
+ "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "addvl x25, x25, #1\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "ld1h { z13.h }, p1/Z, [x25, x13, LSL #1]\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n" // clamp accumulators to activation_min
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "ld1h { z6.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "ld1h { z9.h }, p1/Z, [x26, x13, LSL #1]\n"
+ "cmp x11, %x[n_channels]\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n" // clamp accumulators to activation_max
+ "ld1h { z10.h }, p1/Z, [x9]\n"
+ "ld1h { z11.h }, p1/Z, [x9, x24, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "ld1h { z12.h }, p1/Z, [x26, x27, LSL #1]\n"
+ "st1h { z24.h }, p0, [x28]\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "addvl x23, x23, #1\n"
+ "st1h { z23.h }, p0, [x28, x12, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "addvl x28, x28, #1\n"
+ "ld1h { z8.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "addvl x10, x10, #-6\n"
+ "st1h { z21.h }, p0, [x22, x12, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n" // loop while full vectors of channels remain
+ "3:" // Tile loop: Channel tail
+ "movprfx z24, z27\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z27\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z22, z27\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z27\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z20.h }, p2/Z, [x25, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "add x14, x14, #0x1\n" // tile_j += 1
+ "cmp x14, x20\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, x13, LSL #1]\n"
+ "add x21, x10, #0x1\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, x27, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "csel x10, x10, x21, LT\n" // if tile_j wrapped, advance tile_i
+ "mov p0.b, p2.b\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26]\n"
+ "csel x14, x14, XZR, LT\n" // if tile_j wrapped, reset it to zero
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x26, x24, LSL #1]\n"
+ "cmp x10, x20\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x25]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x13, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n" // clamp to activation_min
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n" // clamp to activation_max
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x28]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x28, x12, LSL #1]\n"
+ "st1h { z22.h }, p0, [x22]\n"
+ "st1h { z21.h }, p0, [x22, x12, LSL #1]\n"
+ "blt 1b\n" // next tile while tile_i < n_tile_rows
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..90982b6990
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cnth x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "sub x28, XZR, x14\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "ld1h { z9.h }, p2/Z, [x24, x9, LSL #1]\n"
+ "addvl x16, x16, #-6\n"
+ "ld1h { z10.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x50]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x58]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "ldr x20, [x15, #0x60]\n"
+ "ldr x27, [x15, #0x68]\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x26, [x15, #0x70]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x25, [x15, #0x78]\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "whilelt p1.h, x14, %x[n_channels]\n"
+ "ldp x24, x23, [x15, #0x0]\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldp x22, x21, [x15, #0x10]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "ldr x20, [x15, #0x20]\n"
+ "ld1h { z13.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x27, x9, LSL #1]\n"
+ "inch x28\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "mov p0.b, p2.b\n"
+ "ld1h { z20.h }, p3/Z, [x16]\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x9, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x25, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "inch x9\n"
+ "ld1h { z11.h }, p1/Z, [x22, x14, LSL #1]\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "ld1h { z9.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "ld1h { z12.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "inch x14\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+ "whilelt p2.h, x9, %x[n_channels]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z24, z20\n fmla z24.h, p3/M, z4.h, z9.h\n"
+ "movprfx z23, z20\n fmla z23.h, p3/M, z3.h, z9.h\n"
+ "ldr x21, [x15, #0x28]\n"
+ "ldr x20, [x15, #0x30]\n"
+ "movprfx z22, z20\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x22, [x15, #0x38]\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "fmla z23.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x48]\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z21.h, p3/M, z1.h, z12.h\n"
+ "ldr x20, [x15, #0x40]\n"
+ "ld1h { z20.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z24.h, p3/M, z5.h, z12.h\n"
+ "fmla z23.h, p3/M, z4.h, z12.h\n"
+ "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x50]\n"
+ "fmla z22.h, p3/M, z6.h, z18.h\n"
+ "fmla z21.h, p3/M, z3.h, z13.h\n"
+ "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x58]\n"
+ "fmla z24.h, p3/M, z7.h, z13.h\n"
+ "fmla z23.h, p3/M, z6.h, z13.h\n"
+ "ldr x23, [x15, #0x60]\n"
+ "ldr x22, [x15, #0x68]\n"
+ "fmla z22.h, p3/M, z4.h, z13.h\n"
+ "fmla z21.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "ldr x21, [x15, #0x70]\n"
+ "fmla z24.h, p3/M, z1.h, z16.h\n"
+ "fmla z23.h, p3/M, z0.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "ldr x20, [x15, #0x78]\n"
+ "fmla z22.h, p3/M, z5.h, z20.h\n"
+ "fmla z21.h, p3/M, z4.h, z20.h\n"
+ "inch x28\n"
+ "mov p0.b, p2.b\n"
+ "fmla z24.h, p3/M, z2.h, z18.h\n"
+ "fmla z23.h, p3/M, z1.h, z18.h\n"
+ "ld1h { z19.h }, p2/Z, [x23, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z0.h, z17.h\n"
+ "fmla z21.h, p3/M, z2.h, z16.h\n"
+ "fmla z24.h, p3/M, z8.h, z20.h\n"
+ "fmla z23.h, p3/M, z7.h, z20.h\n"
+ "ld1h { z18.h }, p2/Z, [x22, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z3.h, z19.h\n"
+ "fmla z21.h, p3/M, z5.h, z18.h\n"
+ "fmla z24.h, p3/M, z3.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+ "fmla z23.h, p3/M, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+ "fmla z22.h, p3/M, z7.h, z17.h\n"
+ "fmla z21.h, p3/M, z6.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "fmla z23.h, p3/M, z8.h, z18.h\n"
+ "fmax z24.h, p3/M, z24.h, z26.h\n"
+ "fmax z23.h, p3/M, z23.h, z26.h\n"
+ "fmla z22.h, p3/M, z8.h, z16.h\n"
+ "fmla z21.h, p3/M, z7.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z26.h\n"
+ "fmax z21.h, p3/M, z21.h, z26.h\n"
+ "fmin z24.h, p3/M, z24.h, z25.h\n"
+ "fmin z23.h, p3/M, z23.h, z25.h\n"
+ "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z25.h\n"
+ "fmin z21.h, p3/M, z21.h, z25.h\n"
+ "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+ "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..da2ef72a30
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);  // Kernel entry taking per-point input/output pointer arrays.
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);  // Kernel entry taking base pointers plus row/column strides for a grid of tiles.
+
+class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>  // Strategy descriptor binding the two SVE fp16 3x3/s1 kernels above.
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;  // Used when input rows are addressed via pointer tables.
+  Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;  // Used when input is a dense strided tensor.
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;  // Scalable-vector kernel; channel count handled by predication.
+
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter,
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride,
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 3;  // producing a 3x3 output tile per call.
+  constexpr static unsigned int output_cols = 3;
+
+  sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..a22ab39d6f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{  // Direct (dense-strided) variant: walks an n_tile_rows x n_tile_cols grid, computing a 3x3 fp16 output tile per iteration with predicated SVE FMLAs.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;  // Activation clamp bounds, broadcast into z31/z30 by the asm (fmax/fmin).
+
+    uint64_t tile_i = 0, tile_j = 0;  // Tile coordinates; the asm stores and reloads these across iterations of the tile loop.
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max  // NOTE(review): float parameters feeding __fp16 members — lossless for __fp16 arguments; confirm intended.
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+      ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+      ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+      params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  __asm__ __volatile__(  // Label 1: tile loop; label 2: per-tile channel loop (vector-length chunks, predicates p2/p1 mask the channel tail); label 3: channel tail.
+    "ptrue p3.b\n"
+    "mov x13, #0x0\n"
+    "mov x8, #0x0\n"
+    "1:" // Tile loop
+    "str x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x25, #0x3\n"
+    "mov x24, #0x3\n"
+    "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mul x22, x13, x23\n" // offset = tile_i * ld_input_row
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+    "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "cnth x15\n"
+    "mul x20, x13, x21\n" // offset = tile_i * ld_output_row
+    "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x12, x17, x17\n"
+    "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+    "add x14, x14, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+    "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x10, x14, x23, LSL #1\n"
+    "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+    "add x9, x10, x23, LSL #1\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z14.h }, p3/Z, [x13]\n"
+    "mul x20, x20, x24\n" // offset *= output_tile_size
+    "ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
+    "ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
+    "add x28, x9, x23, LSL #1\n"
+    "ld1h { z2.h }, p3/Z, [x13, #3, MUL VL]\n"
+    "ld1h { z3.h }, p3/Z, [x13, #4, MUL VL]\n"
+    "add x27, x12, x17\n"
+    "add x11, x11, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+    "ld1h { z4.h }, p3/Z, [x13, #5, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x13, #6, MUL VL]\n"
+    "add x26, x28, x23, LSL #1\n"
+    "add x25, x27, x17\n"
+    "ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
+    "addvl x13, x13, #16\n"
+    "add x24, x11, x21, LSL #1\n"
+    "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "cmp x15, %x[n_channels]\n"
+    "add x23, x24, x21, LSL #1\n"
+    "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
+    "add x22, x16, x16\n"
+    "mov x21, #0x0\n"
+    "ld1h { z8.h }, p3/Z, [x13, #-7, MUL VL]\n"
+    "ld1h { z9.h }, p2/Z, [x9, x12, LSL #1]\n"
+    "sub x20, XZR, x15\n"
+    "ld1h { z10.h }, p2/Z, [x14]\n"
+    "ld1h { z11.h }, p2/Z, [x14, x25, LSL #1]\n"
+    "addvl x13, x13, #-6\n"
+    "ld1h { z12.h }, p2/Z, [x26]\n"
+    "ld1h { z13.h }, p2/Z, [x10, x12, LSL #1]\n"
+    "bge 3f\n"
+    "2:" // Tile loop: Channel loop
+    "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
+    "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
+    "whilelt p1.h, x15, %x[n_channels]\n"
+    "inch x21\n"
+    "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+    "fmla z29.h, p3/M, z4.h, z13.h\n"
+    "inch x15\n"
+    "mov p0.b, p2.b\n"
+    "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+    "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+    "inch x20\n"
+    "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z23.h }, p2/Z, [x9, x27, LSL #1]\n"
+    "fmla z27.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z18.h }, p2/Z, [x9, x17, LSL #1]\n"
+    "movprfx z22, z14\n fmla z22.h, p3/M, z2.h, z9.h\n"
+    "fmla z29.h, p3/M, z6.h, z18.h\n"
+    "movprfx z21, z14\n fmla z21.h, p3/M, z0.h, z9.h\n"
+    "fmla z28.h, p3/M, z5.h, z13.h\n"
+    "fmla z27.h, p3/M, z3.h, z13.h\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "fmla z25.h, p3/M, z1.h, z13.h\n"
+    "fmla z24.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z22.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z16.h }, p2/Z, [x26, x25, LSL #1]\n"
+    "movprfx z20, z14\n fmla z20.h, p3/M, z1.h, z9.h\n"
+    "fmla z29.h, p3/M, z0.h, z17.h\n"
+    "ld1h { z14.h }, p3/Z, [x13]\n"
+    "fmla z21.h, p3/M, z8.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x14, x27, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z18.h\n"
+    "fmla z20.h, p3/M, z0.h, z18.h\n"
+    "fmla z26.h, p3/M, z4.h, z18.h\n"
+    "fmla z25.h, p3/M, z3.h, z18.h\n"
+    "fmla z22.h, p3/M, z1.h, z18.h\n"
+    "ld1h { z19.h }, p2/Z, [x10]\n"
+    "fmla z29.h, p3/M, z2.h, z16.h\n"
+    "fmla z27.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x28]\n"
+    "fmla z24.h, p3/M, z4.h, z23.h\n"
+    "fmla z28.h, p3/M, z1.h, z17.h\n"
+    "ld1h { z16.h }, p2/Z, [x10, x25, LSL #1]\n"
+    "fmla z20.h, p3/M, z2.h, z23.h\n"
+    "fmla z21.h, p3/M, z1.h, z23.h\n"
+    "fmla z29.h, p3/M, z8.h, z23.h\n"
+    "fmla z27.h, p3/M, z7.h, z23.h\n"
+    "fmla z25.h, p3/M, z5.h, z23.h\n"
+    "fmla z26.h, p3/M, z0.h, z19.h\n"
+    "ld1h { z17.h }, p2/Z, [x28, x12, LSL #1]\n"
+    "fmla z22.h, p3/M, z3.h, z18.h\n"
+    "fmla z24.h, p3/M, z2.h, z16.h\n"
+    "fmla z20.h, p3/M, z4.h, z17.h\n"
+    "fmla z21.h, p3/M, z3.h, z17.h\n"
+    "fmla z28.h, p3/M, z3.h, z19.h\n"
+    "fmla z27.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z19.h }, p2/Z, [x28, x25, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x26, x17, LSL #1]\n"
+    "fmla z26.h, p3/M, z6.h, z18.h\n"
+    "fmla z25.h, p3/M, z7.h, z17.h\n"
+    "ld1h { z18.h }, p2/Z, [x10, x17, LSL #1]\n"
+    "fmla z22.h, p3/M, z5.h, z17.h\n"
+    "fmla z24.h, p3/M, z6.h, z17.h\n"
+    "fmla z21.h, p3/M, z5.h, z19.h\n"
+    "fmla z20.h, p3/M, z6.h, z16.h\n"
+    "fmla z26.h, p3/M, z8.h, z17.h\n"
+    "fmla z22.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x26, x27, LSL #1]\n"
+    "fmla z29.h, p3/M, z3.h, z18.h\n"
+    "fmla z25.h, p3/M, z0.h, z18.h\n"
+    "fmla z24.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x10, x27, LSL #1]\n"
+    "fmla z20.h, p3/M, z8.h, z17.h\n"
+    "addvl x10, x10, #1\n"
+    "fmla z21.h, p3/M, z7.h, z17.h\n"
+    "fmla z28.h, p3/M, z4.h, z18.h\n"
+    "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+    "fmla z26.h, p3/M, z1.h, z18.h\n"
+    "fmla z29.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x28, x17, LSL #1]\n"
+    "addvl x28, x28, #1\n"
+    "fmla z27.h, p3/M, z4.h, z16.h\n"
+    "fmla z25.h, p3/M, z2.h, z16.h\n"
+    "fmla z24.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x14, x12, LSL #1]\n"
+    "fmla z22.h, p3/M, z4.h, z17.h\n"
+    "addvl x14, x14, #1\n"
+    "fmla z20.h, p3/M, z3.h, z17.h\n"
+    "fmla z21.h, p3/M, z4.h, z19.h\n"
+    "ld1h { z4.h }, p3/Z, [x13, #5, MUL VL]\n"
+    "ld1h { z10.h }, p1/Z, [x14]\n"
+    "fmla z26.h, p3/M, z7.h, z17.h\n"
+    "fmla z25.h, p3/M, z6.h, z17.h\n"
+    "ld1h { z18.h }, p2/Z, [x9]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "fmla z29.h, p3/M, z1.h, z16.h\n"
+    "fmax z29.h, p3/M, z29.h, z31.h\n"
+    "ld1h { z1.h }, p3/Z, [x13, #2, MUL VL]\n"
+    "fmla z27.h, p3/M, z0.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x9, x25, LSL #1]\n"
+    "fmla z24.h, p3/M, z7.h, z19.h\n"
+    "addvl x9, x9, #1\n"
+    "fmla z20.h, p3/M, z5.h, z19.h\n"
+    "fmla z22.h, p3/M, z0.h, z18.h\n"
+    "ld1h { z0.h }, p3/Z, [x13, #1, MUL VL]\n"
+    "fmin z29.h, p3/M, z29.h, z30.h\n"
+    "fmla z21.h, p3/M, z2.h, z17.h\n"
+    "fmla z25.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
+    "fmax z25.h, p3/M, z25.h, z31.h\n"
+    "fmla z28.h, p3/M, z6.h, z18.h\n"
+    "fmla z26.h, p3/M, z3.h, z18.h\n"
+    "fmax z28.h, p3/M, z28.h, z31.h\n"
+    "fmax z26.h, p3/M, z26.h, z31.h\n"
+    "fmla z27.h, p3/M, z8.h, z17.h\n"
+    "fmla z24.h, p3/M, z5.h, z17.h\n"
+    "fmax z27.h, p3/M, z27.h, z31.h\n"
+    "fmax z24.h, p3/M, z24.h, z31.h\n"
+    "fmla z22.h, p3/M, z8.h, z16.h\n"
+    "fmla z20.h, p3/M, z7.h, z16.h\n"
+    "fmax z22.h, p3/M, z22.h, z31.h\n"
+    "fmax z20.h, p3/M, z20.h, z31.h\n"
+    "fmla z21.h, p3/M, z6.h, z16.h\n"
+    "fmax z21.h, p3/M, z21.h, z31.h\n"
+    "addvl x26, x26, #1\n"
+    "ld1h { z2.h }, p3/Z, [x13, #3, MUL VL]\n"
+    "ld1h { z3.h }, p3/Z, [x13, #4, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x13, #6, MUL VL]\n"
+    "whilelt p2.h, x21, %x[n_channels]\n"
+    "cmp x15, %x[n_channels]\n"
+    "ld1h { z6.h }, p3/Z, [x13, #7, MUL VL]\n"
+    "addvl x13, x13, #16\n"
+    "fmin z28.h, p3/M, z28.h, z30.h\n"
+    "ld1h { z9.h }, p1/Z, [x9, x12, LSL #1]\n"
+    "fmin z27.h, p3/M, z27.h, z30.h\n"
+    "fmin z26.h, p3/M, z26.h, z30.h\n"
+    "ld1h { z11.h }, p1/Z, [x14, x25, LSL #1]\n"
+    "ld1h { z12.h }, p1/Z, [x26]\n"
+    "fmin z25.h, p3/M, z25.h, z30.h\n"
+    "fmin z24.h, p3/M, z24.h, z30.h\n"
+    "ld1h { z13.h }, p1/Z, [x10, x12, LSL #1]\n"
+    "st1h { z28.h }, p0, [x11]\n"
+    "fmin z22.h, p3/M, z22.h, z30.h\n"
+    "fmin z20.h, p3/M, z20.h, z30.h\n"
+    "st1h { z29.h }, p0, [x11, x16, LSL #1]\n"
+    "ld1h { z7.h }, p3/Z, [x13, #-8, MUL VL]\n"
+    "fmin z21.h, p3/M, z21.h, z30.h\n"
+    "st1h { z27.h }, p0, [x11, x22, LSL #1]\n"
+    "addvl x11, x11, #1\n"
+    "ld1h { z8.h }, p3/Z, [x13, #-7, MUL VL]\n"
+    "st1h { z26.h }, p0, [x24]\n"
+    "addvl x13, x13, #-6\n"
+    "st1h { z25.h }, p0, [x24, x16, LSL #1]\n"
+    "st1h { z24.h }, p0, [x24, x22, LSL #1]\n"
+    "addvl x24, x24, #1\n"
+    "st1h { z22.h }, p0, [x23]\n"
+    "st1h { z20.h }, p0, [x23, x16, LSL #1]\n"
+    "st1h { z21.h }, p0, [x23, x22, LSL #1]\n"
+    "addvl x23, x23, #1\n"
+    "blt 2b\n"
+    "3:" // Tile loop: Channel tail
+    "movprfx z29, z14\n fmla z29.h, p3/M, z7.h, z9.h\n"
+    "movprfx z28, z14\n fmla z28.h, p3/M, z8.h, z9.h\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+    "fmla z29.h, p3/M, z4.h, z13.h\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "add x8, x8, #0x1\n"
+    "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+    "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+    "cmp x8, x20\n"
+    "add x21, x13, #0x1\n"
+    "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+    "fmla z28.h, p3/M, z0.h, z10.h\n"
+    "ld1h { z23.h }, p2/Z, [x9, x27, LSL #1]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z27.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z18.h }, p2/Z, [x9, x17, LSL #1]\n"
+    "movprfx z22, z14\n fmla z22.h, p3/M, z2.h, z9.h\n"
+    "csel x13, x13, x21, LT\n"
+    "fmla z29.h, p3/M, z6.h, z18.h\n"
+    "movprfx z21, z14\n fmla z21.h, p3/M, z0.h, z9.h\n"
+    "mov p0.b, p2.b\n"
+    "csel x8, x8, XZR, LT\n"
+    "fmla z28.h, p3/M, z5.h, z13.h\n"
+    "fmla z27.h, p3/M, z3.h, z13.h\n"
+    "cmp x13, x20\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "fmla z25.h, p3/M, z1.h, z13.h\n"
+    "fmla z24.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x14, x17, LSL #1]\n"
+    "fmla z22.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z16.h }, p2/Z, [x26, x25, LSL #1]\n"
+    "movprfx z20, z14\n fmla z20.h, p3/M, z1.h, z9.h\n"
+    "fmla z29.h, p3/M, z0.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x14, x27, LSL #1]\n"
+    "fmla z28.h, p3/M, z7.h, z18.h\n"
+    "fmla z20.h, p3/M, z0.h, z18.h\n"
+    "fmla z26.h, p3/M, z4.h, z18.h\n"
+    "fmla z25.h, p3/M, z3.h, z18.h\n"
+    "fmla z22.h, p3/M, z1.h, z18.h\n"
+    "ld1h { z19.h }, p2/Z, [x10]\n"
+    "fmla z29.h, p3/M, z2.h, z16.h\n"
+    "fmla z27.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x28]\n"
+    "fmla z24.h, p3/M, z4.h, z23.h\n"
+    "fmla z28.h, p3/M, z1.h, z17.h\n"
+    "ld1h { z16.h }, p2/Z, [x10, x25, LSL #1]\n"
+    "fmla z20.h, p3/M, z2.h, z23.h\n"
+    "fmla z21.h, p3/M, z1.h, z23.h\n"
+    "fmla z29.h, p3/M, z8.h, z23.h\n"
+    "fmla z27.h, p3/M, z7.h, z23.h\n"
+    "fmla z25.h, p3/M, z5.h, z23.h\n"
+    "fmla z26.h, p3/M, z0.h, z19.h\n"
+    "ld1h { z17.h }, p2/Z, [x28, x12, LSL #1]\n"
+    "fmla z22.h, p3/M, z3.h, z18.h\n"
+    "fmla z24.h, p3/M, z2.h, z16.h\n"
+    "fmla z20.h, p3/M, z4.h, z17.h\n"
+    "fmla z21.h, p3/M, z3.h, z17.h\n"
+    "fmla z28.h, p3/M, z3.h, z19.h\n"
+    "fmla z27.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z19.h }, p2/Z, [x28, x25, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x26, x17, LSL #1]\n"
+    "fmla z26.h, p3/M, z6.h, z18.h\n"
+    "fmla z25.h, p3/M, z7.h, z17.h\n"
+    "ld1h { z18.h }, p2/Z, [x10, x17, LSL #1]\n"
+    "fmla z22.h, p3/M, z5.h, z17.h\n"
+    "fmla z24.h, p3/M, z6.h, z17.h\n"
+    "fmla z21.h, p3/M, z5.h, z19.h\n"
+    "fmla z20.h, p3/M, z6.h, z16.h\n"
+    "fmla z26.h, p3/M, z8.h, z17.h\n"
+    "fmla z22.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x26, x27, LSL #1]\n"
+    "fmla z29.h, p3/M, z3.h, z18.h\n"
+    "fmla z25.h, p3/M, z0.h, z18.h\n"
+    "fmla z24.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x10, x27, LSL #1]\n"
+    "fmla z20.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z7.h, z17.h\n"
+    "fmla z28.h, p3/M, z4.h, z18.h\n"
+    "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+    "fmla z26.h, p3/M, z1.h, z18.h\n"
+    "fmla z29.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x28, x17, LSL #1]\n"
+    "fmla z27.h, p3/M, z4.h, z16.h\n"
+    "fmla z25.h, p3/M, z2.h, z16.h\n"
+    "fmla z24.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x14, x12, LSL #1]\n"
+    "fmla z22.h, p3/M, z4.h, z17.h\n"
+    "fmla z20.h, p3/M, z3.h, z17.h\n"
+    "fmla z21.h, p3/M, z4.h, z19.h\n"
+    "fmla z26.h, p3/M, z7.h, z17.h\n"
+    "fmla z25.h, p3/M, z6.h, z17.h\n"
+    "ld1h { z18.h }, p2/Z, [x9]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "fmla z29.h, p3/M, z1.h, z16.h\n"
+    "fmax z29.h, p3/M, z29.h, z31.h\n"
+    "fmin z29.h, p3/M, z29.h, z30.h\n"
+    "fmla z27.h, p3/M, z0.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x9, x25, LSL #1]\n"
+    "fmla z24.h, p3/M, z7.h, z19.h\n"
+    "fmla z20.h, p3/M, z5.h, z19.h\n"
+    "fmla z22.h, p3/M, z0.h, z18.h\n"
+    "fmla z21.h, p3/M, z2.h, z17.h\n"
+    "fmla z25.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x26, x12, LSL #1]\n"
+    "fmax z25.h, p3/M, z25.h, z31.h\n"
+    "fmla z28.h, p3/M, z6.h, z18.h\n"
+    "fmla z26.h, p3/M, z3.h, z18.h\n"
+    "fmax z28.h, p3/M, z28.h, z31.h\n"
+    "fmax z26.h, p3/M, z26.h, z31.h\n"
+    "fmla z27.h, p3/M, z8.h, z17.h\n"
+    "fmla z24.h, p3/M, z5.h, z17.h\n"
+    "fmax z27.h, p3/M, z27.h, z31.h\n"
+    "fmax z24.h, p3/M, z24.h, z31.h\n"
+    "fmla z22.h, p3/M, z8.h, z16.h\n"
+    "fmla z20.h, p3/M, z7.h, z16.h\n"
+    "fmax z22.h, p3/M, z22.h, z31.h\n"
+    "fmax z20.h, p3/M, z20.h, z31.h\n"
+    "fmla z21.h, p3/M, z6.h, z16.h\n"
+    "fmax z21.h, p3/M, z21.h, z31.h\n"
+    "fmin z28.h, p3/M, z28.h, z30.h\n"
+    "st1h { z28.h }, p0, [x11]\n"
+    "fmin z27.h, p3/M, z27.h, z30.h\n"
+    "fmin z26.h, p3/M, z26.h, z30.h\n"
+    "st1h { z29.h }, p0, [x11, x16, LSL #1]\n"
+    "fmin z25.h, p3/M, z25.h, z30.h\n"
+    "fmin z24.h, p3/M, z24.h, z30.h\n"
+    "st1h { z27.h }, p0, [x11, x22, LSL #1]\n"
+    "fmin z22.h, p3/M, z22.h, z30.h\n"
+    "fmin z20.h, p3/M, z20.h, z30.h\n"
+    "st1h { z26.h }, p0, [x24]\n"
+    "fmin z21.h, p3/M, z21.h, z30.h\n"
+    "st1h { z25.h }, p0, [x24, x16, LSL #1]\n"
+    "st1h { z24.h }, p0, [x24, x22, LSL #1]\n"
+    "st1h { z22.h }, p0, [x23]\n"
+    "st1h { z20.h }, p0, [x23, x16, LSL #1]\n"
+    "st1h { z21.h }, p0, [x23, x22, LSL #1]\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..4f8368acd5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Indirect-addressing entry point for the SVE fp16 3x3 stride-1 depthwise
+// kernel that produces a 3x3 output tile per call.  Instead of computing
+// addresses from strides, the caller supplies one pointer per input element
+// (input_ptrs, 25 entries = the 5x5 input patch a 3x3 kernel needs for a
+// 3x3 output) and one pointer per output element (outptrs).  n_channels
+// fp16 channels are processed one vector at a time and the results are
+// clamped to [activation_min, activation_max].
+void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  // Argument block handed to the assembly below; field offsets are taken
+  // with offsetof() in the asm operand list, so layout changes here must be
+  // mirrored there.
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    const __fp16 *inptrs[25];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      // Re-order the caller's 25 input pointers into the sequence in which
+      // the assembly below consumes them (generator-chosen order).
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[4];
+      inptrs[3] = input_ptrs[20];
+      inptrs[4] = input_ptrs[7];
+      inptrs[5] = input_ptrs[24];
+      inptrs[6] = input_ptrs[11];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[3];
+      inptrs[9] = input_ptrs[13];
+      inptrs[10] = input_ptrs[5];
+      inptrs[11] = input_ptrs[9];
+      inptrs[12] = input_ptrs[15];
+      inptrs[13] = input_ptrs[17];
+      inptrs[14] = input_ptrs[19];
+      inptrs[15] = input_ptrs[21];
+      inptrs[16] = input_ptrs[6];
+      inptrs[17] = input_ptrs[8];
+      inptrs[18] = input_ptrs[23];
+      inptrs[19] = input_ptrs[16];
+      inptrs[20] = input_ptrs[2];
+      inptrs[21] = input_ptrs[18];
+      inptrs[22] = input_ptrs[10];
+      inptrs[23] = input_ptrs[14];
+      inptrs[24] = input_ptrs[22];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  // NOTE(review): machine-generated SVE assembly -- do not hand-edit.
+  // Register roles, as established by the code below: p3 is the all-true
+  // governing predicate; p2 (whilelt) predicates the per-channel loads,
+  // p1/p0 the stores; x8 walks the packed parameter block, from which z14
+  // is loaded and used (via movprfx) to initialise every accumulator --
+  // presumably the bias, TODO confirm against the parameter-packing code --
+  // and z0..z8 hold the nine 3x3 filter taps; z31/z30 hold the broadcast
+  // activation min/max consumed by the fmax/fmin clamps; x16 is the SVE
+  // vector length in halfwords (cnth).  Label 1 is the full-vector channel
+  // loop, label 2 the final (partial-vector) channel tail.
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "ld1h { z14.h }, p3/Z, [x8]\n"
+    "cnth x16\n"
+    "mov x15, #0x0\n"
+    "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
+    "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
+    "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
+    "cmp x16, %x[n_channels]\n"
+    "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
+    "sub x14, XZR, x16\n"
+    "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+    "addvl x8, x8, #16\n"
+    "ldp x24, x23, [x17, #0x0]\n"
+    "ldp x22, x21, [x17, #0x10]\n"
+    "ldr x20, [x17, #0x20]\n"
+    "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ld1rh { z31.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "ld1rh { z30.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
+    "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
+    "ld1h { z9.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "addvl x8, x8, #-6\n"
+    "ld1h { z10.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "ld1h { z11.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "bge 2f\n"
+    "1:" // Channel loop
+    "movprfx z29, z14\n fmla z29.h, p3/M, z8.h, z9.h\n"
+    "movprfx z28, z14\n fmla z28.h, p3/M, z7.h, z9.h\n"
+    "ldr x23, [x17, #0x30]\n"
+    "ldr x26, [x17, #0x38]\n"
+    "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+    "fmla z29.h, p3/M, z0.h, z10.h\n"
+    "ldr x22, [x17, #0x28]\n"
+    "ldr x21, [x17, #0x48]\n"
+    "fmla z28.h, p3/M, z4.h, z13.h\n"
+    "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+    "ldr x20, [x17, #0x40]\n"
+    "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+    "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+    "ldr x25, [x17, #0x50]\n"
+    "ldr x24, [x17, #0x58]\n"
+    "fmla z27.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "movprfx z23, z14\n fmla z23.h, p3/M, z2.h, z9.h\n"
+    "ldr x23, [x17, #0x60]\n"
+    "fmla z29.h, p3/M, z5.h, z13.h\n"
+    "fmla z28.h, p3/M, z6.h, z18.h\n"
+    "ldr x12, [x17, #0x70]\n"
+    "ldr x11, [x17, #0x88]\n"
+    "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
+    "fmla z27.h, p3/M, z3.h, z13.h\n"
+    "inch x14\n"
+    "mov p1.b, p2.b\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "fmla z25.h, p3/M, z1.h, z13.h\n"
+    "ldr x10, [x13, #0x0]\n"
+    "whilelt p0.h, x16, %x[n_channels]\n"
+    "fmla z24.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z23.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
+    "fmla z29.h, p3/M, z7.h, z18.h\n"
+    "ldr x22, [x17, #0x68]\n"
+    "ldr x21, [x17, #0x78]\n"
+    "fmla z28.h, p3/M, z0.h, z17.h\n"
+    "fmla z22.h, p3/M, z8.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "ldr x20, [x17, #0x80]\n"
+    "fmla z26.h, p3/M, z4.h, z18.h\n"
+    "fmla z25.h, p3/M, z3.h, z18.h\n"
+    "ldr x9, [x13, #0x8]\n"
+    "ldr x28, [x13, #0x10]\n"
+    "fmla z21.h, p3/M, z0.h, z18.h\n"
+    "fmla z24.h, p3/M, z4.h, z19.h\n"
+    "ldr x27, [x13, #0x18]\n"
+    "ld1h { z14.h }, p3/Z, [x8]\n"
+    "fmla z23.h, p3/M, z1.h, z18.h\n"
+    "fmla z29.h, p3/M, z1.h, z17.h\n"
+    "ld1h { z20.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "fmla z27.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "ldr x26, [x17, #0x90]\n"
+    "fmla z25.h, p3/M, z5.h, z19.h\n"
+    "fmla z21.h, p3/M, z2.h, z19.h\n"
+    "ldr x25, [x17, #0xa0]\n"
+    "ldr x24, [x17, #0x98]\n"
+    "fmla z26.h, p3/M, z0.h, z20.h\n"
+    "fmla z24.h, p3/M, z2.h, z17.h\n"
+    "fmla z28.h, p3/M, z8.h, z19.h\n"
+    "fmla z27.h, p3/M, z7.h, z19.h\n"
+    "fmla z22.h, p3/M, z1.h, z19.h\n"
+    "fmla z23.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ldr x23, [x17, #0xa8]\n"
+    "fmla z26.h, p3/M, z6.h, z16.h\n"
+    "fmla z25.h, p3/M, z7.h, z18.h\n"
+    "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "ldr x22, [x17, #0xc0]\n"
+    "fmla z24.h, p3/M, z6.h, z18.h\n"
+    "fmla z21.h, p3/M, z4.h, z18.h\n"
+    "fmla z29.h, p3/M, z3.h, z20.h\n"
+    "fmla z27.h, p3/M, z5.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "fmla z23.h, p3/M, z5.h, z18.h\n"
+    "fmla z22.h, p3/M, z3.h, z18.h\n"
+    "ldr x21, [x17, #0xb0]\n"
+    "ldr x20, [x17, #0xb8]\n"
+    "fmla z26.h, p3/M, z8.h, z18.h\n"
+    "fmla z24.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z6.h, z16.h\n"
+    "fmla z28.h, p3/M, z3.h, z19.h\n"
+    "fmla z25.h, p3/M, z0.h, z19.h\n"
+    "fmla z22.h, p3/M, z5.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z23.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z19.h\n"
+    "fmla z26.h, p3/M, z1.h, z19.h\n"
+    "fmla z28.h, p3/M, z5.h, z17.h\n"
+    "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z4.h, z17.h\n"
+    "fmla z25.h, p3/M, z2.h, z17.h\n"
+    "fmla z24.h, p3/M, z1.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z18.h\n"
+    "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ldr x25, [x17, #0x20]\n"
+    "fmla z22.h, p3/M, z7.h, z18.h\n"
+    "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z17.h\n"
+    "fmla z26.h, p3/M, z7.h, z16.h\n"
+    "fmla z25.h, p3/M, z6.h, z16.h\n"
+    "fmla z23.h, p3/M, z4.h, z16.h\n"
+    "fmla z21.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "fmla z22.h, p3/M, z4.h, z18.h\n"
+    "fmla z28.h, p3/M, z1.h, z17.h\n"
+    "fmax z28.h, p3/M, z28.h, z31.h\n"
+    "fmin z28.h, p3/M, z28.h, z30.h\n"
+    "fmla z27.h, p3/M, z0.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z6.h, z16.h\n"
+    "fmax z29.h, p3/M, z29.h, z31.h\n"
+    "fmla z24.h, p3/M, z7.h, z18.h\n"
+    "fmla z21.h, p3/M, z5.h, z18.h\n"
+    "fmin z29.h, p3/M, z29.h, z30.h\n"
+    "st1h { z29.h }, p1, [x10, x14, LSL #1]\n"
+    "fmla z23.h, p3/M, z0.h, z16.h\n"
+    "fmla z22.h, p3/M, z2.h, z17.h\n"
+    "ldr x24, [x13, #0x20]\n"
+    "st1h { z28.h }, p1, [x9, x14, LSL #1]\n"
+    "fmla z25.h, p3/M, z8.h, z18.h\n"
+    "fmla z26.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ldp x23, x22, [x17, #0x0]\n"
+    "fmla z27.h, p3/M, z8.h, z17.h\n"
+    "fmla z24.h, p3/M, z5.h, z17.h\n"
+    "ldp x21, x20, [x17, #0x10]\n"
+    "fmax z27.h, p3/M, z27.h, z31.h\n"
+    "fmla z23.h, p3/M, z8.h, z16.h\n"
+    "fmla z21.h, p3/M, z7.h, z16.h\n"
+    "fmax z26.h, p3/M, z26.h, z31.h\n"
+    "fmax z25.h, p3/M, z25.h, z31.h\n"
+    "fmla z22.h, p3/M, z6.h, z16.h\n"
+    "inch x15\n"
+    "ld1h { z9.h }, p0/Z, [x23, x16, LSL #1]\n"
+    "ld1h { z10.h }, p0/Z, [x22, x16, LSL #1]\n"
+    "ld1h { z11.h }, p0/Z, [x21, x16, LSL #1]\n"
+    "ld1h { z12.h }, p0/Z, [x20, x16, LSL #1]\n"
+    "fmin z27.h, p3/M, z27.h, z30.h\n"
+    "fmin z26.h, p3/M, z26.h, z30.h\n"
+    "ld1h { z13.h }, p0/Z, [x25, x16, LSL #1]\n"
+    "inch x16\n"
+    "fmin z25.h, p3/M, z25.h, z30.h\n"
+    "st1h { z27.h }, p1, [x28, x14, LSL #1]\n"
+    "fmax z24.h, p3/M, z24.h, z31.h\n"
+    "fmax z23.h, p3/M, z23.h, z31.h\n"
+    "st1h { z26.h }, p1, [x27, x14, LSL #1]\n"
+    "ldr x23, [x13, #0x28]\n"
+    "fmax z21.h, p3/M, z21.h, z31.h\n"
+    "fmax z22.h, p3/M, z22.h, z31.h\n"
+    "st1h { z25.h }, p1, [x24, x14, LSL #1]\n"
+    "ldr x22, [x13, #0x30]\n"
+    "ldr x21, [x13, #0x38]\n"
+    "ldr x20, [x13, #0x40]\n"
+    "whilelt p2.h, x15, %x[n_channels]\n"
+    "cmp x16, %x[n_channels]\n"
+    "ld1h { z0.h }, p3/Z, [x8, #1, MUL VL]\n"
+    "ld1h { z1.h }, p3/Z, [x8, #2, MUL VL]\n"
+    "fmin z24.h, p3/M, z24.h, z30.h\n"
+    "fmin z23.h, p3/M, z23.h, z30.h\n"
+    "ld1h { z2.h }, p3/Z, [x8, #3, MUL VL]\n"
+    "ld1h { z3.h }, p3/Z, [x8, #4, MUL VL]\n"
+    "fmin z21.h, p3/M, z21.h, z30.h\n"
+    "fmin z22.h, p3/M, z22.h, z30.h\n"
+    "ld1h { z4.h }, p3/Z, [x8, #5, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x8, #6, MUL VL]\n"
+    "st1h { z24.h }, p1, [x23, x14, LSL #1]\n"
+    "ld1h { z6.h }, p3/Z, [x8, #7, MUL VL]\n"
+    "addvl x8, x8, #16\n"
+    "st1h { z23.h }, p1, [x22, x14, LSL #1]\n"
+    "ld1h { z7.h }, p3/Z, [x8, #-8, MUL VL]\n"
+    "st1h { z21.h }, p1, [x21, x14, LSL #1]\n"
+    "ld1h { z8.h }, p3/Z, [x8, #-7, MUL VL]\n"
+    "addvl x8, x8, #-6\n"
+    "st1h { z22.h }, p1, [x20, x14, LSL #1]\n"
+    "blt 1b\n"
+    "2:" // Channel tail
+    "movprfx z29, z14\n fmla z29.h, p3/M, z8.h, z9.h\n"
+    "movprfx z28, z14\n fmla z28.h, p3/M, z7.h, z9.h\n"
+    "ldr x23, [x17, #0x30]\n"
+    "ldr x26, [x17, #0x38]\n"
+    "movprfx z27, z14\n fmla z27.h, p3/M, z6.h, z9.h\n"
+    "fmla z29.h, p3/M, z0.h, z10.h\n"
+    "ldr x22, [x17, #0x28]\n"
+    "ldr x21, [x17, #0x48]\n"
+    "fmla z28.h, p3/M, z4.h, z13.h\n"
+    "movprfx z26, z14\n fmla z26.h, p3/M, z5.h, z9.h\n"
+    "ldr x20, [x17, #0x40]\n"
+    "ld1h { z19.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "movprfx z25, z14\n fmla z25.h, p3/M, z4.h, z9.h\n"
+    "movprfx z24, z14\n fmla z24.h, p3/M, z3.h, z9.h\n"
+    "ldr x25, [x17, #0x50]\n"
+    "ldr x24, [x17, #0x58]\n"
+    "fmla z27.h, p3/M, z2.h, z11.h\n"
+    "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "movprfx z23, z14\n fmla z23.h, p3/M, z2.h, z9.h\n"
+    "ldr x23, [x17, #0x60]\n"
+    "fmla z29.h, p3/M, z5.h, z13.h\n"
+    "fmla z28.h, p3/M, z6.h, z18.h\n"
+    "ldr x12, [x17, #0x70]\n"
+    "ldr x11, [x17, #0x88]\n"
+    "movprfx z22, z14\n fmla z22.h, p3/M, z0.h, z9.h\n"
+    "fmla z27.h, p3/M, z3.h, z13.h\n"
+    "inch x14\n"
+    "mov p0.b, p2.b\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "fmla z25.h, p3/M, z1.h, z13.h\n"
+    "ldr x10, [x13, #0x0]\n"
+    "ldr x9, [x13, #0x8]\n"
+    "fmla z24.h, p3/M, z0.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z23.h, p3/M, z6.h, z12.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "movprfx z21, z14\n fmla z21.h, p3/M, z1.h, z9.h\n"
+    "fmla z29.h, p3/M, z7.h, z18.h\n"
+    "ldr x22, [x17, #0x68]\n"
+    "ldr x21, [x17, #0x78]\n"
+    "fmla z28.h, p3/M, z0.h, z17.h\n"
+    "fmla z22.h, p3/M, z8.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "ldr x20, [x17, #0x80]\n"
+    "fmla z26.h, p3/M, z4.h, z18.h\n"
+    "fmla z25.h, p3/M, z3.h, z18.h\n"
+    "ldr x28, [x13, #0x10]\n"
+    "ldr x27, [x13, #0x18]\n"
+    "fmla z21.h, p3/M, z0.h, z18.h\n"
+    "fmla z24.h, p3/M, z4.h, z19.h\n"
+    "fmla z23.h, p3/M, z1.h, z18.h\n"
+    "fmla z29.h, p3/M, z1.h, z17.h\n"
+    "ld1h { z20.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "ld1h { z17.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z28.h, p3/M, z2.h, z16.h\n"
+    "fmla z27.h, p3/M, z1.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "ldr x26, [x17, #0x90]\n"
+    "fmla z25.h, p3/M, z5.h, z19.h\n"
+    "fmla z21.h, p3/M, z2.h, z19.h\n"
+    "ldr x25, [x17, #0xa0]\n"
+    "ldr x24, [x17, #0x98]\n"
+    "fmla z26.h, p3/M, z0.h, z20.h\n"
+    "fmla z24.h, p3/M, z2.h, z17.h\n"
+    "fmla z28.h, p3/M, z8.h, z19.h\n"
+    "fmla z27.h, p3/M, z7.h, z19.h\n"
+    "fmla z22.h, p3/M, z1.h, z19.h\n"
+    "fmla z23.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "ldr x23, [x17, #0xa8]\n"
+    "fmla z26.h, p3/M, z6.h, z16.h\n"
+    "fmla z25.h, p3/M, z7.h, z18.h\n"
+    "ld1h { z19.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "ldr x22, [x17, #0xc0]\n"
+    "fmla z24.h, p3/M, z6.h, z18.h\n"
+    "fmla z21.h, p3/M, z4.h, z18.h\n"
+    "fmla z29.h, p3/M, z3.h, z20.h\n"
+    "fmla z27.h, p3/M, z5.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "fmla z23.h, p3/M, z5.h, z18.h\n"
+    "fmla z22.h, p3/M, z3.h, z18.h\n"
+    "ldr x21, [x17, #0xb0]\n"
+    "ldr x20, [x17, #0xb8]\n"
+    "fmla z26.h, p3/M, z8.h, z18.h\n"
+    "fmla z24.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z6.h, z16.h\n"
+    "fmla z28.h, p3/M, z3.h, z19.h\n"
+    "fmla z25.h, p3/M, z0.h, z19.h\n"
+    "fmla z22.h, p3/M, z5.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x11, x15, LSL #1]\n"
+    "fmla z23.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x26, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z4.h, z19.h\n"
+    "fmla z26.h, p3/M, z1.h, z19.h\n"
+    "fmla z28.h, p3/M, z5.h, z17.h\n"
+    "ld1h { z16.h }, p2/Z, [x24, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z4.h, z17.h\n"
+    "fmla z25.h, p3/M, z2.h, z17.h\n"
+    "fmla z24.h, p3/M, z1.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z18.h\n"
+    "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z22.h, p3/M, z7.h, z18.h\n"
+    "ld1h { z18.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z2.h, z17.h\n"
+    "fmla z26.h, p3/M, z7.h, z16.h\n"
+    "fmla z25.h, p3/M, z6.h, z16.h\n"
+    "fmla z23.h, p3/M, z4.h, z16.h\n"
+    "fmla z21.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x15, LSL #1]\n"
+    "fmla z22.h, p3/M, z4.h, z18.h\n"
+    "fmla z28.h, p3/M, z1.h, z17.h\n"
+    "fmax z28.h, p3/M, z28.h, z31.h\n"
+    "fmin z28.h, p3/M, z28.h, z30.h\n"
+    "fmla z27.h, p3/M, z0.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x20, x15, LSL #1]\n"
+    "fmla z29.h, p3/M, z6.h, z16.h\n"
+    "fmax z29.h, p3/M, z29.h, z31.h\n"
+    "fmla z24.h, p3/M, z7.h, z18.h\n"
+    "fmla z21.h, p3/M, z5.h, z18.h\n"
+    "fmin z29.h, p3/M, z29.h, z30.h\n"
+    "st1h { z29.h }, p0, [x10, x14, LSL #1]\n"
+    "fmla z23.h, p3/M, z0.h, z16.h\n"
+    "fmla z22.h, p3/M, z2.h, z17.h\n"
+    "ldr x20, [x13, #0x20]\n"
+    "st1h { z28.h }, p0, [x9, x14, LSL #1]\n"
+    "fmla z25.h, p3/M, z8.h, z18.h\n"
+    "fmla z26.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x15, LSL #1]\n"
+    "fmax z26.h, p3/M, z26.h, z31.h\n"
+    "fmla z27.h, p3/M, z8.h, z17.h\n"
+    "fmla z24.h, p3/M, z5.h, z17.h\n"
+    "fmax z27.h, p3/M, z27.h, z31.h\n"
+    "fmax z25.h, p3/M, z25.h, z31.h\n"
+    "fmla z23.h, p3/M, z8.h, z16.h\n"
+    "fmla z21.h, p3/M, z7.h, z16.h\n"
+    "fmin z27.h, p3/M, z27.h, z30.h\n"
+    "fmin z26.h, p3/M, z26.h, z30.h\n"
+    "fmla z22.h, p3/M, z6.h, z16.h\n"
+    "fmin z25.h, p3/M, z25.h, z30.h\n"
+    "fmax z24.h, p3/M, z24.h, z31.h\n"
+    "st1h { z27.h }, p0, [x28, x14, LSL #1]\n"
+    "fmax z23.h, p3/M, z23.h, z31.h\n"
+    "fmax z21.h, p3/M, z21.h, z31.h\n"
+    "st1h { z26.h }, p0, [x27, x14, LSL #1]\n"
+    "ldr x23, [x13, #0x28]\n"
+    "fmax z22.h, p3/M, z22.h, z31.h\n"
+    "st1h { z25.h }, p0, [x20, x14, LSL #1]\n"
+    "ldr x22, [x13, #0x30]\n"
+    "ldr x21, [x13, #0x38]\n"
+    "ldr x20, [x13, #0x40]\n"
+    "fmin z24.h, p3/M, z24.h, z30.h\n"
+    "fmin z23.h, p3/M, z23.h, z30.h\n"
+    "st1h { z24.h }, p0, [x23, x14, LSL #1]\n"
+    "fmin z21.h, p3/M, z21.h, z30.h\n"
+    "fmin z22.h, p3/M, z22.h, z30.h\n"
+    "st1h { z23.h }, p0, [x22, x14, LSL #1]\n"
+    "st1h { z21.h }, p0, [x21, x14, LSL #1]\n"
+    "st1h { z22.h }, p0, [x20, x14, LSL #1]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..af5ee740c9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor for the SVE fp16 NHWC 3x3 stride-1 depthwise kernel
+// that produces a 4x4 output tile per invocation.  It carries the kernel's
+// compile-time geometry (kernel/stride/output-tile sizes) and binds the two
+// entry points declared above to the generic depth-first driver.
+class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Variant taking per-element input/output pointer arrays.
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  // Variant taking base pointers plus row/column strides.
+  Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 4;
+  constexpr static unsigned int output_cols = 4;
+
+  sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..41eaa4f18c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x16, #0x0\n"
+ "mov x4, #0x0\n"
+ "1:" // Tile loop
+ "str x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x24, #0x4\n"
+ "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x16, x23\n" // offset = tile_i * ld_input_row
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x16, x22\n" // offset = tile_i * ld_output_row
+ "add x7, x5, x5\n"
+ "madd x21, x4, x5, x21\n" // offset += tile_j * ld_input_col
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "cnth x16\n"
+ "madd x20, x4, x6, x20\n" // offset += tile_j * ld_output_col
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x14, x7, x5\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x21, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x13, x8, x23, LSL #1\n"
+ "ld1h { z19.h }, p3/Z, [x17]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "add x12, x13, x23, LSL #1\n"
+ "add x15, x15, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "add x11, x12, x23, LSL #1\n"
+ "add x10, x14, x5\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "add x9, x15, x22, LSL #1\n"
+ "add x28, x11, x23, LSL #1\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "add x27, x10, x5\n"
+ "add x26, x9, x22, LSL #1\n"
+ "add x25, x6, x6\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cmp x16, %x[n_channels]\n"
+ "add x24, x28, x23, LSL #1\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "add x23, x26, x22, LSL #1\n"
+ "add x22, x25, x6\n"
+ "ld1h { z9.h }, p2/Z, [x12, x7, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x8]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x16\n"
+ "ld1h { z11.h }, p2/Z, [x8, x27, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x12, x14, LSL #1]\n"
+ "addvl x17, x17, #-6\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z14, z19\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z8.h, z9.h\n"
+ "whilelt p1.h, x16, %x[n_channels]\n"
+ "inch x21\n"
+ "movprfx z21, z19\n fmla z21.h, p3/M, z3.h, z9.h\n"
+ "movprfx z22, z19\n fmla z22.h, p3/M, z1.h, z9.h\n"
+ "inch x16\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z20, z19\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "inch x20\n"
+ "movprfx z13, z19\n fmla z13.h, p3/M, z7.h, z9.h\n"
+ "movprfx z17, z19\n fmla z17.h, p3/M, z6.h, z9.h\n"
+ "movprfx z27, z19\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "movprfx z18, z19\n fmla z18.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z29.h }, p2/Z, [x24]\n"
+ "ld1h { z11.h }, p2/Z, [x24, x27, LSL #1]\n"
+ "fmla z21.h, p3/M, z4.h, z12.h\n"
+ "fmla z22.h, p3/M, z2.h, z12.h\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "movprfx z23, z19\n fmla z23.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z10.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z14.h, p3/M, z7.h, z9.h\n"
+ "fmla z13.h, p3/M, z8.h, z12.h\n"
+ "fmla z17.h, p3/M, z7.h, z12.h\n"
+ "fmla z30.h, p3/M, z6.h, z12.h\n"
+ "movprfx z26, z19\n fmla z26.h, p3/M, z3.h, z12.h\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "movprfx z24, z19\n fmla z24.h, p3/M, z8.h, z11.h\n"
+ "fmla z21.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x10, LSL #1]\n"
+ "fmla z22.h, p3/M, z4.h, z9.h\n"
+ "fmla z20.h, p3/M, z3.h, z9.h\n"
+ "movprfx z25, z19\n fmla z25.h, p3/M, z1.h, z9.h\n"
+ "movprfx z29, z19\n fmla z29.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z19.h }, p3/Z, [x17]\n"
+ "fmla z27.h, p3/M, z8.h, z9.h\n"
+ "fmla z18.h, p3/M, z5.h, z9.h\n"
+ "fmla z23.h, p3/M, z2.h, z9.h\n"
+ "fmla z14.h, p3/M, z8.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z13.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x27, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z11.h\n"
+ "fmla z30.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28]\n"
+ "fmla z21.h, p3/M, z7.h, z10.h\n"
+ "fmla z26.h, p3/M, z6.h, z10.h\n"
+ "fmla z22.h, p3/M, z5.h, z10.h\n"
+ "fmla z20.h, p3/M, z4.h, z10.h\n"
+ "fmla z28.h, p3/M, z3.h, z10.h\n"
+ "fmla z25.h, p3/M, z2.h, z10.h\n"
+ "fmla z29.h, p3/M, z1.h, z10.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z27.h, p3/M, z0.h, z9.h\n"
+ "fmla z18.h, p3/M, z6.h, z11.h\n"
+ "fmla z23.h, p3/M, z3.h, z11.h\n"
+ "fmla z14.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "fmla z30.h, p3/M, z5.h, z12.h\n"
+ "fmla z26.h, p3/M, z2.h, z12.h\n"
+ "fmla z13.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z9.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z17.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z0.h, z10.h\n"
+ "fmla z28.h, p3/M, z8.h, z11.h\n"
+ "fmla z24.h, p3/M, z5.h, z11.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z27.h, p3/M, z2.h, z10.h\n"
+ "fmla z14.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z5.h, z10.h\n"
+ "fmla z13.h, p3/M, z5.h, z9.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z4.h, z9.h\n"
+ "fmla z30.h, p3/M, z3.h, z9.h\n"
+ "fmla z21.h, p3/M, z1.h, z9.h\n"
+ "fmla z26.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z10.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z12.h\n"
+ "fmla z25.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z27.h, p3/M, z4.h, z11.h\n"
+ "fmla z14.h, p3/M, z3.h, z11.h\n"
+ "fmla z18.h, p3/M, z1.h, z11.h\n"
+ "fmla z22.h, p3/M, z0.h, z11.h\n"
+ "fmla z31.h, p3/M, z7.h, z11.h\n"
+ "fmla z13.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z9.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z8.h, z12.h\n"
+ "fmla z24.h, p3/M, z7.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z8.h, z10.h\n"
+ "fmla z30.h, p3/M, z7.h, z10.h\n"
+ "fmla z21.h, p3/M, z5.h, z10.h\n"
+ "fmla z26.h, p3/M, z4.h, z10.h\n"
+ "fmla z20.h, p3/M, z2.h, z10.h\n"
+ "fmla z28.h, p3/M, z1.h, z10.h\n"
+ "ld1h { z11.h }, p2/Z, [x8, x14, LSL #1]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
+ "fmla z14.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmla z22.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z1.h, z12.h\n"
+ "fmla z25.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x11, x10, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z9.h\n"
+ "fmla z13.h, p3/M, z1.h, z9.h\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x12]\n"
+ "fmla z29.h, p3/M, z2.h, z12.h\n"
+ "fmla z30.h, p3/M, z0.h, z11.h\n"
+ "fmla z27.h, p3/M, z3.h, z9.h\n"
+ "fmla z18.h, p3/M, z0.h, z9.h\n"
+ "fmla z21.h, p3/M, z8.h, z12.h\n"
+ "fmla z26.h, p3/M, z7.h, z12.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "fmla z24.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z13.h, p3/M, z2.h, z11.h\n"
+ "fmla z17.h, p3/M, z1.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x12, x27, LSL #1]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z31.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z12.h }, p2/Z, [x11]\n"
+ "fmla z25.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z9.h }, p1/Z, [x12, x7, LSL #1]\n"
+ "fmla z29.h, p3/M, z3.h, z10.h\n"
+ "fmla z30.h, p3/M, z8.h, z11.h\n"
+ "fmla z26.h, p3/M, z5.h, z11.h\n"
+ "fmla z28.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z3.h, z12.h\n"
+ "fmla z23.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z11.h\n"
+ "fmla z25.h, p3/M, z7.h, z12.h\n"
+ "fmla z29.h, p3/M, z6.h, z12.h\n"
+ "fmla z18.h, p3/M, z8.h, z10.h\n"
+ "fmla z22.h, p3/M, z7.h, z10.h\n"
+ "fmla z20.h, p3/M, z6.h, z10.h\n"
+ "fmla z23.h, p3/M, z5.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z5.h, z11.h\n"
+ "fmla z25.h, p3/M, z5.h, z10.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "fmla z24.h, p3/M, z3.h, z10.h\n"
+ "fmla z26.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z22.h, p3/M, z8.h, z10.h\n"
+ "fmla z20.h, p3/M, z7.h, z10.h\n"
+ "addvl x24, x24, #1\n"
+ "fmla z28.h, p3/M, z6.h, z10.h\n"
+ "fmla z25.h, p3/M, z8.h, z11.h\n"
+ "ld1h { z10.h }, p2/Z, [x13, x10, LSL #1]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z29.h, p3/M, z7.h, z11.h\n"
+ "fmla z24.h, p3/M, z6.h, z11.h\n"
+ "ld1h { z11.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z12.h\n"
+ "fmla z13.h, p3/M, z3.h, z12.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmax z13.h, p3/M, z13.h, z15.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "fmla z14.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z0.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmla z17.h, p3/M, z5.h, z10.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmla z21.h, p3/M, z2.h, z10.h\n"
+ "fmla z26.h, p3/M, z1.h, z10.h\n"
+ "fmax z14.h, p3/M, z14.h, z15.h\n"
+ "fmax z21.h, p3/M, z21.h, z15.h\n"
+ "fmla z18.h, p3/M, z7.h, z11.h\n"
+ "fmla z22.h, p3/M, z6.h, z11.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "fmla z23.h, p3/M, z4.h, z11.h\n"
+ "fmla z25.h, p3/M, z3.h, z11.h\n"
+ "fmax z22.h, p3/M, z22.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmla z20.h, p3/M, z8.h, z0.h\n"
+ "fmla z28.h, p3/M, z7.h, z0.h\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmla z29.h, p3/M, z5.h, z0.h\n"
+ "fmla z24.h, p3/M, z4.h, z0.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "ld1h { z0.h }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x17, #2, MUL VL]\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "ld1h { z2.h }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "ld1h { z4.h }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x17, #6, MUL VL]\n"
+ "fmin z13.h, p3/M, z13.h, z16.h\n"
+ "fmin z17.h, p3/M, z17.h, z16.h\n"
+ "ld1h { z6.h }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "ld1h { z10.h }, p1/Z, [x8]\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmin z14.h, p3/M, z14.h, z16.h\n"
+ "ld1h { z11.h }, p1/Z, [x8, x27, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x12, x14, LSL #1]\n"
+ "fmin z21.h, p3/M, z21.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z31.h }, p0, [x15]\n"
+ "ld1h { z7.h }, p3/Z, [x17, #-8, MUL VL]\n"
+ "fmin z18.h, p3/M, z18.h, z16.h\n"
+ "fmin z22.h, p3/M, z22.h, z16.h\n"
+ "st1h { z13.h }, p0, [x15, x6, LSL #1]\n"
+ "ld1h { z8.h }, p3/Z, [x17, #-7, MUL VL]\n"
+ "fmin z20.h, p3/M, z20.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z17.h }, p0, [x15, x25, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "st1h { z30.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z29.h, p3/M, z29.h, z16.h\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z27.h }, p0, [x9]\n"
+ "addvl x28, x28, #1\n"
+ "st1h { z14.h }, p0, [x9, x6, LSL #1]\n"
+ "addvl x15, x15, #1\n"
+ "st1h { z21.h }, p0, [x9, x25, LSL #1]\n"
+ "addvl x17, x17, #-6\n"
+ "st1h { z26.h }, p0, [x9, x22, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "st1h { z18.h }, p0, [x26]\n"
+ "st1h { z22.h }, p0, [x26, x6, LSL #1]\n"
+ "st1h { z20.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z28.h }, p0, [x26, x22, LSL #1]\n"
+ "addvl x26, x26, #1\n"
+ "st1h { z23.h }, p0, [x23]\n"
+ "st1h { z25.h }, p0, [x23, x6, LSL #1]\n"
+ "st1h { z29.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23, x22, LSL #1]\n"
+ "addvl x23, x23, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z14, z19\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z31, z19\n fmla z31.h, p3/M, z8.h, z9.h\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z30, z19\n fmla z30.h, p3/M, z3.h, z9.h\n"
+ "movprfx z13, z19\n fmla z13.h, p3/M, z1.h, z9.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x4, x4, #0x1\n"
+ "movprfx z20, z19\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "cmp x4, x20\n"
+ "add x21, x16, #0x1\n"
+ "movprfx z18, z19\n fmla z18.h, p3/M, z7.h, z9.h\n"
+ "movprfx z28, z19\n fmla z28.h, p3/M, z6.h, z9.h\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x16, x16, x21, LT\n"
+ "movprfx z17, z19\n fmla z17.h, p3/M, z5.h, z9.h\n"
+ "movprfx z26, z19\n fmla z26.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x7, LSL #1]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z31.h, p3/M, z0.h, z10.h\n"
+ "movprfx z27, z19\n fmla z27.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z29.h }, p2/Z, [x24]\n"
+ "ld1h { z21.h }, p2/Z, [x24, x27, LSL #1]\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z13.h, p3/M, z2.h, z12.h\n"
+ "csel x4, x4, XZR, LT\n"
+ "cmp x16, x20\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "movprfx z10, z19\n fmla z10.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x11, x14, LSL #1]\n"
+ "fmla z14.h, p3/M, z7.h, z9.h\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmla z28.h, p3/M, z7.h, z12.h\n"
+ "fmla z27.h, p3/M, z6.h, z12.h\n"
+ "movprfx z11, z19\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z25, z19\n fmla z25.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z22.h }, p2/Z, [x8, x5, LSL #1]\n"
+ "movprfx z24, z19\n fmla z24.h, p3/M, z8.h, z21.h\n"
+ "fmla z30.h, p3/M, z6.h, z9.h\n"
+ "ld1h { z21.h }, p2/Z, [x8, x10, LSL #1]\n"
+ "fmla z13.h, p3/M, z4.h, z9.h\n"
+ "fmla z20.h, p3/M, z3.h, z9.h\n"
+ "movprfx z12, z19\n fmla z12.h, p3/M, z1.h, z9.h\n"
+ "movprfx z23, z19\n fmla z23.h, p3/M, z0.h, z9.h\n"
+ "fmla z17.h, p3/M, z8.h, z9.h\n"
+ "fmla z26.h, p3/M, z5.h, z9.h\n"
+ "fmla z10.h, p3/M, z2.h, z9.h\n"
+ "fmla z14.h, p3/M, z8.h, z29.h\n"
+ "ld1h { z9.h }, p2/Z, [x13]\n"
+ "fmla z31.h, p3/M, z1.h, z22.h\n"
+ "fmla z18.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x13, x27, LSL #1]\n"
+ "fmla z28.h, p3/M, z2.h, z21.h\n"
+ "fmla z27.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28]\n"
+ "fmla z30.h, p3/M, z7.h, z29.h\n"
+ "fmla z11.h, p3/M, z6.h, z29.h\n"
+ "fmla z13.h, p3/M, z5.h, z29.h\n"
+ "fmla z20.h, p3/M, z4.h, z29.h\n"
+ "fmla z25.h, p3/M, z3.h, z29.h\n"
+ "fmla z12.h, p3/M, z2.h, z29.h\n"
+ "fmla z23.h, p3/M, z1.h, z29.h\n"
+ "fmla z24.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x13, x7, LSL #1]\n"
+ "fmla z17.h, p3/M, z0.h, z9.h\n"
+ "fmla z26.h, p3/M, z6.h, z19.h\n"
+ "fmla z10.h, p3/M, z3.h, z19.h\n"
+ "fmla z14.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z9.h\n"
+ "fmla z27.h, p3/M, z5.h, z22.h\n"
+ "fmla z11.h, p3/M, z2.h, z22.h\n"
+ "fmla z18.h, p3/M, z4.h, z21.h\n"
+ "ld1h { z29.h }, p2/Z, [x13, x14, LSL #1]\n"
+ "fmla z28.h, p3/M, z3.h, z21.h\n"
+ "fmla z30.h, p3/M, z0.h, z21.h\n"
+ "fmla z25.h, p3/M, z8.h, z19.h\n"
+ "fmla z24.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x5, LSL #1]\n"
+ "fmla z17.h, p3/M, z2.h, z21.h\n"
+ "fmla z14.h, p3/M, z2.h, z29.h\n"
+ "fmla z31.h, p3/M, z5.h, z21.h\n"
+ "fmla z18.h, p3/M, z5.h, z29.h\n"
+ "ld1h { z22.h }, p2/Z, [x12, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z4.h, z29.h\n"
+ "fmla z27.h, p3/M, z3.h, z29.h\n"
+ "fmla z30.h, p3/M, z1.h, z29.h\n"
+ "fmla z11.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x12, x10, LSL #1]\n"
+ "fmla z10.h, p3/M, z7.h, z19.h\n"
+ "fmla z12.h, p3/M, z6.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x10, LSL #1]\n"
+ "fmla z17.h, p3/M, z4.h, z22.h\n"
+ "fmla z14.h, p3/M, z3.h, z22.h\n"
+ "fmla z26.h, p3/M, z1.h, z22.h\n"
+ "fmla z13.h, p3/M, z0.h, z22.h\n"
+ "fmla z31.h, p3/M, z7.h, z22.h\n"
+ "fmla z18.h, p3/M, z6.h, z22.h\n"
+ "ld1h { z29.h }, p2/Z, [x8, x7, LSL #1]\n"
+ "fmla z23.h, p3/M, z8.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z19.h\n"
+ "ld1h { z19.h }, p2/Z, [x11, x5, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z21.h\n"
+ "fmla z27.h, p3/M, z7.h, z21.h\n"
+ "fmla z30.h, p3/M, z5.h, z21.h\n"
+ "fmla z11.h, p3/M, z4.h, z21.h\n"
+ "fmla z20.h, p3/M, z2.h, z21.h\n"
+ "fmla z25.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z22.h }, p2/Z, [x8, x14, LSL #1]\n"
+ "fmla z17.h, p3/M, z7.h, z19.h\n"
+ "fmla z14.h, p3/M, z6.h, z19.h\n"
+ "fmla z26.h, p3/M, z4.h, z19.h\n"
+ "fmla z13.h, p3/M, z3.h, z19.h\n"
+ "fmla z10.h, p3/M, z1.h, z19.h\n"
+ "fmla z12.h, p3/M, z0.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x11, x10, LSL #1]\n"
+ "fmla z31.h, p3/M, z2.h, z29.h\n"
+ "fmla z18.h, p3/M, z1.h, z29.h\n"
+ "fmla z28.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x12]\n"
+ "fmla z23.h, p3/M, z2.h, z21.h\n"
+ "fmla z27.h, p3/M, z0.h, z22.h\n"
+ "fmla z17.h, p3/M, z3.h, z29.h\n"
+ "fmla z26.h, p3/M, z0.h, z29.h\n"
+ "fmla z30.h, p3/M, z8.h, z21.h\n"
+ "fmla z11.h, p3/M, z7.h, z21.h\n"
+ "fmla z20.h, p3/M, z5.h, z21.h\n"
+ "fmla z25.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z19.h }, p2/Z, [x28, x7, LSL #1]\n"
+ "fmla z18.h, p3/M, z2.h, z22.h\n"
+ "fmla z28.h, p3/M, z1.h, z22.h\n"
+ "ld1h { z21.h }, p2/Z, [x12, x27, LSL #1]\n"
+ "fmla z31.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x11]\n"
+ "fmla z12.h, p3/M, z4.h, z19.h\n"
+ "fmla z23.h, p3/M, z3.h, z19.h\n"
+ "fmla z27.h, p3/M, z8.h, z21.h\n"
+ "fmla z11.h, p3/M, z5.h, z21.h\n"
+ "fmla z25.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z9.h }, p2/Z, [x11, x27, LSL #1]\n"
+ "fmla z17.h, p3/M, z6.h, z29.h\n"
+ "fmla z26.h, p3/M, z3.h, z29.h\n"
+ "fmla z10.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z22.h }, p2/Z, [x24, x7, LSL #1]\n"
+ "fmla z24.h, p3/M, z2.h, z9.h\n"
+ "fmla z12.h, p3/M, z7.h, z22.h\n"
+ "fmla z23.h, p3/M, z6.h, z22.h\n"
+ "fmla z26.h, p3/M, z8.h, z19.h\n"
+ "fmla z13.h, p3/M, z7.h, z19.h\n"
+ "fmla z20.h, p3/M, z6.h, z19.h\n"
+ "fmla z10.h, p3/M, z5.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x14, LSL #1]\n"
+ "fmla z25.h, p3/M, z5.h, z9.h\n"
+ "fmla z12.h, p3/M, z5.h, z21.h\n"
+ "fmla z23.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z3.h, z21.h\n"
+ "fmla z11.h, p3/M, z8.h, z9.h\n"
+ "ld1h { z19.h }, p2/Z, [x24, x14, LSL #1]\n"
+ "fmla z10.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z22.h }, p2/Z, [x13, x5, LSL #1]\n"
+ "fmla z13.h, p3/M, z8.h, z21.h\n"
+ "fmla z20.h, p3/M, z7.h, z21.h\n"
+ "fmla z25.h, p3/M, z6.h, z21.h\n"
+ "fmla z12.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z29.h }, p2/Z, [x13, x10, LSL #1]\n"
+ "fmla z23.h, p3/M, z7.h, z19.h\n"
+ "fmla z24.h, p3/M, z6.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x5, LSL #1]\n"
+ "fmla z31.h, p3/M, z4.h, z22.h\n"
+ "fmla z18.h, p3/M, z3.h, z22.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmax z18.h, p3/M, z18.h, z15.h\n"
+ "fmla z17.h, p3/M, z1.h, z22.h\n"
+ "fmla z14.h, p3/M, z0.h, z22.h\n"
+ "ld1h { z9.h }, p2/Z, [x28, x10, LSL #1]\n"
+ "fmax z17.h, p3/M, z17.h, z15.h\n"
+ "fmla z28.h, p3/M, z5.h, z29.h\n"
+ "fmla z27.h, p3/M, z4.h, z29.h\n"
+ "fmax z28.h, p3/M, z28.h, z15.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmla z30.h, p3/M, z2.h, z29.h\n"
+ "fmla z11.h, p3/M, z1.h, z29.h\n"
+ "fmax z14.h, p3/M, z14.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmla z26.h, p3/M, z7.h, z21.h\n"
+ "fmla z13.h, p3/M, z6.h, z21.h\n"
+ "fmax z11.h, p3/M, z11.h, z15.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmla z10.h, p3/M, z4.h, z21.h\n"
+ "fmla z12.h, p3/M, z3.h, z21.h\n"
+ "fmax z13.h, p3/M, z13.h, z15.h\n"
+ "fmax z10.h, p3/M, z10.h, z15.h\n"
+ "fmla z20.h, p3/M, z8.h, z9.h\n"
+ "fmla z25.h, p3/M, z7.h, z9.h\n"
+ "fmax z20.h, p3/M, z20.h, z15.h\n"
+ "fmax z25.h, p3/M, z25.h, z15.h\n"
+ "fmla z23.h, p3/M, z5.h, z9.h\n"
+ "fmla z24.h, p3/M, z4.h, z9.h\n"
+ "fmax z12.h, p3/M, z12.h, z15.h\n"
+ "fmax z23.h, p3/M, z23.h, z15.h\n"
+ "fmax z24.h, p3/M, z24.h, z15.h\n"
+ "fmin z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z31.h }, p0, [x15]\n"
+ "fmin z18.h, p3/M, z18.h, z16.h\n"
+ "fmin z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z18.h }, p0, [x15, x6, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z16.h\n"
+ "fmin z17.h, p3/M, z17.h, z16.h\n"
+ "st1h { z28.h }, p0, [x15, x25, LSL #1]\n"
+ "fmin z14.h, p3/M, z14.h, z16.h\n"
+ "fmin z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z27.h }, p0, [x15, x22, LSL #1]\n"
+ "fmin z11.h, p3/M, z11.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z17.h }, p0, [x9]\n"
+ "fmin z13.h, p3/M, z13.h, z16.h\n"
+ "fmin z20.h, p3/M, z20.h, z16.h\n"
+ "st1h { z14.h }, p0, [x9, x6, LSL #1]\n"
+ "fmin z25.h, p3/M, z25.h, z16.h\n"
+ "fmin z10.h, p3/M, z10.h, z16.h\n"
+ "st1h { z30.h }, p0, [x9, x25, LSL #1]\n"
+ "fmin z12.h, p3/M, z12.h, z16.h\n"
+ "fmin z23.h, p3/M, z23.h, z16.h\n"
+ "st1h { z11.h }, p0, [x9, x22, LSL #1]\n"
+ "fmin z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z26.h }, p0, [x26]\n"
+ "st1h { z13.h }, p0, [x26, x6, LSL #1]\n"
+ "st1h { z20.h }, p0, [x26, x25, LSL #1]\n"
+ "st1h { z25.h }, p0, [x26, x22, LSL #1]\n"
+ "st1h { z10.h }, p0, [x23]\n"
+ "st1h { z12.h }, p0, [x23, x6, LSL #1]\n"
+ "st1h { z23.h }, p0, [x23, x25, LSL #1]\n"
+ "st1h { z24.h }, p0, [x23, x22, LSL #1]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..c0be293cd7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1h { z17.h }, p3/Z, [x7]\n"
+ "cnth x17\n"
+ "mov x16, #0x0\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
+ "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n"
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n"
+ "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n"
+ "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n"
+ "sub x15, XZR, x17\n"
+ "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "ldp x23, x22, [x8, #0x0]\n"
+ "ldp x21, x20, [x8, #0x10]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rh { z16.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rh { z19.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "ld1h { z9.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z20, z17\n fmla z20.h, p3/M, z4.h, z9.h\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z3.h, z9.h\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z0.h, z9.h\n"
+ "movprfx z22, z17\n fmla z22.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z6.h, z9.h\n"
+ "fmla z20.h, p3/M, z5.h, z12.h\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z14, z17\n fmla z14.h, p3/M, z5.h, z9.h\n"
+ "movprfx z23, z17\n fmla z23.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z25.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z26.h, p3/M, z0.h, z10.h\n"
+ "movprfx z9, z17\n fmla z9.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z28.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z24.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z31.h, p3/M, z1.h, z12.h\n"
+ "fmla z22.h, p3/M, z8.h, z12.h\n"
+ "inch x15\n"
+ "mov p1.b, p2.b\n"
+ "fmla z27.h, p3/M, z7.h, z12.h\n"
+ "movprfx z15, z17\n fmla z15.h, p3/M, z6.h, z28.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z20.h, p3/M, z7.h, z25.h\n"
+ "fmla z9.h, p3/M, z6.h, z12.h\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z13, z17\n fmla z13.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z8.h, z21.h\n"
+ "fmla z24.h, p3/M, z6.h, z25.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.h, p3/M, z4.h, z25.h\n"
+ "fmla z31.h, p3/M, z3.h, z25.h\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z18, z17\n fmla z18.h, p3/M, z1.h, z25.h\n"
+ "movprfx z21, z17\n fmla z21.h, p3/M, z0.h, z25.h\n"
+ "whilelt p0.h, x17, %x[n_channels]\n"
+ "ld1h { z17.h }, p3/Z, [x7]\n"
+ "fmla z14.h, p3/M, z8.h, z25.h\n"
+ "fmla z23.h, p3/M, z5.h, z25.h\n"
+ "fmla z15.h, p3/M, z2.h, z25.h\n"
+ "fmla z26.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z22.h, p3/M, z0.h, z12.h\n"
+ "fmla z27.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z20.h, p3/M, z8.h, z10.h\n"
+ "fmla z9.h, p3/M, z1.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z24.h, p3/M, z7.h, z10.h\n"
+ "fmla z11.h, p3/M, z6.h, z10.h\n"
+ "fmla z30.h, p3/M, z5.h, z10.h\n"
+ "fmla z31.h, p3/M, z4.h, z10.h\n"
+ "fmla z13.h, p3/M, z3.h, z10.h\n"
+ "fmla z18.h, p3/M, z2.h, z10.h\n"
+ "fmla z21.h, p3/M, z1.h, z10.h\n"
+ "fmla z28.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z26.h, p3/M, z3.h, z25.h\n"
+ "fmla z14.h, p3/M, z0.h, z25.h\n"
+ "fmla z23.h, p3/M, z6.h, z29.h\n"
+ "fmla z15.h, p3/M, z3.h, z29.h\n"
+ "ld1h { z25.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z22.h, p3/M, z4.h, z10.h\n"
+ "fmla z27.h, p3/M, z3.h, z10.h\n"
+ "fmla z20.h, p3/M, z1.h, z10.h\n"
+ "fmla z9.h, p3/M, z5.h, z12.h\n"
+ "fmla z11.h, p3/M, z2.h, z12.h\n"
+ "fmla z24.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z13.h, p3/M, z8.h, z25.h\n"
+ "fmla z28.h, p3/M, z5.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z26.h, p3/M, z5.h, z10.h\n"
+ "fmla z14.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z29.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z22.h, p3/M, z5.h, z12.h\n"
+ "fmla z27.h, p3/M, z4.h, z12.h\n"
+ "fmla z20.h, p3/M, z2.h, z12.h\n"
+ "fmla z9.h, p3/M, z3.h, z12.h\n"
+ "fmla z24.h, p3/M, z1.h, z12.h\n"
+ "fmla z11.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z15.h, p3/M, z7.h, z25.h\n"
+ "fmla z18.h, p3/M, z6.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z26.h, p3/M, z7.h, z29.h\n"
+ "fmla z22.h, p3/M, z6.h, z29.h\n"
+ "fmla z14.h, p3/M, z4.h, z29.h\n"
+ "fmla z20.h, p3/M, z3.h, z29.h\n"
+ "fmla z23.h, p3/M, z1.h, z29.h\n"
+ "fmla z30.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z27.h, p3/M, z8.h, z10.h\n"
+ "fmla z21.h, p3/M, z8.h, z25.h\n"
+ "fmla z28.h, p3/M, z7.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z1.h, z10.h\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.h, p3/M, z7.h, z10.h\n"
+ "fmla z24.h, p3/M, z5.h, z10.h\n"
+ "fmla z11.h, p3/M, z4.h, z10.h\n"
+ "fmla z31.h, p3/M, z2.h, z10.h\n"
+ "ld1h { z10.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z26.h, p3/M, z2.h, z29.h\n"
+ "fmla z22.h, p3/M, z1.h, z29.h\n"
+ "fmla z27.h, p3/M, z0.h, z29.h\n"
+ "fmla z14.h, p3/M, z7.h, z25.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z20.h, p3/M, z6.h, z25.h\n"
+ "fmla z23.h, p3/M, z4.h, z25.h\n"
+ "fmla z30.h, p3/M, z3.h, z25.h\n"
+ "fmla z15.h, p3/M, z1.h, z25.h\n"
+ "fmla z18.h, p3/M, z0.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z4.h, z25.h\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z21.h, p3/M, z2.h, z25.h\n"
+ "fmla z22.h, p3/M, z2.h, z10.h\n"
+ "fmla z27.h, p3/M, z1.h, z10.h\n"
+ "fmla z9.h, p3/M, z0.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z26.h, p3/M, z6.h, z29.h\n"
+ "fmla z14.h, p3/M, z3.h, z29.h\n"
+ "fmla z23.h, p3/M, z0.h, z29.h\n"
+ "fmla z24.h, p3/M, z8.h, z25.h\n"
+ "ld1h { z10.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.h, p3/M, z7.h, z25.h\n"
+ "fmla z31.h, p3/M, z5.h, z25.h\n"
+ "fmla z28.h, p3/M, z1.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z13.h, p3/M, z2.h, z12.h\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z15.h, p3/M, z0.h, z10.h\n"
+ "fmla z18.h, p3/M, z4.h, z25.h\n"
+ "fmla z21.h, p3/M, z3.h, z25.h\n"
+ "fmla z9.h, p3/M, z8.h, z12.h\n"
+ "fmla z11.h, p3/M, z5.h, z12.h\n"
+ "fmla z14.h, p3/M, z6.h, z10.h\n"
+ "ld1h { z12.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "fmla z23.h, p3/M, z3.h, z10.h\n"
+ "ld1h { z29.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z25.h\n"
+ "fmla z31.h, p3/M, z6.h, z25.h\n"
+ "fmla z15.h, p3/M, z5.h, z25.h\n"
+ "fmla z13.h, p3/M, z5.h, z12.h\n"
+ "fmla z28.h, p3/M, z2.h, z12.h\n"
+ "fmla z18.h, p3/M, z7.h, z29.h\n"
+ "fmla z21.h, p3/M, z6.h, z29.h\n"
+ "fmla z23.h, p3/M, z8.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z15.h, p3/M, z8.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z25.h\n"
+ "fmla z31.h, p3/M, z7.h, z25.h\n"
+ "fmla z13.h, p3/M, z6.h, z25.h\n"
+ "fmla z18.h, p3/M, z5.h, z25.h\n"
+ "fmla z21.h, p3/M, z4.h, z25.h\n"
+ "fmla z28.h, p3/M, z3.h, z25.h\n"
+ "ld1h { z25.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldp x27, x26, [x8, #0x0]\n"
+ "fmla z11.h, p3/M, z8.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z29.h\n"
+ "fmax z26.h, p3/M, z26.h, z16.h\n"
+ "fmla z22.h, p3/M, z3.h, z29.h\n"
+ "fmla z27.h, p3/M, z5.h, z25.h\n"
+ "fmax z22.h, p3/M, z22.h, z16.h\n"
+ "fmax z27.h, p3/M, z27.h, z16.h\n"
+ "fmla z9.h, p3/M, z4.h, z25.h\n"
+ "fmla z18.h, p3/M, z8.h, z12.h\n"
+ "fmax z9.h, p3/M, z9.h, z16.h\n"
+ "fmin z26.h, p3/M, z26.h, z19.h\n"
+ "fmla z21.h, p3/M, z7.h, z12.h\n"
+ "fmla z28.h, p3/M, z6.h, z12.h\n"
+ "ld1h { z10.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z19.h\n"
+ "fmla z14.h, p3/M, z1.h, z29.h\n"
+ "fmla z20.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmin z27.h, p3/M, z27.h, z19.h\n"
+ "fmla z24.h, p3/M, z2.h, z25.h\n"
+ "fmla z11.h, p3/M, z1.h, z25.h\n"
+ "fmin z9.h, p3/M, z9.h, z19.h\n"
+ "fmax z14.h, p3/M, z14.h, z16.h\n"
+ "fmla z23.h, p3/M, z7.h, z10.h\n"
+ "fmla z30.h, p3/M, z6.h, z10.h\n"
+ "fmax z20.h, p3/M, z20.h, z16.h\n"
+ "fmax z24.h, p3/M, z24.h, z16.h\n"
+ "fmla z31.h, p3/M, z8.h, z12.h\n"
+ "fmla z13.h, p3/M, z7.h, z12.h\n"
+ "fmax z11.h, p3/M, z11.h, z16.h\n"
+ "st1h { z26.h }, p1, [x12, x15, LSL #1]\n"
+ "st1h { z22.h }, p1, [x11, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z15.h, p3/M, z4.h, z10.h\n"
+ "st1h { z27.h }, p1, [x10, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z18.h, p3/M, z3.h, z10.h\n"
+ "fmla z21.h, p3/M, z5.h, z12.h\n"
+ "st1h { z9.h }, p1, [x9, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z28.h, p3/M, z4.h, z12.h\n"
+ "ldp x25, x24, [x8, #0x10]\n"
+ "fmin z14.h, p3/M, z14.h, z19.h\n"
+ "fmin z20.h, p3/M, z20.h, z19.h\n"
+ "st1h { z14.h }, p1, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z24.h, p3/M, z24.h, z19.h\n"
+ "fmin z11.h, p3/M, z11.h, z19.h\n"
+ "st1h { z20.h }, p1, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z23.h, p3/M, z23.h, z16.h\n"
+ "fmax z30.h, p3/M, z30.h, z16.h\n"
+ "st1h { z24.h }, p1, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z31.h, p3/M, z31.h, z16.h\n"
+ "fmax z13.h, p3/M, z13.h, z16.h\n"
+ "st1h { z11.h }, p1, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "inch x16\n"
+ "ld1h { z9.h }, p0/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z10.h }, p0/Z, [x26, x17, LSL #1]\n"
+ "fmin z23.h, p3/M, z23.h, z19.h\n"
+ "ld1h { z11.h }, p0/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z12.h }, p0/Z, [x24, x17, LSL #1]\n"
+ "inch x17\n"
+ "fmin z30.h, p3/M, z30.h, z19.h\n"
+ "fmin z31.h, p3/M, z31.h, z19.h\n"
+ "fmin z13.h, p3/M, z13.h, z19.h\n"
+ "st1h { z23.h }, p1, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmax z15.h, p3/M, z15.h, z16.h\n"
+ "fmax z18.h, p3/M, z18.h, z16.h\n"
+ "st1h { z30.h }, p1, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z21.h, p3/M, z21.h, z16.h\n"
+ "fmax z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z31.h }, p1, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1h { z13.h }, p1, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "ld1h { z0.h }, p3/Z, [x7, #1, MUL VL]\n"
+ "whilelt p2.h, x16, %x[n_channels]\n"
+ "ld1h { z1.h }, p3/Z, [x7, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x7, #3, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "fmin z15.h, p3/M, z15.h, z19.h\n"
+ "ld1h { z3.h }, p3/Z, [x7, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x7, #5, MUL VL]\n"
+ "fmin z18.h, p3/M, z18.h, z19.h\n"
+ "fmin z21.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z5.h }, p3/Z, [x7, #6, MUL VL]\n"
+ "ld1h { z6.h }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "fmin z28.h, p3/M, z28.h, z19.h\n"
+ "st1h { z15.h }, p1, [x23, x15, LSL #1]\n"
+ "ld1h { z7.h }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1h { z8.h }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "st1h { z18.h }, p1, [x22, x15, LSL #1]\n"
+ "st1h { z21.h }, p1, [x21, x15, LSL #1]\n"
+ "st1h { z28.h }, p1, [x20, x15, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z14, z17\n fmla z14.h, p3/M, z4.h, z9.h\n"
+ "movprfx z18, z17\n fmla z18.h, p3/M, z8.h, z9.h\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z15, z17\n fmla z15.h, p3/M, z3.h, z9.h\n"
+ "movprfx z30, z17\n fmla z30.h, p3/M, z1.h, z9.h\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z20, z17\n fmla z20.h, p3/M, z0.h, z9.h\n"
+ "movprfx z13, z17\n fmla z13.h, p3/M, z7.h, z9.h\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z22, z17\n fmla z22.h, p3/M, z6.h, z9.h\n"
+ "fmla z14.h, p3/M, z5.h, z12.h\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z27, z17\n fmla z27.h, p3/M, z5.h, z9.h\n"
+ "movprfx z31, z17\n fmla z31.h, p3/M, z2.h, z9.h\n"
+ "ld1h { z23.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z18.h, p3/M, z0.h, z10.h\n"
+ "movprfx z9, z17\n fmla z9.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z21.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ld1h { z25.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z15.h, p3/M, z4.h, z12.h\n"
+ "fmla z30.h, p3/M, z2.h, z12.h\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z20.h, p3/M, z1.h, z12.h\n"
+ "fmla z13.h, p3/M, z8.h, z12.h\n"
+ "inch x15\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.h, p3/M, z7.h, z12.h\n"
+ "movprfx z28, z17\n fmla z28.h, p3/M, z6.h, z21.h\n"
+ "ld1h { z29.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z14.h, p3/M, z7.h, z23.h\n"
+ "fmla z9.h, p3/M, z6.h, z12.h\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.h, p3/M, z3.h, z12.h\n"
+ "movprfx z10, z17\n fmla z10.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z12.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z26, z17\n fmla z26.h, p3/M, z8.h, z25.h\n"
+ "fmla z15.h, p3/M, z6.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.h, p3/M, z4.h, z23.h\n"
+ "fmla z20.h, p3/M, z3.h, z23.h\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z25, z17\n fmla z25.h, p3/M, z1.h, z23.h\n"
+ "movprfx z24, z17\n fmla z24.h, p3/M, z0.h, z23.h\n"
+ "fmla z27.h, p3/M, z8.h, z23.h\n"
+ "fmla z31.h, p3/M, z5.h, z23.h\n"
+ "fmla z28.h, p3/M, z2.h, z23.h\n"
+ "fmla z18.h, p3/M, z1.h, z12.h\n"
+ "ld1h { z23.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z13.h, p3/M, z0.h, z12.h\n"
+ "fmla z22.h, p3/M, z2.h, z21.h\n"
+ "ld1h { z12.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z14.h, p3/M, z8.h, z29.h\n"
+ "fmla z9.h, p3/M, z1.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z15.h, p3/M, z7.h, z29.h\n"
+ "fmla z11.h, p3/M, z6.h, z29.h\n"
+ "fmla z30.h, p3/M, z5.h, z29.h\n"
+ "fmla z20.h, p3/M, z4.h, z29.h\n"
+ "fmla z10.h, p3/M, z3.h, z29.h\n"
+ "fmla z25.h, p3/M, z2.h, z29.h\n"
+ "fmla z24.h, p3/M, z1.h, z29.h\n"
+ "fmla z26.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z18.h, p3/M, z3.h, z23.h\n"
+ "fmla z27.h, p3/M, z0.h, z23.h\n"
+ "fmla z31.h, p3/M, z6.h, z21.h\n"
+ "fmla z28.h, p3/M, z3.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x13, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z13.h, p3/M, z4.h, z29.h\n"
+ "fmla z22.h, p3/M, z3.h, z29.h\n"
+ "fmla z14.h, p3/M, z1.h, z29.h\n"
+ "fmla z9.h, p3/M, z5.h, z12.h\n"
+ "fmla z11.h, p3/M, z2.h, z12.h\n"
+ "fmla z15.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z10.h, p3/M, z8.h, z21.h\n"
+ "fmla z26.h, p3/M, z5.h, z21.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z18.h, p3/M, z5.h, z29.h\n"
+ "fmla z27.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z13.h, p3/M, z5.h, z17.h\n"
+ "fmla z22.h, p3/M, z4.h, z17.h\n"
+ "fmla z14.h, p3/M, z2.h, z17.h\n"
+ "fmla z9.h, p3/M, z3.h, z17.h\n"
+ "fmla z15.h, p3/M, z1.h, z17.h\n"
+ "fmla z11.h, p3/M, z0.h, z17.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z28.h, p3/M, z7.h, z23.h\n"
+ "fmla z25.h, p3/M, z6.h, z23.h\n"
+ "ld1h { z23.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z18.h, p3/M, z7.h, z21.h\n"
+ "fmla z13.h, p3/M, z6.h, z21.h\n"
+ "fmla z27.h, p3/M, z4.h, z21.h\n"
+ "fmla z14.h, p3/M, z3.h, z21.h\n"
+ "fmla z31.h, p3/M, z1.h, z21.h\n"
+ "fmla z30.h, p3/M, z0.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z22.h, p3/M, z8.h, z29.h\n"
+ "fmla z24.h, p3/M, z8.h, z23.h\n"
+ "fmla z26.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z23.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z1.h, z29.h\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.h, p3/M, z7.h, z29.h\n"
+ "fmla z15.h, p3/M, z5.h, z29.h\n"
+ "fmla z11.h, p3/M, z4.h, z29.h\n"
+ "fmla z20.h, p3/M, z2.h, z29.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z18.h, p3/M, z2.h, z21.h\n"
+ "fmla z13.h, p3/M, z1.h, z21.h\n"
+ "fmla z22.h, p3/M, z0.h, z21.h\n"
+ "fmla z27.h, p3/M, z7.h, z23.h\n"
+ "ld1h { z21.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z14.h, p3/M, z6.h, z23.h\n"
+ "fmla z31.h, p3/M, z4.h, z23.h\n"
+ "fmla z30.h, p3/M, z3.h, z23.h\n"
+ "fmla z28.h, p3/M, z1.h, z23.h\n"
+ "fmla z25.h, p3/M, z0.h, z23.h\n"
+ "ld1h { z17.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z4.h, z17.h\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z24.h, p3/M, z2.h, z17.h\n"
+ "fmla z13.h, p3/M, z2.h, z29.h\n"
+ "fmla z22.h, p3/M, z1.h, z29.h\n"
+ "fmla z9.h, p3/M, z0.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z18.h, p3/M, z6.h, z21.h\n"
+ "fmla z27.h, p3/M, z3.h, z21.h\n"
+ "fmla z31.h, p3/M, z0.h, z21.h\n"
+ "fmla z15.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.h, p3/M, z7.h, z17.h\n"
+ "fmla z20.h, p3/M, z5.h, z17.h\n"
+ "fmla z26.h, p3/M, z1.h, z17.h\n"
+ "ld1h { z21.h }, p2/Z, [x28, x16, LSL #1]\n"
+ "fmla z10.h, p3/M, z2.h, z23.h\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z28.h, p3/M, z0.h, z29.h\n"
+ "fmla z25.h, p3/M, z4.h, z21.h\n"
+ "fmla z24.h, p3/M, z3.h, z21.h\n"
+ "fmla z9.h, p3/M, z8.h, z23.h\n"
+ "fmla z11.h, p3/M, z5.h, z23.h\n"
+ "fmla z27.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x27, x16, LSL #1]\n"
+ "fmla z31.h, p3/M, z3.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x26, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z7.h, z21.h\n"
+ "fmla z20.h, p3/M, z6.h, z21.h\n"
+ "fmla z28.h, p3/M, z5.h, z21.h\n"
+ "fmla z10.h, p3/M, z5.h, z23.h\n"
+ "fmla z26.h, p3/M, z2.h, z23.h\n"
+ "fmla z25.h, p3/M, z7.h, z17.h\n"
+ "fmla z24.h, p3/M, z6.h, z17.h\n"
+ "fmla z31.h, p3/M, z8.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x25, x16, LSL #1]\n"
+ "fmla z28.h, p3/M, z8.h, z17.h\n"
+ "ld1h { z12.h }, p2/Z, [x23, x16, LSL #1]\n"
+ "fmla z30.h, p3/M, z8.h, z21.h\n"
+ "fmla z20.h, p3/M, z7.h, z21.h\n"
+ "fmla z10.h, p3/M, z6.h, z21.h\n"
+ "fmla z25.h, p3/M, z5.h, z21.h\n"
+ "fmla z24.h, p3/M, z4.h, z21.h\n"
+ "fmla z26.h, p3/M, z3.h, z21.h\n"
+ "ld1h { z21.h }, p2/Z, [x22, x16, LSL #1]\n"
+ "fmla z11.h, p3/M, z8.h, z23.h\n"
+ "ld1h { z29.h }, p2/Z, [x24, x16, LSL #1]\n"
+ "fmla z18.h, p3/M, z4.h, z12.h\n"
+ "fmax z18.h, p3/M, z18.h, z16.h\n"
+ "fmla z13.h, p3/M, z3.h, z12.h\n"
+ "fmla z22.h, p3/M, z5.h, z21.h\n"
+ "fmax z13.h, p3/M, z13.h, z16.h\n"
+ "fmax z22.h, p3/M, z22.h, z16.h\n"
+ "fmla z9.h, p3/M, z4.h, z21.h\n"
+ "fmla z25.h, p3/M, z8.h, z29.h\n"
+ "fmax z9.h, p3/M, z9.h, z16.h\n"
+ "fmin z18.h, p3/M, z18.h, z19.h\n"
+ "fmla z24.h, p3/M, z7.h, z29.h\n"
+ "fmla z26.h, p3/M, z6.h, z29.h\n"
+ "ld1h { z23.h }, p2/Z, [x21, x16, LSL #1]\n"
+ "fmin z13.h, p3/M, z13.h, z19.h\n"
+ "fmla z27.h, p3/M, z1.h, z12.h\n"
+ "fmla z14.h, p3/M, z0.h, z12.h\n"
+ "ld1h { z29.h }, p2/Z, [x20, x16, LSL #1]\n"
+ "fmin z22.h, p3/M, z22.h, z19.h\n"
+ "fmla z15.h, p3/M, z2.h, z21.h\n"
+ "fmla z11.h, p3/M, z1.h, z21.h\n"
+ "fmin z9.h, p3/M, z9.h, z19.h\n"
+ "fmax z27.h, p3/M, z27.h, z16.h\n"
+ "fmla z31.h, p3/M, z7.h, z23.h\n"
+ "fmla z30.h, p3/M, z6.h, z23.h\n"
+ "fmax z14.h, p3/M, z14.h, z16.h\n"
+ "fmax z15.h, p3/M, z15.h, z16.h\n"
+ "fmla z20.h, p3/M, z8.h, z29.h\n"
+ "fmla z10.h, p3/M, z7.h, z29.h\n"
+ "fmax z11.h, p3/M, z11.h, z16.h\n"
+ "st1h { z18.h }, p0, [x12, x15, LSL #1]\n"
+ "st1h { z13.h }, p0, [x11, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z28.h, p3/M, z4.h, z23.h\n"
+ "st1h { z22.h }, p0, [x10, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z25.h, p3/M, z3.h, z23.h\n"
+ "fmla z24.h, p3/M, z5.h, z29.h\n"
+ "st1h { z9.h }, p0, [x9, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z26.h, p3/M, z4.h, z29.h\n"
+ "fmin z27.h, p3/M, z27.h, z19.h\n"
+ "fmin z14.h, p3/M, z14.h, z19.h\n"
+ "fmin z15.h, p3/M, z15.h, z19.h\n"
+ "st1h { z27.h }, p0, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z11.h, p3/M, z11.h, z19.h\n"
+ "fmax z31.h, p3/M, z31.h, z16.h\n"
+ "st1h { z14.h }, p0, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z30.h, p3/M, z30.h, z16.h\n"
+ "fmax z20.h, p3/M, z20.h, z16.h\n"
+ "st1h { z15.h }, p0, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z10.h, p3/M, z10.h, z16.h\n"
+ "st1h { z11.h }, p0, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "fmin z31.h, p3/M, z31.h, z19.h\n"
+ "fmin z30.h, p3/M, z30.h, z19.h\n"
+ "fmin z20.h, p3/M, z20.h, z19.h\n"
+ "st1h { z31.h }, p0, [x23, x15, LSL #1]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmin z10.h, p3/M, z10.h, z19.h\n"
+ "fmax z28.h, p3/M, z28.h, z16.h\n"
+ "st1h { z30.h }, p0, [x22, x15, LSL #1]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z25.h, p3/M, z25.h, z16.h\n"
+ "fmax z24.h, p3/M, z24.h, z16.h\n"
+ "st1h { z20.h }, p0, [x21, x15, LSL #1]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "fmax z26.h, p3/M, z26.h, z16.h\n"
+ "st1h { z10.h }, p0, [x20, x15, LSL #1]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "fmin z28.h, p3/M, z28.h, z19.h\n"
+ "fmin z25.h, p3/M, z25.h, z19.h\n"
+ "fmin z24.h, p3/M, z24.h, z19.h\n"
+ "st1h { z28.h }, p0, [x23, x15, LSL #1]\n"
+ "fmin z26.h, p3/M, z26.h, z19.h\n"
+ "st1h { z25.h }, p0, [x22, x15, LSL #1]\n"
+ "st1h { z24.h }, p0, [x21, x15, LSL #1]\n"
+ "st1h { z26.h }, p0, [x20, x15, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..d8a25666bd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..58decdba1c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct (tiled) entry point for the SVE fp16 NHWC 3x3 stride-2 2x2-output
+// depthwise kernel.  The C++ below only marshals the arguments into a flat
+// `Args` struct; the tile loop, channel loop and all arithmetic live in the
+// inline assembly, which addresses the struct fields through compile-time
+// offsetof() constants (the "I" operands at the bottom).
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const __fp16 *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  __fp16 *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  // POD argument block: every field is read (and tile_i/tile_j also written)
+  // by the asm via offsetof(), so the member order must stay in sync with the
+  // operand list of the asm statement below.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const __fp16 *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    __fp16 *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const __fp16 min, max;
+
+    // Loop state: the asm stores the current tile coordinates here at the top
+    // of each tile iteration and reloads/advances them in the channel tail.
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      // NOTE(review): these two parameters are `float` while the stored
+      // members (and the enclosing function's parameters) are __fp16.  The
+      // __fp16 -> float -> __fp16 round trip is value-preserving, so this is
+      // harmless, but it is inconsistent with the rest of the signature.
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  // Register roles (as used below): z30 seeds each output accumulator via
+  // movprfx; z0-z8 hold the nine 3x3 kernel values, reloaded from the params
+  // stream (x11) every channel step; z29/z28 are the broadcast activation
+  // min/max applied with fmax/fmin before each store.  p2/p1 are the whilelt
+  // channel predicates, p0 the store predicate.  Label 1 is the tile loop,
+  // label 2 the per-tile channel loop, label 3 the channel tail that also
+  // advances (tile_i, tile_j) and branches back to label 1.
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x11, #0x0\n"
+    "mov x16, #0x0\n"
+    "1:"  // Tile loop
+    "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x25, #0x4\n"
+    "mov x24, #0x2\n"
+    "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "mul x22, x11, x23\n"  // offset = tile_i * ld_input_row
+    "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "madd x22, x16, x15, x22\n"  // offset += tile_j * ld_input_col
+    "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "cnth x13\n"
+    "mul x20, x11, x21\n"  // offset = tile_i * ld_output_row
+    "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x10, x15, x15\n"
+    "mul x22, x22, x25\n"  // offset *= kernel_stride * output_size
+    "add x12, x12, x22, LSL #1\n"  // inptr[0] += offset * sizeof(__fp16)
+    "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "add x28, x12, x23, LSL #1\n"
+    "madd x20, x16, x14, x20\n"  // offset += tile_j * ld_output_col
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z30.h }, p3/Z, [x11]\n"
+    "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
+    "mul x20, x20, x24\n"  // offset *= output_tile_size
+    "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
+    "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
+    "add x27, x28, x23, LSL #1\n"
+    "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
+    "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
+    "add x26, x10, x15\n"
+    "add x25, x27, x23, LSL #1\n"
+    "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
+    "ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
+    "addvl x11, x11, #16\n"
+    "add x24, x26, x15\n"
+    "add x9, x9, x20, LSL #1\n"  // outptrs[0] += offset * sizeof(__fp16)
+    "cmp x13, %x[n_channels]\n"
+    "ld1rh { z29.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "add x23, x25, x23, LSL #1\n"
+    "add x22, x9, x21, LSL #1\n"
+    "ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
+    "ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
+    "mov x21, #0x0\n"
+    "sub x20, XZR, x13\n"
+    "ld1h { z9.h }, p2/Z, [x27, x10, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x12]\n"
+    "ld1h { z11.h }, p2/Z, [x12, x15, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x12, x26, LSL #1]\n"
+    "addvl x11, x11, #-6\n"
+    "ld1h { z13.h }, p2/Z, [x12, x24, LSL #1]\n"
+    "ld1h { z14.h }, p2/Z, [x28]\n"
+    "ld1h { z15.h }, p2/Z, [x28, x15, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x12, x10, LSL #1]\n"
+    "bge 3f\n"
+    "2:"  // Tile loop: Channel loop
+    "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
+    "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
+    "whilelt p1.h, x13, %x[n_channels]\n"
+    "inch x21\n"
+    "fmla z27.h, p3/M, z0.h, z10.h\n"
+    "fmla z26.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z20.h }, p2/Z, [x28, x24, LSL #1]\n"
+    "inch x13\n"
+    "fmla z27.h, p3/M, z1.h, z11.h\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x28, x26, LSL #1]\n"
+    "ld1h { z19.h }, p2/Z, [x28, x10, LSL #1]\n"
+    "fmla z27.h, p3/M, z3.h, z14.h\n"
+    "fmla z26.h, p3/M, z0.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x25]\n"
+    "mov p0.b, p2.b\n"
+    "fmla z27.h, p3/M, z4.h, z15.h\n"
+    "fmla z26.h, p3/M, z4.h, z17.h\n"
+    "ld1h { z25.h }, p2/Z, [x27]\n"
+    "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z2.h, z16.h\n"
+    "fmla z26.h, p3/M, z5.h, z20.h\n"
+    "ld1h { z24.h }, p2/Z, [x27, x26, LSL #1]\n"
+    "ld1h { z23.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "movprfx z22, z30\n fmla z22.h, p3/M, z2.h, z9.h\n"
+    "movprfx z21, z30\n fmla z21.h, p3/M, z0.h, z9.h\n"
+    "addvl x12, x12, #1\n"
+    "addvl x28, x28, #1\n"
+    "fmla z27.h, p3/M, z5.h, z19.h\n"
+    "fmla z26.h, p3/M, z3.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x25, x26, LSL #1]\n"
+    "ld1h { z30.h }, p3/Z, [x11]\n"
+    "fmla z22.h, p3/M, z3.h, z18.h\n"
+    "fmla z21.h, p3/M, z4.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x25, x24, LSL #1]\n"
+    "ld1h { z20.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "fmla z22.h, p3/M, z0.h, z25.h\n"
+    "fmla z21.h, p3/M, z1.h, z24.h\n"
+    "ld1h { z0.h }, p3/Z, [x11, #1, MUL VL]\n"
+    "inch x20\n"
+    "fmla z22.h, p3/M, z4.h, z17.h\n"
+    "fmla z21.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z19.h }, p2/Z, [x27, x24, LSL #1]\n"
+    "ld1h { z18.h }, p2/Z, [x23, x26, LSL #1]\n"
+    "fmla z27.h, p3/M, z6.h, z25.h\n"
+    "fmla z22.h, p3/M, z1.h, z23.h\n"
+    "ld1h { z17.h }, p2/Z, [x23]\n"
+    "addvl x27, x27, #1\n"
+    "fmla z21.h, p3/M, z2.h, z19.h\n"
+    "fmla z27.h, p3/M, z7.h, z23.h\n"
+    "ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
+    "fmax z27.h, p3/M, z27.h, z29.h\n"
+    "fmla z22.h, p3/M, z6.h, z17.h\n"
+    "fmla z21.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x23, x10, LSL #1]\n"
+    "ld1h { z1.h }, p3/Z, [x11, #2, MUL VL]\n"
+    "fmla z22.h, p3/M, z7.h, z20.h\n"
+    "fmla z21.h, p3/M, z7.h, z18.h\n"
+    "ld1h { z2.h }, p3/Z, [x11, #3, MUL VL]\n"
+    "ld1h { z3.h }, p3/Z, [x11, #4, MUL VL]\n"
+    "fmla z26.h, p3/M, z7.h, z24.h\n"
+    "fmla z22.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z4.h }, p3/Z, [x11, #5, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x11, #6, MUL VL]\n"
+    "fmla z21.h, p3/M, z6.h, z17.h\n"
+    "fmla z26.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+    "fmax z26.h, p3/M, z26.h, z29.h\n"
+    "fmla z22.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z16.h\n"
+    "fmax z22.h, p3/M, z22.h, z29.h\n"
+    "fmax z21.h, p3/M, z21.h, z29.h\n"
+    "ld1h { z6.h }, p3/Z, [x11, #7, MUL VL]\n"
+    "addvl x11, x11, #16\n"
+    "whilelt p2.h, x21, %x[n_channels]\n"
+    "ld1h { z9.h }, p1/Z, [x27, x10, LSL #1]\n"
+    "cmp x13, %x[n_channels]\n"
+    "fmin z27.h, p3/M, z27.h, z28.h\n"
+    "ld1h { z10.h }, p1/Z, [x12]\n"
+    "ld1h { z11.h }, p1/Z, [x12, x15, LSL #1]\n"
+    "fmin z26.h, p3/M, z26.h, z28.h\n"
+    "fmin z22.h, p3/M, z22.h, z28.h\n"
+    "ld1h { z12.h }, p1/Z, [x12, x26, LSL #1]\n"
+    "ld1h { z13.h }, p1/Z, [x12, x24, LSL #1]\n"
+    "fmin z21.h, p3/M, z21.h, z28.h\n"
+    "addvl x25, x25, #1\n"
+    "ld1h { z14.h }, p1/Z, [x28]\n"
+    "ld1h { z15.h }, p1/Z, [x28, x15, LSL #1]\n"
+    "addvl x23, x23, #1\n"
+    "ld1h { z16.h }, p1/Z, [x12, x10, LSL #1]\n"
+    "st1h { z27.h }, p0, [x9]\n"
+    "ld1h { z7.h }, p3/Z, [x11, #-8, MUL VL]\n"
+    "st1h { z26.h }, p0, [x9, x14, LSL #1]\n"
+    "addvl x9, x9, #1\n"
+    "ld1h { z8.h }, p3/Z, [x11, #-7, MUL VL]\n"
+    "addvl x11, x11, #-6\n"
+    "st1h { z22.h }, p0, [x22]\n"
+    "st1h { z21.h }, p0, [x22, x14, LSL #1]\n"
+    "addvl x22, x22, #1\n"
+    "blt 2b\n"
+    "3:"  // Tile loop: Channel tail
+    "movprfx z27, z30\n fmla z27.h, p3/M, z8.h, z9.h\n"
+    "movprfx z26, z30\n fmla z26.h, p3/M, z6.h, z9.h\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "fmla z27.h, p3/M, z0.h, z10.h\n"
+    "fmla z26.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z20.h }, p2/Z, [x28, x24, LSL #1]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "fmla z27.h, p3/M, z1.h, z11.h\n"
+    "fmla z26.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x28, x26, LSL #1]\n"
+    "ld1h { z19.h }, p2/Z, [x28, x10, LSL #1]\n"
+    "fmla z27.h, p3/M, z3.h, z14.h\n"
+    "fmla z26.h, p3/M, z0.h, z16.h\n"
+    "ld1h { z18.h }, p2/Z, [x25]\n"
+    "add x16, x16, #0x1\n"
+    "fmla z27.h, p3/M, z4.h, z15.h\n"
+    "fmla z26.h, p3/M, z4.h, z17.h\n"
+    "ld1h { z25.h }, p2/Z, [x27]\n"
+    "ld1h { z17.h }, p2/Z, [x25, x15, LSL #1]\n"
+    "fmla z27.h, p3/M, z2.h, z16.h\n"
+    "fmla z26.h, p3/M, z5.h, z20.h\n"
+    "ld1h { z24.h }, p2/Z, [x27, x26, LSL #1]\n"
+    "ld1h { z23.h }, p2/Z, [x27, x15, LSL #1]\n"
+    "movprfx z22, z30\n fmla z22.h, p3/M, z2.h, z9.h\n"
+    "movprfx z21, z30\n fmla z21.h, p3/M, z0.h, z9.h\n"
+    "cmp x16, x20\n"
+    "add x21, x11, #0x1\n"
+    "fmla z27.h, p3/M, z5.h, z19.h\n"
+    "fmla z26.h, p3/M, z3.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x25, x26, LSL #1]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z22.h, p3/M, z3.h, z18.h\n"
+    "fmla z21.h, p3/M, z4.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x25, x24, LSL #1]\n"
+    "ld1h { z20.h }, p2/Z, [x23, x15, LSL #1]\n"
+    "fmla z22.h, p3/M, z0.h, z25.h\n"
+    "fmla z21.h, p3/M, z1.h, z24.h\n"
+    "csel x11, x11, x21, LT\n"
+    "mov p0.b, p2.b\n"
+    "fmla z22.h, p3/M, z4.h, z17.h\n"
+    "fmla z21.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z19.h }, p2/Z, [x27, x24, LSL #1]\n"
+    "ld1h { z18.h }, p2/Z, [x23, x26, LSL #1]\n"
+    "fmla z27.h, p3/M, z6.h, z25.h\n"
+    "fmla z22.h, p3/M, z1.h, z23.h\n"
+    "ld1h { z17.h }, p2/Z, [x23]\n"
+    "csel x16, x16, XZR, LT\n"
+    "fmla z21.h, p3/M, z2.h, z19.h\n"
+    "fmla z27.h, p3/M, z7.h, z23.h\n"
+    "ld1h { z16.h }, p2/Z, [x25, x10, LSL #1]\n"
+    "fmax z27.h, p3/M, z27.h, z29.h\n"
+    "fmla z22.h, p3/M, z6.h, z17.h\n"
+    "fmla z21.h, p3/M, z3.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x23, x10, LSL #1]\n"
+    "cmp x11, x20\n"
+    "fmla z22.h, p3/M, z7.h, z20.h\n"
+    "fmla z21.h, p3/M, z7.h, z18.h\n"
+    "fmin z27.h, p3/M, z27.h, z28.h\n"
+    "st1h { z27.h }, p0, [x9]\n"
+    "fmla z26.h, p3/M, z7.h, z24.h\n"
+    "fmla z22.h, p3/M, z5.h, z16.h\n"
+    "fmla z21.h, p3/M, z6.h, z17.h\n"
+    "fmla z26.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+    "fmax z26.h, p3/M, z26.h, z29.h\n"
+    "fmla z22.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z16.h\n"
+    "fmax z22.h, p3/M, z22.h, z29.h\n"
+    "fmax z21.h, p3/M, z21.h, z29.h\n"
+    "fmin z26.h, p3/M, z26.h, z28.h\n"
+    "fmin z22.h, p3/M, z22.h, z28.h\n"
+    "st1h { z26.h }, p0, [x9, x14, LSL #1]\n"
+    "fmin z21.h, p3/M, z21.h, z28.h\n"
+    "st1h { z22.h }, p0, [x22]\n"
+    "st1h { z21.h }, p0, [x22, x14, LSL #1]\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..d5fbb6baee
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Indirect-addressing variant of the fp16 NHWC depthwise kernel (3x3
+// kernel, stride 2, 2x2 output block, SVE — per the function name): the
+// caller supplies 25 input pointers (one per element of the 5x5 input
+// patch such an output block reads) and 4 output pointers, so no strides
+// are required.  Results are clamped to [activation_min, activation_max].
+void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+  const __fp16 *const *const input_ptrs,
+  __fp16 *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const __fp16 activation_min,
+  const __fp16 activation_max
+)
+{
+  // Argument block read by the inline assembly through the
+  // %[offsetof_...] operands below.
+  struct Args
+  {
+    __fp16 *const *outptrs;
+    const void *params;
+    const __fp16 min, max;
+    // Permuted copy of input_ptrs: rearranged into the order in which
+    // the assembly consumes them (it reads inptrs[] sequentially at
+    // offsets 0x0..0xc0 from x15).
+    const __fp16 *inptrs[25];
+
+    Args(
+      const __fp16 *const *const input_ptrs,
+      __fp16 *const *const outptrs,
+      const void *const params,
+      const __fp16 min,
+      const __fp16 max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      inptrs[0] = input_ptrs[12];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[1];
+      inptrs[3] = input_ptrs[3];
+      inptrs[4] = input_ptrs[4];
+      inptrs[5] = input_ptrs[5];
+      inptrs[6] = input_ptrs[6];
+      inptrs[7] = input_ptrs[2];
+      inptrs[8] = input_ptrs[8];
+      inptrs[9] = input_ptrs[9];
+      inptrs[10] = input_ptrs[7];
+      inptrs[11] = input_ptrs[15];
+      inptrs[12] = input_ptrs[10];
+      inptrs[13] = input_ptrs[16];
+      inptrs[14] = input_ptrs[11];
+      inptrs[15] = input_ptrs[18];
+      inptrs[16] = input_ptrs[13];
+      inptrs[17] = input_ptrs[19];
+      inptrs[18] = input_ptrs[20];
+      inptrs[19] = input_ptrs[14];
+      inptrs[20] = input_ptrs[21];
+      inptrs[21] = input_ptrs[17];
+      inptrs[22] = input_ptrs[23];
+      inptrs[23] = input_ptrs[22];
+      inptrs[24] = input_ptrs[24];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  // Vector-length-agnostic SVE loop over channels.  z20 seeds each
+  // accumulator (the first value in the params blob — presumably the
+  // bias), z0-z8 hold the nine weights; p2/p1 predicate the current and
+  // next partial channel vector, and z26/z25 broadcast the activation
+  // min/max used for the final fmax/fmin clamp.
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "cnth x14\n"
+    "ldp x13, x12, [x20, #0x0]\n"
+    "ldp x11, x10, [x20, #0x10]\n"
+    "mov x9, #0x0\n"
+    "whilelt p2.h, XZR, %x[n_channels]\n"
+    "ld1h { z20.h }, p3/Z, [x16]\n"
+    "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+    "cmp x14, %x[n_channels]\n"
+    "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+    "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+    "sub x28, XZR, x14\n"
+    "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+    "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+    "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+    "addvl x16, x16, #16\n"
+    "ldp x27, x26, [x15, #0x0]\n"
+    "ldp x25, x24, [x15, #0x10]\n"
+    "ldp x23, x22, [x15, #0x20]\n"
+    "ldp x21, x20, [x15, #0x30]\n"
+    "ld1rh { z26.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "ld1rh { z25.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+    "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+    "addvl x16, x16, #-6\n"
+    "ld1h { z9.h }, p2/Z, [x27, x9, LSL #1]\n"
+    "ld1h { z10.h }, p2/Z, [x26, x9, LSL #1]\n"
+    "ld1h { z11.h }, p2/Z, [x25, x9, LSL #1]\n"
+    "ld1h { z12.h }, p2/Z, [x24, x9, LSL #1]\n"
+    "ld1h { z13.h }, p2/Z, [x23, x9, LSL #1]\n"
+    "ld1h { z14.h }, p2/Z, [x22, x9, LSL #1]\n"
+    "ld1h { z15.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "bge 2f\n"
+    "1:" // Channel loop
+    "movprfx z24, z20\n fmla z24.h, p3/M, z8.h, z9.h\n"
+    "movprfx z23, z20\n fmla z23.h, p3/M, z6.h, z9.h\n"
+    "ldr x21, [x15, #0x40]\n"
+    "ldr x20, [x15, #0x48]\n"
+    "fmla z24.h, p3/M, z0.h, z10.h\n"
+    "fmla z23.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x20, [x15, #0x50]\n"
+    "fmla z24.h, p3/M, z1.h, z11.h\n"
+    "fmla z23.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z24.h, p3/M, z3.h, z14.h\n"
+    "fmla z23.h, p3/M, z0.h, z16.h\n"
+    "ldr x20, [x15, #0x58]\n"
+    "ldr x22, [x15, #0x78]\n"
+    "fmla z24.h, p3/M, z4.h, z15.h\n"
+    "fmla z23.h, p3/M, z4.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x21, [x15, #0x60]\n"
+    "fmla z24.h, p3/M, z2.h, z16.h\n"
+    "fmla z23.h, p3/M, z5.h, z18.h\n"
+    "ldr x20, [x15, #0x80]\n"
+    "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "movprfx z22, z20\n fmla z22.h, p3/M, z2.h, z9.h\n"
+    "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z20.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x21, [x15, #0x68]\n"
+    "fmla z24.h, p3/M, z5.h, z19.h\n"
+    "fmla z23.h, p3/M, z3.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+    "ldr x20, [x15, #0x88]\n"
+    "fmla z22.h, p3/M, z3.h, z17.h\n"
+    "fmla z21.h, p3/M, z4.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z0.h, z18.h\n"
+    "fmla z21.h, p3/M, z1.h, z20.h\n"
+    "ldr x21, [x15, #0x70]\n"
+    "ldr x20, [x15, #0x98]\n"
+    "fmla z22.h, p3/M, z4.h, z17.h\n"
+    "fmla z21.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z24.h, p3/M, z6.h, z18.h\n"
+    "ldr x21, [x15, #0x90]\n"
+    "ldr x20, [x15, #0xa8]\n"
+    "fmla z22.h, p3/M, z1.h, z16.h\n"
+    "fmla z21.h, p3/M, z2.h, z19.h\n"
+    "fmla z24.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x21, [x15, #0xa0]\n"
+    "ldr x20, [x15, #0xb0]\n"
+    "fmla z22.h, p3/M, z6.h, z16.h\n"
+    "fmla z21.h, p3/M, z3.h, z18.h\n"
+    "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z7.h, z17.h\n"
+    "fmla z21.h, p3/M, z7.h, z16.h\n"
+    "ldr x20, [x15, #0xb8]\n"
+    "fmla z23.h, p3/M, z7.h, z20.h\n"
+    "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z5.h, z18.h\n"
+    "ldr x20, [x15, #0xc0]\n"
+    "fmla z21.h, p3/M, z6.h, z17.h\n"
+    "fmla z23.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z16.h\n"
+    "whilelt p1.h, x14, %x[n_channels]\n"
+    "ldp x27, x26, [x15, #0x0]\n"
+    "ldp x25, x24, [x15, #0x10]\n"
+    "ldp x23, x22, [x15, #0x20]\n"
+    "inch x9\n"
+    "fmax z24.h, p3/M, z24.h, z26.h\n"
+    "ldp x21, x20, [x15, #0x30]\n"
+    "ld1h { z9.h }, p1/Z, [x27, x14, LSL #1]\n"
+    "fmax z23.h, p3/M, z23.h, z26.h\n"
+    "fmax z22.h, p3/M, z22.h, z26.h\n"
+    "ld1h { z10.h }, p1/Z, [x26, x14, LSL #1]\n"
+    "ld1h { z11.h }, p1/Z, [x25, x14, LSL #1]\n"
+    "fmax z21.h, p3/M, z21.h, z26.h\n"
+    "inch x28\n"
+    "ld1h { z12.h }, p1/Z, [x24, x14, LSL #1]\n"
+    "ld1h { z13.h }, p1/Z, [x23, x14, LSL #1]\n"
+    "mov p0.b, p2.b\n"
+    "whilelt p2.h, x9, %x[n_channels]\n"
+    "ld1h { z14.h }, p1/Z, [x22, x14, LSL #1]\n"
+    "ld1h { z15.h }, p1/Z, [x21, x14, LSL #1]\n"
+    "fmin z24.h, p3/M, z24.h, z25.h\n"
+    "fmin z23.h, p3/M, z23.h, z25.h\n"
+    "ld1h { z16.h }, p1/Z, [x20, x14, LSL #1]\n"
+    "inch x14\n"
+    "ld1h { z20.h }, p3/Z, [x16]\n"
+    "cmp x14, %x[n_channels]\n"
+    "ld1h { z0.h }, p3/Z, [x16, #1, MUL VL]\n"
+    "ld1h { z1.h }, p3/Z, [x16, #2, MUL VL]\n"
+    "fmin z22.h, p3/M, z22.h, z25.h\n"
+    "fmin z21.h, p3/M, z21.h, z25.h\n"
+    "ld1h { z2.h }, p3/Z, [x16, #3, MUL VL]\n"
+    "ld1h { z3.h }, p3/Z, [x16, #4, MUL VL]\n"
+    "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+    "ld1h { z4.h }, p3/Z, [x16, #5, MUL VL]\n"
+    "ld1h { z5.h }, p3/Z, [x16, #6, MUL VL]\n"
+    "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+    "ld1h { z6.h }, p3/Z, [x16, #7, MUL VL]\n"
+    "addvl x16, x16, #16\n"
+    "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+    "ld1h { z7.h }, p3/Z, [x16, #-8, MUL VL]\n"
+    "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+    "ld1h { z8.h }, p3/Z, [x16, #-7, MUL VL]\n"
+    "addvl x16, x16, #-6\n"
+    "blt 1b\n"
+    "2:" // Channel tail
+    "movprfx z24, z20\n fmla z24.h, p3/M, z8.h, z9.h\n"
+    "movprfx z23, z20\n fmla z23.h, p3/M, z6.h, z9.h\n"
+    "ldr x21, [x15, #0x40]\n"
+    "ldr x20, [x15, #0x48]\n"
+    "fmla z24.h, p3/M, z0.h, z10.h\n"
+    "fmla z23.h, p3/M, z1.h, z12.h\n"
+    "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x20, [x15, #0x50]\n"
+    "fmla z24.h, p3/M, z1.h, z11.h\n"
+    "fmla z23.h, p3/M, z2.h, z13.h\n"
+    "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z24.h, p3/M, z3.h, z14.h\n"
+    "fmla z23.h, p3/M, z0.h, z16.h\n"
+    "ldr x20, [x15, #0x58]\n"
+    "ldr x22, [x15, #0x78]\n"
+    "fmla z24.h, p3/M, z4.h, z15.h\n"
+    "fmla z23.h, p3/M, z4.h, z17.h\n"
+    "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x21, [x15, #0x60]\n"
+    "fmla z24.h, p3/M, z2.h, z16.h\n"
+    "fmla z23.h, p3/M, z5.h, z18.h\n"
+    "ldr x20, [x15, #0x80]\n"
+    "ld1h { z18.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "movprfx z22, z20\n fmla z22.h, p3/M, z2.h, z9.h\n"
+    "movprfx z21, z20\n fmla z21.h, p3/M, z0.h, z9.h\n"
+    "ld1h { z20.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x21, [x15, #0x68]\n"
+    "fmla z24.h, p3/M, z5.h, z19.h\n"
+    "fmla z23.h, p3/M, z3.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x22, x9, LSL #1]\n"
+    "ldr x20, [x15, #0x88]\n"
+    "fmla z22.h, p3/M, z3.h, z17.h\n"
+    "fmla z21.h, p3/M, z4.h, z16.h\n"
+    "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z0.h, z18.h\n"
+    "fmla z21.h, p3/M, z1.h, z20.h\n"
+    "ldr x21, [x15, #0x70]\n"
+    "ldr x20, [x15, #0x98]\n"
+    "fmla z22.h, p3/M, z4.h, z17.h\n"
+    "fmla z21.h, p3/M, z5.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z19.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z24.h, p3/M, z6.h, z18.h\n"
+    "ldr x21, [x15, #0x90]\n"
+    "ldr x20, [x15, #0xa8]\n"
+    "fmla z22.h, p3/M, z1.h, z16.h\n"
+    "fmla z21.h, p3/M, z2.h, z19.h\n"
+    "fmla z24.h, p3/M, z7.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z18.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "ldr x21, [x15, #0xa0]\n"
+    "ldr x20, [x15, #0xb0]\n"
+    "fmla z22.h, p3/M, z6.h, z16.h\n"
+    "fmla z21.h, p3/M, z3.h, z18.h\n"
+    "ld1h { z17.h }, p2/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z7.h, z17.h\n"
+    "fmla z21.h, p3/M, z7.h, z16.h\n"
+    "ldr x20, [x15, #0xb8]\n"
+    "fmla z23.h, p3/M, z7.h, z20.h\n"
+    "ld1h { z17.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z5.h, z18.h\n"
+    "ldr x20, [x15, #0xc0]\n"
+    "fmla z21.h, p3/M, z6.h, z17.h\n"
+    "fmla z23.h, p3/M, z8.h, z19.h\n"
+    "ld1h { z16.h }, p2/Z, [x20, x9, LSL #1]\n"
+    "fmla z22.h, p3/M, z8.h, z17.h\n"
+    "fmla z21.h, p3/M, z8.h, z16.h\n"
+    "inch x28\n"
+    "mov p0.b, p2.b\n"
+    "fmax z24.h, p3/M, z24.h, z26.h\n"
+    "fmax z23.h, p3/M, z23.h, z26.h\n"
+    "fmax z22.h, p3/M, z22.h, z26.h\n"
+    "fmax z21.h, p3/M, z21.h, z26.h\n"
+    "fmin z24.h, p3/M, z24.h, z25.h\n"
+    "fmin z23.h, p3/M, z23.h, z25.h\n"
+    "st1h { z24.h }, p0, [x13, x28, LSL #1]\n"
+    "fmin z22.h, p3/M, z22.h, z25.h\n"
+    "fmin z21.h, p3/M, z21.h, z25.h\n"
+    "st1h { z23.h }, p0, [x12, x28, LSL #1]\n"
+    "st1h { z22.h }, p0, [x11, x28, LSL #1]\n"
+    "st1h { z21.h }, p0, [x10, x28, LSL #1]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..abdfac5a3f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const input_ptrs, __fp16 *const *const outptrs, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col, __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const __fp16 activation_min, const __fp16 activation_max);
+
+// Strategy descriptor for the fp16 NHWC 5x5 stride-1 depthwise kernel
+// (SVE, 2x2 output block).  It exposes the kernel geometry as constants
+// and hands the direct/indirect implementation functions declared above
+// to the depthwise framework via the Parent interface.
+class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+  // Function pointers to the generated kernel bodies.
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = __fp16;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+  // Kernel geometry: 5x5 window, unit stride, 2x2 outputs per call.
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;
+  constexpr static unsigned int output_cols = 2;
+
+  // CPUInfo is unused here; the parameter keeps the constructor signature
+  // uniform across strategy classes.
+  sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..fdbee67926
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+  // Argument block read by the inline assembly of the enclosing kernel
+  // through its %[offsetof_...] operands.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;  // output tile grid dimensions
+    const __fp16 *inptr;                      // base input pointer
+    const uint64_t ld_input_row;              // strides in elements (the asm
+    const uint64_t ld_input_col;              // scales them by LSL #1)
+    __fp16 *outptr;                           // base output pointer
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;                       // opaque packed-parameter blob
+    const __fp16 min, max;                    // activation clamp bounds
+
+    // Tile cursor: stored/reloaded by the assembly across the tile loop.
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const __fp16 *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      __fp16 *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      // Take the clamp bounds as __fp16 — the type of both the members
+      // and the enclosing function's activation arguments — instead of
+      // float.  This avoids a redundant fp16 -> float -> fp16 round-trip
+      // and matches the Args blocks of the sibling fp16 kernels.
+      const __fp16 activation_min,
+      const __fp16 activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+      ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+      ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+      params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x12, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x12, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "add x15, x17, x17\n"
+ "mul x20, x12, x21\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cnth x12\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "add x11, x14, x23, LSL #1\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x11, x23, LSL #1\n"
+ "add x28, x15, x17\n"
+ "ld1rh { z15.h }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "whilelt p2.h, XZR, %x[n_channels]\n"
+ "add x27, x9, x23, LSL #1\n"
+ "ld1rh { z28.h }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x26, x28, x17\n"
+ "add x25, x27, x23, LSL #1\n"
+ "ld1h { z29.h }, p3/Z, [x10]\n"
+ "ld1h { z0.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x24, x26, x17\n"
+ "add x13, x13, x20, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "ld1h { z1.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "cmp x12, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #1\n"
+ "ld1h { z3.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "add x22, x13, x21, LSL #1\n"
+ "mov x21, #0x0\n"
+ "ld1h { z5.h }, p2/Z, [x14]\n"
+ "ld1h { z6.h }, p2/Z, [x14, x17, LSL #1]\n"
+ "sub x20, XZR, x12\n"
+ "ld1h { z7.h }, p2/Z, [x11]\n"
+ "ld1h { z8.h }, p2/Z, [x11, x17, LSL #1]\n"
+ "addvl x10, x10, #6\n"
+ "ld1h { z9.h }, p2/Z, [x14, x15, LSL #1]\n"
+ "ld1h { z13.h }, p2/Z, [x11, x15, LSL #1]\n"
+ "ld1h { z11.h }, p2/Z, [x14, x28, LSL #1]\n"
+ "ld1h { z12.h }, p2/Z, [x14, x26, LSL #1]\n"
+ "ld1h { z10.h }, p2/Z, [x11, x24, LSL #1]\n"
+ "ld1h { z14.h }, p2/Z, [x9]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z27, z29\n fmla z27.h, p3/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z24.h }, p2/Z, [x11, x28, LSL #1]\n"
+ "whilelt p1.h, x12, %x[n_channels]\n"
+ "movprfx z26, z29\n fmla z26.h, p3/M, z0.h, z7.h\n"
+ "movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z18.h }, p3/Z, [x10]\n"
+ "inch x21\n"
+ "fmla z27.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z23.h }, p2/Z, [x11, x26, LSL #1]\n"
+ "inch x12\n"
+ "fmla z26.h, p3/M, z1.h, z8.h\n"
+ "fmla z30.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z22.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z26.h, p3/M, z2.h, z13.h\n"
+ "fmla z30.h, p3/M, z2.h, z24.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z27.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "inch x20\n"
+ "fmla z26.h, p3/M, z3.h, z24.h\n"
+ "fmla z30.h, p3/M, z3.h, z23.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z5.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z4.h, z23.h\n"
+ "fmla z30.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z27.h, p3/M, z18.h, z7.h\n"
+ "fmla z31.h, p3/M, z18.h, z8.h\n"
+ "ld1h { z7.h }, p1/Z, [x11]\n"
+ "fmla z26.h, p3/M, z18.h, z14.h\n"
+ "fmla z30.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.h, p3/M, z22.h, z8.h\n"
+ "fmla z31.h, p3/M, z22.h, z13.h\n"
+ "ld1h { z3.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z22.h, z0.h\n"
+ "fmla z30.h, p3/M, z22.h, z19.h\n"
+ "ld1h { z8.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z13.h\n"
+ "fmla z31.h, p3/M, z20.h, z24.h\n"
+ "ld1h { z2.h }, p2/Z, [x9, x26, LSL #1]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z26.h, p3/M, z20.h, z19.h\n"
+ "fmla z30.h, p3/M, z20.h, z5.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z27.h, p3/M, z17.h, z24.h\n"
+ "fmla z31.h, p3/M, z17.h, z23.h\n"
+ "ld1h { z25.h }, p2/Z, [x27]\n"
+ "ld1h { z29.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z26.h, p3/M, z17.h, z5.h\n"
+ "fmla z30.h, p3/M, z17.h, z2.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z23.h\n"
+ "fmla z31.h, p3/M, z21.h, z10.h\n"
+ "ld1h { z24.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z21.h, z2.h\n"
+ "fmla z30.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z27.h, p3/M, z18.h, z14.h\n"
+ "fmla z31.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z1.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "fmla z26.h, p3/M, z18.h, z25.h\n"
+ "fmla z30.h, p3/M, z18.h, z24.h\n"
+ "ld1h { z23.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z27.h, p3/M, z8.h, z0.h\n"
+ "fmla z31.h, p3/M, z8.h, z19.h\n"
+ "ld1h { z0.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z8.h, z24.h\n"
+ "fmla z30.h, p3/M, z8.h, z22.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z27.h, p3/M, z16.h, z19.h\n"
+ "fmla z31.h, p3/M, z16.h, z5.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z26.h, p3/M, z16.h, z22.h\n"
+ "fmla z30.h, p3/M, z16.h, z0.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z27.h, p3/M, z17.h, z5.h\n"
+ "fmla z31.h, p3/M, z17.h, z2.h\n"
+ "ld1h { z16.h }, p2/Z, [x25]\n"
+ "fmla z26.h, p3/M, z17.h, z0.h\n"
+ "fmla z30.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z2.h\n"
+ "fmla z31.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z4.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z8.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "fmla z26.h, p3/M, z21.h, z19.h\n"
+ "fmla z30.h, p3/M, z21.h, z1.h\n"
+ "ld1h { z13.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z27.h, p3/M, z23.h, z25.h\n"
+ "fmla z31.h, p3/M, z23.h, z24.h\n"
+ "ld1h { z25.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z23.h, z16.h\n"
+ "fmla z30.h, p3/M, z23.h, z4.h\n"
+ "ld1h { z5.h }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z22.h\n"
+ "ld1h { z24.h }, p2/Z, [x25, x28, LSL #1]\n"
+ "fmla z26.h, p3/M, z20.h, z4.h\n"
+ "fmla z30.h, p3/M, z20.h, z25.h\n"
+ "ld1h { z23.h }, p3/Z, [x10]\n"
+ "fmla z27.h, p3/M, z18.h, z22.h\n"
+ "fmla z31.h, p3/M, z18.h, z0.h\n"
+ "ld1h { z22.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "addvl x25, x25, #1\n"
+ "fmla z26.h, p3/M, z18.h, z25.h\n"
+ "fmla z30.h, p3/M, z18.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z27.h, p3/M, z17.h, z0.h\n"
+ "fmla z31.h, p3/M, z17.h, z19.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "fmla z26.h, p3/M, z17.h, z24.h\n"
+ "fmla z30.h, p3/M, z17.h, z8.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z27.h, p3/M, z13.h, z19.h\n"
+ "fmla z31.h, p3/M, z13.h, z1.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "ld1h { z14.h }, p1/Z, [x9]\n"
+ "fmla z26.h, p3/M, z13.h, z8.h\n"
+ "fmla z30.h, p3/M, z13.h, z22.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.h, p3/M, z5.h, z16.h\n"
+ "fmla z31.h, p3/M, z5.h, z4.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z5.h, z18.h\n"
+ "fmla z30.h, p3/M, z5.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.h, p3/M, z23.h, z4.h\n"
+ "fmla z31.h, p3/M, z23.h, z25.h\n"
+ "ld1h { z13.h }, p1/Z, [x11, x15, LSL #1]\n"
+ "fmla z26.h, p3/M, z23.h, z17.h\n"
+ "fmla z30.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.h, p3/M, z21.h, z25.h\n"
+ "fmla z31.h, p3/M, z21.h, z24.h\n"
+ "ld1h { z5.h }, p1/Z, [x14]\n"
+ "fmla z26.h, p3/M, z21.h, z16.h\n"
+ "fmla z30.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z27.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z8.h\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.h, x21, %x[n_channels]\n"
+ "fmla z26.h, p3/M, z20.h, z18.h\n"
+ "fmla z30.h, p3/M, z20.h, z17.h\n"
+ "cmp x12, %x[n_channels]\n"
+ "addvl x23, x23, #1\n"
+ "fmla z27.h, p3/M, z19.h, z8.h\n"
+ "fmla z31.h, p3/M, z19.h, z22.h\n"
+ "fmax z27.h, p3/M, z27.h, z15.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmla z26.h, p3/M, z19.h, z17.h\n"
+ "fmla z30.h, p3/M, z19.h, z16.h\n"
+ "fmax z26.h, p3/M, z26.h, z15.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmin z27.h, p3/M, z27.h, z28.h\n"
+ "fmin z31.h, p3/M, z31.h, z28.h\n"
+ "ld1h { z6.h }, p1/Z, [x14, x17, LSL #1]\n"
+ "ld1h { z8.h }, p1/Z, [x11, x17, LSL #1]\n"
+ "fmin z26.h, p3/M, z26.h, z28.h\n"
+ "fmin z30.h, p3/M, z30.h, z28.h\n"
+ "ld1h { z9.h }, p1/Z, [x14, x15, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x14, x28, LSL #1]\n"
+ "ld1h { z12.h }, p1/Z, [x14, x26, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x11, x24, LSL #1]\n"
+ "st1h { z27.h }, p0, [x13]\n"
+ "st1h { z31.h }, p0, [x13, x16, LSL #1]\n"
+ "addvl x13, x13, #1\n"
+ "ld1h { z3.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "ld1h { z4.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "st1h { z26.h }, p0, [x22]\n"
+ "addvl x10, x10, #-6\n"
+ "st1h { z30.h }, p0, [x22, x16, LSL #1]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z30, z29\n fmla z30.h, p3/M, z0.h, z5.h\n"
+ "movprfx z31, z29\n fmla z31.h, p3/M, z0.h, z6.h\n"
+ "ld1h { z22.h }, p2/Z, [x11, x28, LSL #1]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "movprfx z5, z29\n fmla z5.h, p3/M, z0.h, z7.h\n"
+ "fmla z29.h, p3/M, z0.h, z8.h\n"
+ "ld1h { z20.h }, p3/Z, [x10]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z30.h, p3/M, z1.h, z6.h\n"
+ "fmla z31.h, p3/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p2/Z, [x11, x26, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z5.h, p3/M, z1.h, z8.h\n"
+ "fmla z29.h, p3/M, z1.h, z13.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x8, x8, #0x1\n"
+ "fmla z30.h, p3/M, z2.h, z9.h\n"
+ "fmla z31.h, p3/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p2/Z, [x14, x24, LSL #1]\n"
+ "cmp x8, x20\n"
+ "fmla z5.h, p3/M, z2.h, z13.h\n"
+ "fmla z29.h, p3/M, z2.h, z22.h\n"
+ "ld1h { z18.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "add x21, x12, #0x1\n"
+ "fmla z30.h, p3/M, z3.h, z11.h\n"
+ "fmla z31.h, p3/M, z3.h, z12.h\n"
+ "ld1h { z1.h }, p2/Z, [x9, x17, LSL #1]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z5.h, p3/M, z3.h, z22.h\n"
+ "fmla z29.h, p3/M, z3.h, z6.h\n"
+ "ld1h { z17.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "csel x12, x12, x21, LT\n"
+ "fmla z30.h, p3/M, z4.h, z12.h\n"
+ "fmla z31.h, p3/M, z4.h, z16.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, x15, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x9, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z4.h, z6.h\n"
+ "fmla z29.h, p3/M, z4.h, z10.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #4, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.h, p3/M, z20.h, z7.h\n"
+ "fmla z31.h, p3/M, z20.h, z8.h\n"
+ "csel x8, x8, XZR, LT\n"
+ "cmp x12, x20\n"
+ "fmla z5.h, p3/M, z20.h, z14.h\n"
+ "fmla z29.h, p3/M, z20.h, z1.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z30.h, p3/M, z19.h, z8.h\n"
+ "fmla z31.h, p3/M, z19.h, z13.h\n"
+ "ld1h { z26.h }, p2/Z, [x9, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z19.h, z1.h\n"
+ "fmla z29.h, p3/M, z19.h, z0.h\n"
+ "ld1h { z25.h }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z30.h, p3/M, z18.h, z13.h\n"
+ "fmla z31.h, p3/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p2/Z, [x9, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z18.h, z0.h\n"
+ "fmla z29.h, p3/M, z18.h, z27.h\n"
+ "ld1h { z23.h }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z30.h, p3/M, z17.h, z22.h\n"
+ "fmla z31.h, p3/M, z17.h, z6.h\n"
+ "ld1h { z22.h }, p2/Z, [x27]\n"
+ "fmla z5.h, p3/M, z17.h, z27.h\n"
+ "fmla z29.h, p3/M, z17.h, z24.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z6.h\n"
+ "fmla z31.h, p3/M, z16.h, z10.h\n"
+ "ld1h { z19.h }, p2/Z, [x27, x17, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x27, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z24.h\n"
+ "fmla z29.h, p3/M, z16.h, z26.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z14.h\n"
+ "fmla z31.h, p3/M, z21.h, z1.h\n"
+ "ld1h { z17.h }, p2/Z, [x27, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z21.h, z22.h\n"
+ "fmla z29.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z30.h, p3/M, z25.h, z1.h\n"
+ "fmla z31.h, p3/M, z25.h, z0.h\n"
+ "ld1h { z7.h }, p2/Z, [x27, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z25.h, z19.h\n"
+ "fmla z29.h, p3/M, z25.h, z18.h\n"
+ "ld1h { z10.h }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z30.h, p3/M, z23.h, z0.h\n"
+ "fmla z31.h, p3/M, z23.h, z27.h\n"
+ "ld1h { z11.h }, p2/Z, [x27, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z23.h, z18.h\n"
+ "fmla z29.h, p3/M, z23.h, z7.h\n"
+ "ld1h { z6.h }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z30.h, p3/M, z20.h, z27.h\n"
+ "fmla z31.h, p3/M, z20.h, z24.h\n"
+ "ld1h { z0.h }, p2/Z, [x25]\n"
+ "fmla z5.h, p3/M, z20.h, z7.h\n"
+ "fmla z29.h, p3/M, z20.h, z11.h\n"
+ "ld1h { z9.h }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z24.h\n"
+ "fmla z31.h, p3/M, z16.h, z26.h\n"
+ "ld1h { z3.h }, p2/Z, [x25, x17, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x25, x26, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z11.h\n"
+ "fmla z29.h, p3/M, z16.h, z17.h\n"
+ "ld1h { z16.h }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z30.h, p3/M, z21.h, z22.h\n"
+ "fmla z31.h, p3/M, z21.h, z19.h\n"
+ "ld1h { z26.h }, p2/Z, [x25, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z21.h, z0.h\n"
+ "fmla z29.h, p3/M, z21.h, z3.h\n"
+ "ld1h { z25.h }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z30.h, p3/M, z10.h, z19.h\n"
+ "fmla z31.h, p3/M, z10.h, z18.h\n"
+ "ld1h { z24.h }, p2/Z, [x25, x28, LSL #1]\n"
+ "fmla z5.h, p3/M, z10.h, z3.h\n"
+ "fmla z29.h, p3/M, z10.h, z26.h\n"
+ "ld1h { z23.h }, p3/Z, [x10]\n"
+ "fmla z30.h, p3/M, z6.h, z18.h\n"
+ "fmla z31.h, p3/M, z6.h, z7.h\n"
+ "ld1h { z22.h }, p2/Z, [x25, x24, LSL #1]\n"
+ "fmla z5.h, p3/M, z6.h, z26.h\n"
+ "fmla z29.h, p3/M, z6.h, z24.h\n"
+ "ld1h { z21.h }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z30.h, p3/M, z9.h, z7.h\n"
+ "fmla z31.h, p3/M, z9.h, z11.h\n"
+ "ld1h { z18.h }, p2/Z, [x23]\n"
+ "fmla z5.h, p3/M, z9.h, z24.h\n"
+ "fmla z29.h, p3/M, z9.h, z27.h\n"
+ "ld1h { z20.h }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z30.h, p3/M, z16.h, z11.h\n"
+ "fmla z31.h, p3/M, z16.h, z17.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x17, LSL #1]\n"
+ "fmla z5.h, p3/M, z16.h, z27.h\n"
+ "fmla z29.h, p3/M, z16.h, z22.h\n"
+ "ld1h { z19.h }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z30.h, p3/M, z25.h, z0.h\n"
+ "fmla z31.h, p3/M, z25.h, z3.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x15, LSL #1]\n"
+ "fmla z5.h, p3/M, z25.h, z18.h\n"
+ "fmla z29.h, p3/M, z25.h, z17.h\n"
+ "ld1h { z18.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "fmla z30.h, p3/M, z23.h, z3.h\n"
+ "fmla z31.h, p3/M, z23.h, z26.h\n"
+ "fmla z5.h, p3/M, z23.h, z17.h\n"
+ "fmla z29.h, p3/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x23, x26, LSL #1]\n"
+ "fmla z30.h, p3/M, z21.h, z26.h\n"
+ "fmla z31.h, p3/M, z21.h, z24.h\n"
+ "fmla z5.h, p3/M, z21.h, z16.h\n"
+ "fmla z29.h, p3/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p2/Z, [x23, x24, LSL #1]\n"
+ "fmla z30.h, p3/M, z20.h, z24.h\n"
+ "fmla z31.h, p3/M, z20.h, z27.h\n"
+ "fmla z5.h, p3/M, z20.h, z18.h\n"
+ "fmla z29.h, p3/M, z20.h, z17.h\n"
+ "fmla z30.h, p3/M, z19.h, z27.h\n"
+ "fmla z31.h, p3/M, z19.h, z22.h\n"
+ "fmax z30.h, p3/M, z30.h, z15.h\n"
+ "fmax z31.h, p3/M, z31.h, z15.h\n"
+ "fmla z5.h, p3/M, z19.h, z17.h\n"
+ "fmla z29.h, p3/M, z19.h, z16.h\n"
+ "fmax z5.h, p3/M, z5.h, z15.h\n"
+ "fmax z29.h, p3/M, z29.h, z15.h\n"
+ "fmin z30.h, p3/M, z30.h, z28.h\n"
+ "fmin z31.h, p3/M, z31.h, z28.h\n"
+ "st1h { z30.h }, p0, [x13]\n"
+ "fmin z5.h, p3/M, z5.h, z28.h\n"
+ "fmin z29.h, p3/M, z29.h, z28.h\n"
+ "st1h { z31.h }, p0, [x13, x16, LSL #1]\n"
+ "st1h { z5.h }, p0, [x22]\n"
+ "st1h { z29.h }, p0, [x22, x16, LSL #1]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..1ec0cb2cbf
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(  // fp16 NHWC 5x5/stride-1 depthwise kernel, 2x2 output tile, indirect (pointer-array) addressing; generated SVE asm
+ const __fp16 *const *const input_ptrs,  // 36 pointers = the 6x6 input patch feeding one 2x2 output tile
+ __fp16 *const *const outptrs,  // 4 pointers = the 2x2 output tile
+ const void *params,  // packed weights + bias, consumed sequentially by the asm
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args  // plain struct handed to the asm block by address; field offsets are taken via offsetof below
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;  // activation clamp bounds
+ const __fp16 *inptrs[36];  // input-patch pointers, permuted into the kernel's load order
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];  // the first 14 entries are permuted to match the asm's load schedule; the rest stay in order
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(  // channel-strided loop: full vectors in the body at label 1, predicated remainder at label 2
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"  // x16 = base of the inptrs[] table
+ "ldp x15, x14, [x20, #0x0]\n"  // x15/x14/x12/x11 = the four output-tile pointers
+ "mov x13, #0x0\n"  // x13 = current channel offset (elements)
+ "ldp x12, x11, [x20, #0x10]\n"
+ "whilelt p3.h, XZR, %x[n_channels]\n"  // p3 guards the (possibly partial) current vector of channels
+ "ldp x21, x20, [x16, #0x0]\n"
+ "cnth x10\n"  // x10 = fp16 elements per SVE vector
+ "ptrue p2.b\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ld1h { z5.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1h { z6.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "sub x28, XZR, x10\n"  // x28 = output element offset, starts at -VL and is bumped each iteration
+ "ldp x25, x24, [x16, #0x20]\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "ld1rh { z15.h }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"  // broadcast activation clamp bounds to all lanes
+ "ld1rh { z28.h }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1h { z29.h }, p2/Z, [x9]\n"  // z29 = bias; z0-z4 = first five weights
+ "ld1h { z0.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z1.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z2.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z3.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ld1h { z7.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "addvl x9, x9, #6\n"
+ "ld1h { z8.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ld1h { z11.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z12.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z10.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z14.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "bge 2f\n"  // fewer than one full vector of channels: skip straight to the predicated tail
+ "1:" // Channel loop
+ "movprfx z30, z29\n fmla z30.h, p2/M, z0.h, z5.h\n"  // z30/z27/z31/z26 accumulate the four output-tile pixels, seeded from the bias
+ "movprfx z27, z29\n fmla z27.h, p2/M, z0.h, z6.h\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1h { z5.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z7.h\n"
+ "movprfx z26, z29\n fmla z26.h, p2/M, z0.h, z8.h\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z27.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z22.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z31.h, p2/M, z1.h, z8.h\n"
+ "fmla z26.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z21.h }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "fmla z27.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z20.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z31.h, p2/M, z2.h, z13.h\n"
+ "fmla z26.h, p2/M, z2.h, z5.h\n"
+ "ldr x22, [x16, #0x78]\n"
+ "ld1h { z17.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z27.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z11.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla z31.h, p2/M, z3.h, z5.h\n"
+ "fmla z26.h, p2/M, z3.h, z22.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z27.h, p2/M, z4.h, z20.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z29.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z4.h, z22.h\n"
+ "fmla z26.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x23, [x16, #0x90]\n"
+ "fmla z30.h, p2/M, z21.h, z7.h\n"
+ "fmla z27.h, p2/M, z21.h, z8.h\n"
+ "ldr x26, [x16, #0x98]\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla z31.h, p2/M, z21.h, z14.h\n"
+ "fmla z26.h, p2/M, z21.h, z11.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.h, p2/M, z18.h, z8.h\n"
+ "fmla z27.h, p2/M, z18.h, z13.h\n"
+ "ld1h { z24.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z31.h, p2/M, z18.h, z11.h\n"
+ "fmla z26.h, p2/M, z18.h, z0.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z17.h, z13.h\n"
+ "fmla z27.h, p2/M, z17.h, z5.h\n"
+ "ld1h { z3.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z31.h, p2/M, z17.h, z0.h\n"
+ "fmla z26.h, p2/M, z17.h, z29.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"  // advance params; following weight loads use negative MUL VL offsets
+ "fmla z30.h, p2/M, z16.h, z5.h\n"
+ "fmla z27.h, p2/M, z16.h, z22.h\n"
+ "ld1h { z6.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x27, [x16, #0xc8]\n"
+ "fmla z31.h, p2/M, z16.h, z29.h\n"
+ "fmla z26.h, p2/M, z16.h, z3.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x23, [x16, #0xd0]\n"
+ "fmla z30.h, p2/M, z19.h, z22.h\n"
+ "fmla z27.h, p2/M, z19.h, z10.h\n"
+ "ld1h { z23.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z19.h, z3.h\n"
+ "fmla z26.h, p2/M, z19.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x22, [x16, #0xd8]\n"
+ "fmla z30.h, p2/M, z25.h, z14.h\n"
+ "fmla z27.h, p2/M, z25.h, z11.h\n"
+ "ld1h { z1.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z31.h, p2/M, z25.h, z6.h\n"
+ "fmla z26.h, p2/M, z25.h, z23.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.h, p2/M, z18.h, z11.h\n"
+ "fmla z27.h, p2/M, z18.h, z0.h\n"
+ "ld1h { z7.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z31.h, p2/M, z18.h, z23.h\n"
+ "fmla z26.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z18.h }, p2/Z, [x9, #-5, MUL VL]\n"
+ "whilelt p1.h, x10, %x[n_channels]\n"  // p1 guards the loads issued for the NEXT iteration
+ "fmla z30.h, p2/M, z17.h, z0.h\n"
+ "fmla z27.h, p2/M, z17.h, z29.h\n"
+ "ld1h { z19.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z31.h, p2/M, z17.h, z22.h\n"
+ "fmla z26.h, p2/M, z17.h, z7.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #-4, MUL VL]\n"
+ "inch x28\n"
+ "fmla z30.h, p2/M, z16.h, z29.h\n"
+ "fmla z27.h, p2/M, z16.h, z3.h\n"
+ "ld1h { z0.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x100]\n"
+ "fmla z31.h, p2/M, z16.h, z7.h\n"
+ "fmla z26.h, p2/M, z16.h, z19.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-3, MUL VL]\n"
+ "mov p0.b, p3.b\n"  // p0 = predicate used for this iteration's stores
+ "fmla z30.h, p2/M, z21.h, z3.h\n"
+ "fmla z27.h, p2/M, z21.h, z24.h\n"
+ "ld1h { z11.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z13.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z21.h, z19.h\n"
+ "fmla z26.h, p2/M, z21.h, z1.h\n"
+ "ld1h { z10.h }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla z30.h, p2/M, z20.h, z6.h\n"
+ "fmla z27.h, p2/M, z20.h, z23.h\n"
+ "ld1h { z25.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x23, [x16, #0x110]\n"
+ "fmla z31.h, p2/M, z20.h, z0.h\n"
+ "fmla z26.h, p2/M, z20.h, z11.h\n"
+ "ld1h { z8.h }, p2/Z, [x9, #-1, MUL VL]\n"
+ "ld1h { z29.h }, p2/Z, [x9, #4, MUL VL]\n"  // reload bias for the next iteration
+ "fmla z30.h, p2/M, z18.h, z23.h\n"
+ "fmla z27.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x118]\n"
+ "fmla z31.h, p2/M, z18.h, z11.h\n"
+ "fmla z26.h, p2/M, z18.h, z25.h\n"
+ "ld1h { z23.h }, p2/Z, [x9]\n"
+ "fmla z30.h, p2/M, z17.h, z22.h\n"
+ "fmla z27.h, p2/M, z17.h, z7.h\n"
+ "ld1h { z22.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z17.h, z25.h\n"
+ "fmla z26.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z7.h\n"
+ "fmla z27.h, p2/M, z16.h, z19.h\n"
+ "ld1h { z18.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z16.h, z24.h\n"
+ "fmla z26.h, p2/M, z16.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z10.h, z19.h\n"
+ "fmla z27.h, p2/M, z10.h, z1.h\n"
+ "ld1h { z17.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z10.h, z13.h\n"
+ "fmla z26.h, p2/M, z10.h, z22.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.h, p2/M, z8.h, z0.h\n"
+ "fmla z27.h, p2/M, z8.h, z11.h\n"
+ "ld1h { z16.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z31.h, p2/M, z8.h, z18.h\n"
+ "fmla z26.h, p2/M, z8.h, z17.h\n"
+ "ld1h { z18.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldp x21, x20, [x16, #0x0]\n"  // start refetching next iteration's input pointers
+ "fmla z30.h, p2/M, z23.h, z11.h\n"
+ "fmla z27.h, p2/M, z23.h, z25.h\n"
+ "ld1h { z0.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "fmla z31.h, p2/M, z23.h, z17.h\n"
+ "fmla z26.h, p2/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z1.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "fmla z30.h, p2/M, z21.h, z25.h\n"
+ "fmla z27.h, p2/M, z21.h, z24.h\n"
+ "ld1h { z5.h }, p1/Z, [x21, x10, LSL #1]\n"
+ "fmla z31.h, p2/M, z21.h, z16.h\n"
+ "fmla z26.h, p2/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "fmla z30.h, p2/M, z20.h, z24.h\n"
+ "fmla z27.h, p2/M, z20.h, z13.h\n"
+ "ld1h { z6.h }, p1/Z, [x20, x10, LSL #1]\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "fmla z31.h, p2/M, z20.h, z18.h\n"
+ "fmla z26.h, p2/M, z20.h, z17.h\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "fmla z30.h, p2/M, z19.h, z13.h\n"
+ "fmla z27.h, p2/M, z19.h, z22.h\n"
+ "inch x13\n"  // advance channel offset by one vector length
+ "ld1h { z7.h }, p1/Z, [x27, x10, LSL #1]\n"
+ "fmla z31.h, p2/M, z19.h, z17.h\n"
+ "fmla z26.h, p2/M, z19.h, z16.h\n"
+ "ld1h { z8.h }, p1/Z, [x26, x10, LSL #1]\n"
+ "ld1h { z9.h }, p1/Z, [x25, x10, LSL #1]\n"
+ "ld1h { z13.h }, p1/Z, [x24, x10, LSL #1]\n"
+ "ld1h { z11.h }, p1/Z, [x23, x10, LSL #1]\n"
+ "fmax z30.h, p2/M, z30.h, z15.h\n"  // apply activation_min
+ "fmax z27.h, p2/M, z27.h, z15.h\n"
+ "ld1h { z12.h }, p1/Z, [x22, x10, LSL #1]\n"
+ "ld1h { z10.h }, p1/Z, [x21, x10, LSL #1]\n"
+ "fmax z31.h, p2/M, z31.h, z15.h\n"
+ "fmax z26.h, p2/M, z26.h, z15.h\n"
+ "ld1h { z14.h }, p1/Z, [x20, x10, LSL #1]\n"
+ "inch x10\n"
+ "ld1h { z2.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "whilelt p3.h, x13, %x[n_channels]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1h { z3.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1h { z4.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "fmin z30.h, p2/M, z30.h, z28.h\n"  // apply activation_max
+ "fmin z27.h, p2/M, z27.h, z28.h\n"
+ "st1h { z30.h }, p0, [x15, x28, LSL #1]\n"  // store the four 2x2 output-tile vectors
+ "fmin z31.h, p2/M, z31.h, z28.h\n"
+ "fmin z26.h, p2/M, z26.h, z28.h\n"
+ "st1h { z27.h }, p0, [x14, x28, LSL #1]\n"
+ "st1h { z31.h }, p0, [x12, x28, LSL #1]\n"
+ "addvl x9, x9, #-6\n"  // rewind params so the next iteration re-reads bias/first weights at the same offsets
+ "st1h { z26.h }, p0, [x11, x28, LSL #1]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z30, z29\n fmla z30.h, p2/M, z0.h, z5.h\n"  // tail: same 5x5 accumulation, but no next-iteration prefetching
+ "movprfx z31, z29\n fmla z31.h, p2/M, z0.h, z6.h\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1h { z22.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "movprfx z5, z29\n fmla z5.h, p2/M, z0.h, z7.h\n"
+ "fmla z29.h, p2/M, z0.h, z8.h\n"  // z29 (bias) is reused as the fourth accumulator in the tail
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
+ "fmla z30.h, p2/M, z1.h, z6.h\n"
+ "fmla z31.h, p2/M, z1.h, z9.h\n"
+ "ld1h { z6.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z5.h, p2/M, z1.h, z8.h\n"
+ "fmla z29.h, p2/M, z1.h, z13.h\n"
+ "ld1h { z20.h }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
+ "fmla z30.h, p2/M, z2.h, z9.h\n"
+ "fmla z31.h, p2/M, z2.h, z11.h\n"
+ "ld1h { z16.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ld1h { z19.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z5.h, p2/M, z2.h, z13.h\n"
+ "fmla z29.h, p2/M, z2.h, z22.h\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1h { z18.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z3.h, z11.h\n"
+ "fmla z31.h, p2/M, z3.h, z12.h\n"
+ "ld1h { z1.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "fmla z5.h, p2/M, z3.h, z22.h\n"
+ "fmla z29.h, p2/M, z3.h, z6.h\n"
+ "ld1h { z17.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
+ "fmla z30.h, p2/M, z4.h, z12.h\n"
+ "fmla z31.h, p2/M, z4.h, z16.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ld1h { z27.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z4.h, z6.h\n"
+ "fmla z29.h, p2/M, z4.h, z10.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.h, p2/M, z20.h, z7.h\n"
+ "fmla z31.h, p2/M, z20.h, z8.h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla z5.h, p2/M, z20.h, z14.h\n"
+ "fmla z29.h, p2/M, z20.h, z1.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.h, p2/M, z19.h, z8.h\n"
+ "fmla z31.h, p2/M, z19.h, z13.h\n"
+ "ld1h { z26.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z5.h, p2/M, z19.h, z1.h\n"
+ "fmla z29.h, p2/M, z19.h, z0.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.h, p2/M, z18.h, z13.h\n"
+ "fmla z31.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z24.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ldr x23, [x16, #0xc0]\n"
+ "fmla z5.h, p2/M, z18.h, z0.h\n"
+ "fmla z29.h, p2/M, z18.h, z27.h\n"
+ "ld1h { z23.h }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.h, p2/M, z17.h, z22.h\n"
+ "fmla z31.h, p2/M, z17.h, z6.h\n"
+ "ld1h { z22.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x22, [x16, #0xc8]\n"
+ "fmla z5.h, p2/M, z17.h, z27.h\n"
+ "fmla z29.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla z30.h, p2/M, z16.h, z6.h\n"
+ "fmla z31.h, p2/M, z16.h, z10.h\n"
+ "ld1h { z19.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ld1h { z18.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z24.h\n"
+ "fmla z29.h, p2/M, z16.h, z26.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla z30.h, p2/M, z21.h, z14.h\n"
+ "fmla z31.h, p2/M, z21.h, z1.h\n"
+ "ld1h { z17.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z5.h, p2/M, z21.h, z22.h\n"
+ "fmla z29.h, p2/M, z21.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.h, p2/M, z25.h, z1.h\n"
+ "fmla z31.h, p2/M, z25.h, z0.h\n"
+ "ld1h { z9.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z5.h, p2/M, z25.h, z19.h\n"
+ "fmla z29.h, p2/M, z25.h, z18.h\n"
+ "ld1h { z4.h }, p2/Z, [x9, #-5, MUL VL]\n"
+ "inch x28\n"
+ "fmla z30.h, p2/M, z23.h, z0.h\n"
+ "fmla z31.h, p2/M, z23.h, z27.h\n"
+ "ld1h { z8.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z5.h, p2/M, z23.h, z18.h\n"
+ "fmla z29.h, p2/M, z23.h, z9.h\n"
+ "ld1h { z6.h }, p2/Z, [x9, #-4, MUL VL]\n"
+ "mov p0.b, p3.b\n"  // store predicate = remaining-channels predicate
+ "fmla z30.h, p2/M, z20.h, z27.h\n"
+ "fmla z31.h, p2/M, z20.h, z24.h\n"
+ "ld1h { z10.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla z5.h, p2/M, z20.h, z9.h\n"
+ "fmla z29.h, p2/M, z20.h, z8.h\n"
+ "ld1h { z11.h }, p2/Z, [x9, #-3, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z24.h\n"
+ "fmla z31.h, p2/M, z16.h, z26.h\n"
+ "ld1h { z0.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "ld1h { z27.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z8.h\n"
+ "fmla z29.h, p2/M, z16.h, z17.h\n"
+ "ld1h { z16.h }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla z30.h, p2/M, z21.h, z22.h\n"
+ "fmla z31.h, p2/M, z21.h, z19.h\n"
+ "ld1h { z26.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla z5.h, p2/M, z21.h, z10.h\n"
+ "fmla z29.h, p2/M, z21.h, z0.h\n"
+ "ld1h { z25.h }, p2/Z, [x9, #-1, MUL VL]\n"
+ "fmla z30.h, p2/M, z4.h, z19.h\n"
+ "fmla z31.h, p2/M, z4.h, z18.h\n"
+ "ld1h { z24.h }, p3/Z, [x27, x13, LSL #1]\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla z5.h, p2/M, z4.h, z0.h\n"
+ "fmla z29.h, p2/M, z4.h, z26.h\n"
+ "ld1h { z23.h }, p2/Z, [x9]\n"
+ "fmla z30.h, p2/M, z6.h, z18.h\n"
+ "fmla z31.h, p2/M, z6.h, z9.h\n"
+ "ld1h { z22.h }, p3/Z, [x25, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z6.h, z26.h\n"
+ "fmla z29.h, p2/M, z6.h, z24.h\n"
+ "ld1h { z21.h }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.h, p2/M, z11.h, z9.h\n"
+ "fmla z31.h, p2/M, z11.h, z8.h\n"
+ "ld1h { z18.h }, p3/Z, [x24, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z11.h, z24.h\n"
+ "fmla z29.h, p2/M, z11.h, z27.h\n"
+ "ld1h { z20.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.h, p2/M, z16.h, z8.h\n"
+ "fmla z31.h, p2/M, z16.h, z17.h\n"
+ "ld1h { z17.h }, p3/Z, [x26, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z16.h, z27.h\n"
+ "fmla z29.h, p2/M, z16.h, z22.h\n"
+ "ld1h { z19.h }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.h, p2/M, z25.h, z10.h\n"
+ "fmla z31.h, p2/M, z25.h, z0.h\n"
+ "ld1h { z16.h }, p3/Z, [x23, x13, LSL #1]\n"
+ "fmla z5.h, p2/M, z25.h, z18.h\n"
+ "fmla z29.h, p2/M, z25.h, z17.h\n"
+ "ld1h { z18.h }, p3/Z, [x22, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z23.h, z0.h\n"
+ "fmla z31.h, p2/M, z23.h, z26.h\n"
+ "fmla z5.h, p2/M, z23.h, z17.h\n"
+ "fmla z29.h, p2/M, z23.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x21, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z21.h, z26.h\n"
+ "fmla z31.h, p2/M, z21.h, z24.h\n"
+ "fmla z5.h, p2/M, z21.h, z16.h\n"
+ "fmla z29.h, p2/M, z21.h, z18.h\n"
+ "ld1h { z16.h }, p3/Z, [x20, x13, LSL #1]\n"
+ "fmla z30.h, p2/M, z20.h, z24.h\n"
+ "fmla z31.h, p2/M, z20.h, z27.h\n"
+ "fmla z5.h, p2/M, z20.h, z18.h\n"
+ "fmla z29.h, p2/M, z20.h, z17.h\n"
+ "fmla z30.h, p2/M, z19.h, z27.h\n"
+ "fmla z31.h, p2/M, z19.h, z22.h\n"
+ "fmax z30.h, p2/M, z30.h, z15.h\n"  // clamp to activation_min
+ "fmax z31.h, p2/M, z31.h, z15.h\n"
+ "fmla z5.h, p2/M, z19.h, z17.h\n"
+ "fmla z29.h, p2/M, z19.h, z16.h\n"
+ "fmax z5.h, p2/M, z5.h, z15.h\n"
+ "fmax z29.h, p2/M, z29.h, z15.h\n"
+ "fmin z30.h, p2/M, z30.h, z28.h\n"  // clamp to activation_max
+ "fmin z31.h, p2/M, z31.h, z28.h\n"
+ "st1h { z30.h }, p0, [x15, x28, LSL #1]\n"  // predicated stores of the final (partial) vector
+ "fmin z5.h, p2/M, z5.h, z28.h\n"
+ "fmin z29.h, p2/M, z29.h, z28.h\n"
+ "st1h { z31.h }, p0, [x14, x28, LSL #1]\n"
+ "st1h { z5.h }, p0, [x12, x28, LSL #1]\n"
+ "st1h { z29.h }, p0, [x11, x28, LSL #1]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..16b96fdb8e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>  // strategy descriptor binding the fp32 3x3/s1/2x2 SVE kernels to the depthfirst driver
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;  // pointer-array (edge/padded tile) variant
+  Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;  // strided (interior tile) variant
+
+  public:
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;  // kernel requires SVE; used for implementation selection
+
+  constexpr static unsigned int kernel_rows = 3;  // 3x3 depthwise filter
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 2;  // each kernel invocation produces a 2x2 output tile
+  constexpr static unsigned int output_cols = 2;
+
+  sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)  // CPUInfo accepted for interface uniformity but unused here
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..1bdef85274
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct-addressed variant of the fp32 NHWC 3x3/stride-1 SVE depthwise
+// kernel, producing a 2x2 output tile per tile-loop iteration.  The outer
+// assembly loop walks the n_tile_rows x n_tile_cols grid of output tiles
+// using the row/column strides; the inner loop handles one SVE vector of
+// channels at a time (predicated tail via whilelt) and clamps results to
+// [activation_min, activation_max] before storing.
+// NOTE(review): this file appears to be machine-generated — prefer fixing
+// the generator over hand-editing the assembly.
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+  const unsigned int n_tile_rows,
+  const unsigned int n_tile_cols,
+  const float *inptr,
+  int64_t ld_input_row,
+  int64_t ld_input_col,
+  float *outptr,
+  int64_t ld_output_row,
+  int64_t ld_output_col,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  // Argument block read by the inline assembly through offsetof(); the field
+  // layout must stay in sync with the offsetof_args_* operands below.
+  // tile_i/tile_j are kept in memory so the assembly can spill and reload
+  // the tile counters across iterations of the tile loop.
+  struct Args
+  {
+    const uint64_t n_tile_rows, n_tile_cols;
+    const float *inptr;
+    const uint64_t ld_input_row;
+    const uint64_t ld_input_col;
+    float *outptr;
+    const uint64_t ld_output_row;
+    const uint64_t ld_output_col;
+    const void *params;
+    const float min, max;
+
+    uint64_t tile_i = 0, tile_j = 0;
+
+    Args(
+      const unsigned int n_tile_rows,
+      const unsigned int n_tile_cols,
+      const float *inptr,
+      int64_t ld_input_row,
+      int64_t ld_input_col,
+      float *outptr,
+      int64_t ld_output_row,
+      int64_t ld_output_col,
+      const void *params,
+      const float activation_min,
+      const float activation_max
+    ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+        ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+        ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+        params(params), min(activation_min), max(activation_max)
+    {
+    }
+  };
+
+  Args params_struct(
+    n_tile_rows, n_tile_cols,
+    inptr, ld_input_row, ld_input_col,
+    outptr, ld_output_row, ld_output_col,
+    params, activation_min, activation_max
+  );
+
+  // Register roles (from the code below): x9/x26/x25/x23 walk four input
+  // rows; x28/x22 walk the two output rows; z0-z8 hold the 3x3 weights,
+  // z27 the bias, z26/z25 the min/max clamps; p2/p1 are the channel
+  // predicates and p3 is all-true.
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "mov x10, #0x0\n"
+    "mov x14, #0x0\n"
+    "1:" // Tile loop
+    "str x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "mov x25, #0x2\n"
+    "mov x24, #0x2\n"
+    "str x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+    "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+    "mul x21, x10, x23\n" // offset = tile_i * ld_input_row
+    "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+    "ldr x12, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+    "mul x20, x10, x22\n" // offset = tile_i * ld_output_row
+    "cntw x11\n"
+    "madd x21, x14, x13, x21\n" // offset += tile_j * ld_input_col
+    "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+    "ldr x9, [%x[params_struct], %[offsetof_args_inptr]]\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "madd x20, x14, x12, x20\n" // offset += tile_j * ld_output_col
+    "ldr x28, [%x[params_struct], %[offsetof_args_outptr]]\n"
+    "ld1w { z27.s }, p3/Z, [x10]\n"
+    "add x27, x13, x13\n"
+    "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+    "add x9, x9, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+    "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
+    "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+    "mul x20, x20, x24\n" // offset *= output_tile_size
+    "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+    "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
+    "add x26, x9, x23, LSL #2\n"
+    "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+    "ld1w { z5.s }, p3/Z, [x10, #6, MUL VL]\n"
+    "add x25, x26, x23, LSL #2\n"
+    "add x24, x27, x13\n"
+    "ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
+    "addvl x10, x10, #16\n"
+    "add x28, x28, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+    "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "cmp x11, %x[n_channels]\n"
+    "add x23, x25, x23, LSL #2\n"
+    "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
+    "add x22, x28, x22, LSL #2\n"
+    "mov x21, #0x0\n"
+    "ld1w { z8.s }, p3/Z, [x10, #-7, MUL VL]\n"
+    "ld1w { z9.s }, p2/Z, [x26, x13, LSL #2]\n"
+    "sub x20, XZR, x11\n"
+    "ld1w { z10.s }, p2/Z, [x9]\n"
+    "ld1w { z11.s }, p2/Z, [x9, x24, LSL #2]\n"
+    "addvl x10, x10, #-6\n"
+    "ld1w { z12.s }, p2/Z, [x26, x27, LSL #2]\n"
+    "ld1w { z13.s }, p2/Z, [x25, x13, LSL #2]\n"
+    "bge 3f\n"
+    "2:" // Tile loop: Channel loop
+    "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
+    "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
+    "whilelt p1.s, x11, %x[n_channels]\n"
+    "incw x21\n"
+    "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
+    "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z18.s }, p2/Z, [x23]\n"
+    "incw x11\n"
+    "fmla z24.s, p3/M, z0.s, z10.s\n"
+    "fmla z23.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z17.s }, p2/Z, [x23, x24, LSL #2]\n"
+    "ld1w { z20.s }, p2/Z, [x25, x27, LSL #2]\n"
+    "fmla z22.s, p3/M, z2.s, z12.s\n"
+    "fmla z21.s, p3/M, z1.s, z12.s\n"
+    "mov p0.b, p2.b\n"
+    "ld1w { z27.s }, p3/Z, [x10]\n"
+    "fmla z24.s, p3/M, z5.s, z12.s\n"
+    "fmla z23.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z16.s }, p2/Z, [x9, x13, LSL #2]\n"
+    "incw x20\n"
+    "fmla z22.s, p3/M, z6.s, z18.s\n"
+    "fmla z21.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z18.s }, p2/Z, [x9, x27, LSL #2]\n"
+    "addvl x9, x9, #1\n"
+    "fmla z24.s, p3/M, z7.s, z13.s\n"
+    "fmla z23.s, p3/M, z6.s, z13.s\n"
+    "fmla z22.s, p3/M, z4.s, z13.s\n"
+    "fmla z21.s, p3/M, z8.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x26]\n"
+    "fmla z24.s, p3/M, z1.s, z16.s\n"
+    "fmla z23.s, p3/M, z0.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x26, x24, LSL #2]\n"
+    "addvl x26, x26, #1\n"
+    "fmla z22.s, p3/M, z5.s, z20.s\n"
+    "fmla z21.s, p3/M, z4.s, z20.s\n"
+    "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+    "fmla z24.s, p3/M, z2.s, z18.s\n"
+    "fmla z23.s, p3/M, z1.s, z18.s\n"
+    "ld1w { z19.s }, p2/Z, [x25]\n"
+    "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+    "fmla z22.s, p3/M, z0.s, z17.s\n"
+    "fmla z21.s, p3/M, z2.s, z16.s\n"
+    "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
+    "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+    "fmla z24.s, p3/M, z8.s, z20.s\n"
+    "fmla z23.s, p3/M, z7.s, z20.s\n"
+    "ld1w { z18.s }, p2/Z, [x25, x24, LSL #2]\n"
+    "addvl x25, x25, #1\n"
+    "fmla z22.s, p3/M, z3.s, z19.s\n"
+    "fmla z21.s, p3/M, z5.s, z18.s\n"
+    "ld1w { z13.s }, p1/Z, [x25, x13, LSL #2]\n"
+    "fmla z24.s, p3/M, z3.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z23.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x23, x27, LSL #2]\n"
+    "fmla z22.s, p3/M, z7.s, z17.s\n"
+    "fmla z21.s, p3/M, z6.s, z17.s\n"
+    "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
+    "ld1w { z5.s }, p3/Z, [x10, #6, MUL VL]\n"
+    "fmla z24.s, p3/M, z6.s, z19.s\n"
+    "fmla z23.s, p3/M, z8.s, z18.s\n"
+    "fmax z24.s, p3/M, z24.s, z26.s\n"
+    "fmax z23.s, p3/M, z23.s, z26.s\n"
+    "fmla z22.s, p3/M, z8.s, z16.s\n"
+    "fmla z21.s, p3/M, z7.s, z16.s\n"
+    "fmax z22.s, p3/M, z22.s, z26.s\n"
+    "fmax z21.s, p3/M, z21.s, z26.s\n"
+    "ld1w { z6.s }, p3/Z, [x10, #7, MUL VL]\n"
+    "addvl x10, x10, #16\n"
+    "whilelt p2.s, x21, %x[n_channels]\n"
+    "ld1w { z9.s }, p1/Z, [x26, x13, LSL #2]\n"
+    "cmp x11, %x[n_channels]\n"
+    "fmin z24.s, p3/M, z24.s, z25.s\n"
+    "ld1w { z10.s }, p1/Z, [x9]\n"
+    "ld1w { z11.s }, p1/Z, [x9, x24, LSL #2]\n"
+    "fmin z23.s, p3/M, z23.s, z25.s\n"
+    "fmin z22.s, p3/M, z22.s, z25.s\n"
+    "ld1w { z12.s }, p1/Z, [x26, x27, LSL #2]\n"
+    "st1w { z24.s }, p0, [x28]\n"
+    "fmin z21.s, p3/M, z21.s, z25.s\n"
+    "addvl x23, x23, #1\n"
+    "st1w { z23.s }, p0, [x28, x12, LSL #2]\n"
+    "ld1w { z7.s }, p3/Z, [x10, #-8, MUL VL]\n"
+    "st1w { z22.s }, p0, [x22]\n"
+    "addvl x28, x28, #1\n"
+    "ld1w { z8.s }, p3/Z, [x10, #-7, MUL VL]\n"
+    "addvl x10, x10, #-6\n"
+    "st1w { z21.s }, p0, [x22, x12, LSL #2]\n"
+    "addvl x22, x22, #1\n"
+    "blt 2b\n"
+    "3:" // Tile loop: Channel tail
+    "movprfx z24, z27\n fmla z24.s, p3/M, z4.s, z9.s\n"
+    "movprfx z23, z27\n fmla z23.s, p3/M, z3.s, z9.s\n"
+    "ldr x14, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+    "ldr x10, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+    "movprfx z22, z27\n fmla z22.s, p3/M, z1.s, z9.s\n"
+    "movprfx z21, z27\n fmla z21.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z18.s }, p2/Z, [x23]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+    "fmla z24.s, p3/M, z0.s, z10.s\n"
+    "fmla z23.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z17.s }, p2/Z, [x23, x24, LSL #2]\n"
+    "ld1w { z20.s }, p2/Z, [x25, x27, LSL #2]\n"
+    "fmla z22.s, p3/M, z2.s, z12.s\n"
+    "fmla z21.s, p3/M, z1.s, z12.s\n"
+    "add x14, x14, #0x1\n"
+    "cmp x14, x20\n"
+    "fmla z24.s, p3/M, z5.s, z12.s\n"
+    "fmla z23.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z16.s }, p2/Z, [x9, x13, LSL #2]\n"
+    "add x21, x10, #0x1\n"
+    "fmla z22.s, p3/M, z6.s, z18.s\n"
+    "fmla z21.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z18.s }, p2/Z, [x9, x27, LSL #2]\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+    "fmla z24.s, p3/M, z7.s, z13.s\n"
+    "fmla z23.s, p3/M, z6.s, z13.s\n"
+    "csel x10, x10, x21, LT\n"
+    "mov p0.b, p2.b\n"
+    "fmla z22.s, p3/M, z4.s, z13.s\n"
+    "fmla z21.s, p3/M, z8.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x26]\n"
+    "csel x14, x14, XZR, LT\n"
+    "fmla z24.s, p3/M, z1.s, z16.s\n"
+    "fmla z23.s, p3/M, z0.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x26, x24, LSL #2]\n"
+    "cmp x10, x20\n"
+    "fmla z22.s, p3/M, z5.s, z20.s\n"
+    "fmla z21.s, p3/M, z4.s, z20.s\n"
+    "fmla z24.s, p3/M, z2.s, z18.s\n"
+    "fmla z23.s, p3/M, z1.s, z18.s\n"
+    "ld1w { z19.s }, p2/Z, [x25]\n"
+    "fmla z22.s, p3/M, z0.s, z17.s\n"
+    "fmla z21.s, p3/M, z2.s, z16.s\n"
+    "fmla z24.s, p3/M, z8.s, z20.s\n"
+    "fmla z23.s, p3/M, z7.s, z20.s\n"
+    "ld1w { z18.s }, p2/Z, [x25, x24, LSL #2]\n"
+    "fmla z22.s, p3/M, z3.s, z19.s\n"
+    "fmla z21.s, p3/M, z5.s, z18.s\n"
+    "fmla z24.s, p3/M, z3.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x23, x13, LSL #2]\n"
+    "fmla z23.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x23, x27, LSL #2]\n"
+    "fmla z22.s, p3/M, z7.s, z17.s\n"
+    "fmla z21.s, p3/M, z6.s, z17.s\n"
+    "fmla z24.s, p3/M, z6.s, z19.s\n"
+    "fmla z23.s, p3/M, z8.s, z18.s\n"
+    "fmax z24.s, p3/M, z24.s, z26.s\n"
+    "fmax z23.s, p3/M, z23.s, z26.s\n"
+    "fmla z22.s, p3/M, z8.s, z16.s\n"
+    "fmla z21.s, p3/M, z7.s, z16.s\n"
+    "fmax z22.s, p3/M, z22.s, z26.s\n"
+    "fmax z21.s, p3/M, z21.s, z26.s\n"
+    "fmin z24.s, p3/M, z24.s, z25.s\n"
+    "fmin z23.s, p3/M, z23.s, z25.s\n"
+    "st1w { z24.s }, p0, [x28]\n"
+    "fmin z22.s, p3/M, z22.s, z25.s\n"
+    "fmin z21.s, p3/M, z21.s, z25.s\n"
+    "st1w { z23.s }, p0, [x28, x12, LSL #2]\n"
+    "st1w { z22.s }, p0, [x22]\n"
+    "st1w { z21.s }, p0, [x22, x12, LSL #2]\n"
+    "blt 1b\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..873b4736ff
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Indirect (pointer-array) variant of the fp32 NHWC 3x3/stride-1 SVE
+// depthwise kernel with a 2x2 output tile.  Instead of strides it receives
+// 16 per-point input pointers (the 4x4 receptive field of the tile) and
+// 4 output pointers; the assembly loops over channels a vector at a time
+// with a predicated tail, clamping to [activation_min, activation_max].
+// NOTE(review): this file appears to be machine-generated — prefer fixing
+// the generator over hand-editing the assembly.
+void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+  const float *const *const input_ptrs,
+  float *const *const outptrs,
+  const void *params,
+  unsigned int n_channels,
+  const float activation_min,
+  const float activation_max
+)
+{
+  // Argument block read by the inline assembly through offsetof(); layout
+  // must stay in sync with the offsetof_* operands below.
+  struct Args
+  {
+    float *const *outptrs;
+    const void *params;
+    const float min, max;
+    const float *inptrs[16];
+
+    Args(
+      const float *const *const input_ptrs,
+      float *const *const outptrs,
+      const void *const params,
+      const float min,
+      const float max
+    ) : outptrs(outptrs), params(params), min(min), max(max)
+    {
+      // Permute the caller's 16 input-point pointers into the order in
+      // which the assembly consumes them (centre points first).
+      inptrs[0] = input_ptrs[5];
+      inptrs[1] = input_ptrs[0];
+      inptrs[2] = input_ptrs[3];
+      inptrs[3] = input_ptrs[6];
+      inptrs[4] = input_ptrs[9];
+      inptrs[5] = input_ptrs[12];
+      inptrs[6] = input_ptrs[15];
+      inptrs[7] = input_ptrs[1];
+      inptrs[8] = input_ptrs[2];
+      inptrs[9] = input_ptrs[10];
+      inptrs[10] = input_ptrs[4];
+      inptrs[11] = input_ptrs[7];
+      inptrs[12] = input_ptrs[8];
+      inptrs[13] = input_ptrs[11];
+      inptrs[14] = input_ptrs[13];
+      inptrs[15] = input_ptrs[14];
+
+    }
+  };
+
+  Args params_struct(input_ptrs, outptrs, params,
+                     activation_min, activation_max);
+
+  // Register roles (from the code below): x13/x12/x11/x10 hold the four
+  // output pointers; x15 points at the permuted inptrs table; z0-z8 hold
+  // the 3x3 weights, z20 the bias, z26/z25 the min/max clamps; p2/p1 are
+  // channel predicates and p3 is all-true.
+  __asm__ __volatile__(
+    "ptrue p3.b\n"
+    "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+    "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+    "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+    "cntw x14\n"
+    "ldp x13, x12, [x20, #0x0]\n"
+    "ldp x11, x10, [x20, #0x10]\n"
+    "mov x9, #0x0\n"
+    "whilelt p2.s, XZR, %x[n_channels]\n"
+    "ld1w { z20.s }, p3/Z, [x16]\n"
+    "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+    "cmp x14, %x[n_channels]\n"
+    "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+    "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+    "sub x28, XZR, x14\n"
+    "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+    "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+    "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+    "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+    "addvl x16, x16, #16\n"
+    "ldp x24, x23, [x15, #0x0]\n"
+    "ldp x22, x21, [x15, #0x10]\n"
+    "ldr x20, [x15, #0x20]\n"
+    "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+    "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+    "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+    "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+    "ld1w { z9.s }, p2/Z, [x24, x9, LSL #2]\n"
+    "addvl x16, x16, #-6\n"
+    "ld1w { z10.s }, p2/Z, [x23, x9, LSL #2]\n"
+    "ld1w { z11.s }, p2/Z, [x22, x9, LSL #2]\n"
+    "ld1w { z12.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "ld1w { z13.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "bge 2f\n"
+    "1:" // Channel loop
+    "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
+    "movprfx z23, z20\n fmla z23.s, p3/M, z3.s, z9.s\n"
+    "ldr x21, [x15, #0x28]\n"
+    "ldr x20, [x15, #0x30]\n"
+    "movprfx z22, z20\n fmla z22.s, p3/M, z1.s, z9.s\n"
+    "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "ldr x22, [x15, #0x38]\n"
+    "fmla z24.s, p3/M, z0.s, z10.s\n"
+    "fmla z23.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "ldr x21, [x15, #0x48]\n"
+    "fmla z22.s, p3/M, z2.s, z12.s\n"
+    "fmla z21.s, p3/M, z1.s, z12.s\n"
+    "ldr x20, [x15, #0x40]\n"
+    "ld1w { z20.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "fmla z24.s, p3/M, z5.s, z12.s\n"
+    "fmla z23.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+    "ldr x22, [x15, #0x50]\n"
+    "fmla z22.s, p3/M, z6.s, z18.s\n"
+    "fmla z21.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "ldr x21, [x15, #0x58]\n"
+    "fmla z24.s, p3/M, z7.s, z13.s\n"
+    "fmla z23.s, p3/M, z6.s, z13.s\n"
+    "ldr x20, [x15, #0x60]\n"
+    "ldr x27, [x15, #0x68]\n"
+    "fmla z22.s, p3/M, z4.s, z13.s\n"
+    "fmla z21.s, p3/M, z8.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x22, x9, LSL #2]\n"
+    "ldr x26, [x15, #0x70]\n"
+    "fmla z24.s, p3/M, z1.s, z16.s\n"
+    "fmla z23.s, p3/M, z0.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "ldr x25, [x15, #0x78]\n"
+    "fmla z22.s, p3/M, z5.s, z20.s\n"
+    "fmla z21.s, p3/M, z4.s, z20.s\n"
+    "whilelt p1.s, x14, %x[n_channels]\n"
+    "ldp x24, x23, [x15, #0x0]\n"
+    "fmla z24.s, p3/M, z2.s, z18.s\n"
+    "fmla z23.s, p3/M, z1.s, z18.s\n"
+    "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "ldp x22, x21, [x15, #0x10]\n"
+    "fmla z22.s, p3/M, z0.s, z17.s\n"
+    "fmla z21.s, p3/M, z2.s, z16.s\n"
+    "ldr x20, [x15, #0x20]\n"
+    "ld1w { z13.s }, p1/Z, [x20, x14, LSL #2]\n"
+    "fmla z24.s, p3/M, z8.s, z20.s\n"
+    "fmla z23.s, p3/M, z7.s, z20.s\n"
+    "ld1w { z18.s }, p2/Z, [x27, x9, LSL #2]\n"
+    "incw x28\n"
+    "fmla z22.s, p3/M, z3.s, z19.s\n"
+    "fmla z21.s, p3/M, z5.s, z18.s\n"
+    "mov p0.b, p2.b\n"
+    "ld1w { z20.s }, p3/Z, [x16]\n"
+    "fmla z24.s, p3/M, z3.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x26, x9, LSL #2]\n"
+    "fmla z23.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x25, x9, LSL #2]\n"
+    "fmla z22.s, p3/M, z7.s, z17.s\n"
+    "fmla z21.s, p3/M, z6.s, z17.s\n"
+    "incw x9\n"
+    "ld1w { z11.s }, p1/Z, [x22, x14, LSL #2]\n"
+    "fmla z24.s, p3/M, z6.s, z19.s\n"
+    "fmla z23.s, p3/M, z8.s, z18.s\n"
+    "ld1w { z9.s }, p1/Z, [x24, x14, LSL #2]\n"
+    "ld1w { z10.s }, p1/Z, [x23, x14, LSL #2]\n"
+    "fmla z22.s, p3/M, z8.s, z16.s\n"
+    "fmla z21.s, p3/M, z7.s, z16.s\n"
+    "ld1w { z12.s }, p1/Z, [x21, x14, LSL #2]\n"
+    "incw x14\n"
+    "fmax z24.s, p3/M, z24.s, z26.s\n"
+    "fmax z23.s, p3/M, z23.s, z26.s\n"
+    "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+    "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+    "fmax z22.s, p3/M, z22.s, z26.s\n"
+    "fmax z21.s, p3/M, z21.s, z26.s\n"
+    "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+    "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+    "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+    "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+    "whilelt p2.s, x9, %x[n_channels]\n"
+    "cmp x14, %x[n_channels]\n"
+    "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+    "addvl x16, x16, #16\n"
+    "fmin z24.s, p3/M, z24.s, z25.s\n"
+    "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+    "fmin z23.s, p3/M, z23.s, z25.s\n"
+    "fmin z22.s, p3/M, z22.s, z25.s\n"
+    "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+    "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+    "fmin z21.s, p3/M, z21.s, z25.s\n"
+    "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+    "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+    "addvl x16, x16, #-6\n"
+    "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+    "blt 1b\n"
+    "2:" // Channel tail
+    "movprfx z24, z20\n fmla z24.s, p3/M, z4.s, z9.s\n"
+    "movprfx z23, z20\n fmla z23.s, p3/M, z3.s, z9.s\n"
+    "ldr x21, [x15, #0x28]\n"
+    "ldr x20, [x15, #0x30]\n"
+    "movprfx z22, z20\n fmla z22.s, p3/M, z1.s, z9.s\n"
+    "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+    "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "ldr x22, [x15, #0x38]\n"
+    "fmla z24.s, p3/M, z0.s, z10.s\n"
+    "fmla z23.s, p3/M, z2.s, z11.s\n"
+    "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "ldr x21, [x15, #0x48]\n"
+    "fmla z22.s, p3/M, z2.s, z12.s\n"
+    "fmla z21.s, p3/M, z1.s, z12.s\n"
+    "ldr x20, [x15, #0x40]\n"
+    "ld1w { z20.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "fmla z24.s, p3/M, z5.s, z12.s\n"
+    "fmla z23.s, p3/M, z4.s, z12.s\n"
+    "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+    "ldr x21, [x15, #0x50]\n"
+    "fmla z22.s, p3/M, z6.s, z18.s\n"
+    "fmla z21.s, p3/M, z3.s, z13.s\n"
+    "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "ldr x20, [x15, #0x58]\n"
+    "fmla z24.s, p3/M, z7.s, z13.s\n"
+    "fmla z23.s, p3/M, z6.s, z13.s\n"
+    "ldr x23, [x15, #0x60]\n"
+    "ldr x22, [x15, #0x68]\n"
+    "fmla z22.s, p3/M, z4.s, z13.s\n"
+    "fmla z21.s, p3/M, z8.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "ldr x21, [x15, #0x70]\n"
+    "fmla z24.s, p3/M, z1.s, z16.s\n"
+    "fmla z23.s, p3/M, z0.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "ldr x20, [x15, #0x78]\n"
+    "fmla z22.s, p3/M, z5.s, z20.s\n"
+    "fmla z21.s, p3/M, z4.s, z20.s\n"
+    "incw x28\n"
+    "mov p0.b, p2.b\n"
+    "fmla z24.s, p3/M, z2.s, z18.s\n"
+    "fmla z23.s, p3/M, z1.s, z18.s\n"
+    "ld1w { z19.s }, p2/Z, [x23, x9, LSL #2]\n"
+    "fmla z22.s, p3/M, z0.s, z17.s\n"
+    "fmla z21.s, p3/M, z2.s, z16.s\n"
+    "fmla z24.s, p3/M, z8.s, z20.s\n"
+    "fmla z23.s, p3/M, z7.s, z20.s\n"
+    "ld1w { z18.s }, p2/Z, [x22, x9, LSL #2]\n"
+    "fmla z22.s, p3/M, z3.s, z19.s\n"
+    "fmla z21.s, p3/M, z5.s, z18.s\n"
+    "fmla z24.s, p3/M, z3.s, z17.s\n"
+    "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+    "fmla z23.s, p3/M, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+    "fmla z22.s, p3/M, z7.s, z17.s\n"
+    "fmla z21.s, p3/M, z6.s, z17.s\n"
+    "fmla z24.s, p3/M, z6.s, z19.s\n"
+    "fmla z23.s, p3/M, z8.s, z18.s\n"
+    "fmax z24.s, p3/M, z24.s, z26.s\n"
+    "fmax z23.s, p3/M, z23.s, z26.s\n"
+    "fmla z22.s, p3/M, z8.s, z16.s\n"
+    "fmla z21.s, p3/M, z7.s, z16.s\n"
+    "fmax z22.s, p3/M, z22.s, z26.s\n"
+    "fmax z21.s, p3/M, z21.s, z26.s\n"
+    "fmin z24.s, p3/M, z24.s, z25.s\n"
+    "fmin z23.s, p3/M, z23.s, z25.s\n"
+    "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+    "fmin z22.s, p3/M, z22.s, z25.s\n"
+    "fmin z21.s, p3/M, z21.s, z25.s\n"
+    "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+    "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+    "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+    :
+    : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..e4f432c9ed
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Kernel entry points; defined in the sibling
+// sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp and
+// generic_direct.cpp translation units.
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+// Strategy descriptor that binds the fp32 NHWC 3x3/stride-1 SVE kernels
+// (3x3 output tile) into the depthwise-depthfirst driver.  It carries the
+// compile-time tile geometry and hands back the two kernel entry points.
+class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+  private:
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  // Kernel function pointers for the two addressing modes.
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+  public:
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+  // 3x3 filter applied with unit stride, producing a 3x3 output tile.
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  constexpr static unsigned int output_rows = 3;
+  constexpr static unsigned int output_cols = 3;
+
+  sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+  : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..015d0e63c2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x13, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x3\n"
+ "mov x24, #0x3\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x13, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cntw x15\n"
+ "mul x20, x13, x21\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x12, x17, x17\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ldr x11, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x10, x14, x23, LSL #2\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x10, x23, LSL #2\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z14.s }, p3/Z, [x13]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
+ "add x28, x9, x23, LSL #2\n"
+ "ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x13, #4, MUL VL]\n"
+ "add x27, x12, x17\n"
+ "add x11, x11, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z4.s }, p3/Z, [x13, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x13, #6, MUL VL]\n"
+ "add x26, x28, x23, LSL #2\n"
+ "add x25, x27, x17\n"
+ "ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "add x24, x11, x21, LSL #2\n"
+ "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "cmp x15, %x[n_channels]\n"
+ "add x23, x24, x21, LSL #2\n"
+ "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
+ "add x22, x16, x16\n"
+ "mov x21, #0x0\n"
+ "ld1w { z8.s }, p3/Z, [x13, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x9, x12, LSL #2]\n"
+ "sub x20, XZR, x15\n"
+ "ld1w { z10.s }, p2/Z, [x14]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x25, LSL #2]\n"
+ "addvl x13, x13, #-6\n"
+ "ld1w { z12.s }, p2/Z, [x26]\n"
+ "ld1w { z13.s }, p2/Z, [x10, x12, LSL #2]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x15, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "incw x15\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "incw x20\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "fmla z29.s, p3/M, z6.s, z18.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z14.s }, p3/Z, [x13]\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z18.s\n"
+ "fmla z20.s, p3/M, z0.s, z18.s\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z22.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x10]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x28]\n"
+ "fmla z24.s, p3/M, z4.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z23.s\n"
+ "fmla z29.s, p3/M, z8.s, z23.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z20.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z16.s\n"
+ "fmla z26.s, p3/M, z8.s, z17.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "addvl x10, x10, #1\n"
+ "fmla z21.s, p3/M, z7.s, z17.s\n"
+ "fmla z28.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "addvl x28, x28, #1\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "addvl x14, x14, #1\n"
+ "fmla z20.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z19.s\n"
+ "ld1w { z4.s }, p3/Z, [x13, #5, MUL VL]\n"
+ "ld1w { z10.s }, p1/Z, [x14]\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x9]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "ld1w { z1.s }, p3/Z, [x13, #2, MUL VL]\n"
+ "fmla z27.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "addvl x9, x9, #1\n"
+ "fmla z20.s, p3/M, z5.s, z19.s\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "ld1w { z0.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "fmax z20.s, p3/M, z20.s, z31.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "addvl x26, x26, #1\n"
+ "ld1w { z2.s }, p3/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x13, #4, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x13, #6, MUL VL]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "cmp x15, %x[n_channels]\n"
+ "ld1w { z6.s }, p3/Z, [x13, #7, MUL VL]\n"
+ "addvl x13, x13, #16\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "ld1w { z9.s }, p1/Z, [x9, x12, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "ld1w { z11.s }, p1/Z, [x14, x25, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x26]\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "ld1w { z13.s }, p1/Z, [x10, x12, LSL #2]\n"
+ "st1w { z28.s }, p0, [x11]\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "fmin z20.s, p3/M, z20.s, z30.s\n"
+ "st1w { z29.s }, p0, [x11, x16, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x13, #-8, MUL VL]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "st1w { z27.s }, p0, [x11, x22, LSL #2]\n"
+ "addvl x11, x11, #1\n"
+ "ld1w { z8.s }, p3/Z, [x13, #-7, MUL VL]\n"
+ "st1w { z26.s }, p0, [x24]\n"
+ "addvl x13, x13, #-6\n"
+ "st1w { z25.s }, p0, [x24, x16, LSL #2]\n"
+ "st1w { z24.s }, p0, [x24, x22, LSL #2]\n"
+ "addvl x24, x24, #1\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z20.s }, p0, [x23, x16, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z29, z14\n fmla z29.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z8.s, z9.s\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z4.s, z13.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x8, x8, #0x1\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "cmp x8, x20\n"
+ "add x21, x13, #0x1\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, x27, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "csel x13, x13, x21, LT\n"
+ "fmla z29.s, p3/M, z6.s, z18.s\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "mov p0.b, p2.b\n"
+ "csel x8, x8, XZR, LT\n"
+ "fmla z28.s, p3/M, z5.s, z13.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "cmp x13, x20\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x25, LSL #2]\n"
+ "movprfx z20, z14\n fmla z20.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z7.s, z18.s\n"
+ "fmla z20.s, p3/M, z0.s, z18.s\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "fmla z22.s, p3/M, z1.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x10]\n"
+ "fmla z29.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x28]\n"
+ "fmla z24.s, p3/M, z4.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x25, LSL #2]\n"
+ "fmla z20.s, p3/M, z2.s, z23.s\n"
+ "fmla z21.s, p3/M, z1.s, z23.s\n"
+ "fmla z29.s, p3/M, z8.s, z23.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z20.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z17.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x25, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x26, x17, LSL #2]\n"
+ "fmla z26.s, p3/M, z6.s, z18.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x10, x17, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z16.s\n"
+ "fmla z26.s, p3/M, z8.s, z17.s\n"
+ "fmla z22.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x27, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z18.s\n"
+ "fmla z25.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x10, x27, LSL #2]\n"
+ "fmla z20.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z17.s\n"
+ "fmla z28.s, p3/M, z4.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z26.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x17, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z16.s\n"
+ "fmla z25.s, p3/M, z2.s, z16.s\n"
+ "fmla z24.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x12, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z20.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z7.s, z17.s\n"
+ "fmla z25.s, p3/M, z6.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x9]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z29.s, p3/M, z1.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, x25, LSL #2]\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "fmla z20.s, p3/M, z5.s, z19.s\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z2.s, z17.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x26, x12, LSL #2]\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z18.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "fmla z20.s, p3/M, z7.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "fmax z20.s, p3/M, z20.s, z31.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "st1w { z28.s }, p0, [x11]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "st1w { z29.s }, p0, [x11, x16, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "st1w { z27.s }, p0, [x11, x22, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "fmin z20.s, p3/M, z20.s, z30.s\n"
+ "st1w { z26.s }, p0, [x24]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "st1w { z25.s }, p0, [x24, x16, LSL #2]\n"
+ "st1w { z24.s }, p0, [x24, x22, LSL #2]\n"
+ "st1w { z22.s }, p0, [x23]\n"
+ "st1w { z20.s }, p0, [x23, x16, LSL #2]\n"
+ "st1w { z21.s }, p0, [x23, x22, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..4809b0c45c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,477 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x17, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1w { z14.s }, p3/Z, [x8]\n"
+ "cntw x16\n"
+ "mov x15, #0x0\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "sub x14, XZR, x16\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "ldp x24, x23, [x17, #0x0]\n"
+ "ldp x22, x21, [x17, #0x10]\n"
+ "ldr x20, [x17, #0x20]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z31.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z30.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "addvl x8, x8, #-6\n"
+ "ld1w { z10.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "incw x14\n"
+ "mov p1.b, p2.b\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "ldr x10, [x13, #0x0]\n"
+ "whilelt p0.s, x16, %x[n_channels]\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.s, p3/M, z0.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "ldr x9, [x13, #0x8]\n"
+ "ldr x28, [x13, #0x10]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z4.s, z19.s\n"
+ "ldr x27, [x13, #0x18]\n"
+ "ld1w { z14.s }, p3/Z, [x8]\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z1.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z0.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z1.s, z19.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ldr x25, [x17, #0x20]\n"
+ "fmla z22.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z25.s, p3/M, z6.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z18.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "st1w { z29.s }, p1, [x10, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "fmla z22.s, p3/M, z2.s, z17.s\n"
+ "ldr x24, [x13, #0x20]\n"
+ "st1w { z28.s }, p1, [x9, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldp x23, x22, [x17, #0x0]\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "ldp x21, x20, [x17, #0x10]\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "incw x15\n"
+ "ld1w { z9.s }, p0/Z, [x23, x16, LSL #2]\n"
+ "ld1w { z10.s }, p0/Z, [x22, x16, LSL #2]\n"
+ "ld1w { z11.s }, p0/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z12.s }, p0/Z, [x20, x16, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "ld1w { z13.s }, p0/Z, [x25, x16, LSL #2]\n"
+ "incw x16\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "st1w { z27.s }, p1, [x28, x14, LSL #2]\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "fmax z23.s, p3/M, z23.s, z31.s\n"
+ "st1w { z26.s }, p1, [x27, x14, LSL #2]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "st1w { z25.s }, p1, [x24, x14, LSL #2]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "whilelt p2.s, x15, %x[n_channels]\n"
+ "cmp x16, %x[n_channels]\n"
+ "ld1w { z0.s }, p3/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x8, #2, MUL VL]\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "fmin z23.s, p3/M, z23.s, z30.s\n"
+ "ld1w { z2.s }, p3/Z, [x8, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x8, #4, MUL VL]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "ld1w { z4.s }, p3/Z, [x8, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x8, #6, MUL VL]\n"
+ "st1w { z24.s }, p1, [x23, x14, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x8, #7, MUL VL]\n"
+ "addvl x8, x8, #16\n"
+ "st1w { z23.s }, p1, [x22, x14, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x8, #-8, MUL VL]\n"
+ "st1w { z21.s }, p1, [x21, x14, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x8, #-7, MUL VL]\n"
+ "addvl x8, x8, #-6\n"
+ "st1w { z22.s }, p1, [x20, x14, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z29, z14\n fmla z29.s, p3/M, z8.s, z9.s\n"
+ "movprfx z28, z14\n fmla z28.s, p3/M, z7.s, z9.s\n"
+ "ldr x23, [x17, #0x30]\n"
+ "ldr x26, [x17, #0x38]\n"
+ "movprfx z27, z14\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z29.s, p3/M, z0.s, z10.s\n"
+ "ldr x22, [x17, #0x28]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmla z28.s, p3/M, z4.s, z13.s\n"
+ "movprfx z26, z14\n fmla z26.s, p3/M, z5.s, z9.s\n"
+ "ldr x20, [x17, #0x40]\n"
+ "ld1w { z19.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "movprfx z25, z14\n fmla z25.s, p3/M, z4.s, z9.s\n"
+ "movprfx z24, z14\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "ldr x25, [x17, #0x50]\n"
+ "ldr x24, [x17, #0x58]\n"
+ "fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "movprfx z23, z14\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ldr x23, [x17, #0x60]\n"
+ "fmla z29.s, p3/M, z5.s, z13.s\n"
+ "fmla z28.s, p3/M, z6.s, z18.s\n"
+ "ldr x12, [x17, #0x70]\n"
+ "ldr x11, [x17, #0x88]\n"
+ "movprfx z22, z14\n fmla z22.s, p3/M, z0.s, z9.s\n"
+ "fmla z27.s, p3/M, z3.s, z13.s\n"
+ "incw x14\n"
+ "mov p0.b, p2.b\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z25.s, p3/M, z1.s, z13.s\n"
+ "ldr x10, [x13, #0x0]\n"
+ "ldr x9, [x13, #0x8]\n"
+ "fmla z24.s, p3/M, z0.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "movprfx z21, z14\n fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z29.s, p3/M, z7.s, z18.s\n"
+ "ldr x22, [x17, #0x68]\n"
+ "ldr x21, [x17, #0x78]\n"
+ "fmla z28.s, p3/M, z0.s, z17.s\n"
+ "fmla z22.s, p3/M, z8.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x20, [x17, #0x80]\n"
+ "fmla z26.s, p3/M, z4.s, z18.s\n"
+ "fmla z25.s, p3/M, z3.s, z18.s\n"
+ "ldr x28, [x13, #0x10]\n"
+ "ldr x27, [x13, #0x18]\n"
+ "fmla z21.s, p3/M, z0.s, z18.s\n"
+ "fmla z24.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z1.s, z18.s\n"
+ "fmla z29.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z20.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z16.s\n"
+ "fmla z27.s, p3/M, z1.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "ldr x26, [x17, #0x90]\n"
+ "fmla z25.s, p3/M, z5.s, z19.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "ldr x25, [x17, #0xa0]\n"
+ "ldr x24, [x17, #0x98]\n"
+ "fmla z26.s, p3/M, z0.s, z20.s\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z28.s, p3/M, z8.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z19.s\n"
+ "fmla z22.s, p3/M, z1.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "ldr x23, [x17, #0xa8]\n"
+ "fmla z26.s, p3/M, z6.s, z16.s\n"
+ "fmla z25.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z19.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "ldr x22, [x17, #0xc0]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z18.s\n"
+ "fmla z29.s, p3/M, z3.s, z20.s\n"
+ "fmla z27.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "ldr x21, [x17, #0xb0]\n"
+ "ldr x20, [x17, #0xb8]\n"
+ "fmla z26.s, p3/M, z8.s, z18.s\n"
+ "fmla z24.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z6.s, z16.s\n"
+ "fmla z28.s, p3/M, z3.s, z19.s\n"
+ "fmla z25.s, p3/M, z0.s, z19.s\n"
+ "fmla z22.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x26, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z4.s, z19.s\n"
+ "fmla z26.s, p3/M, z1.s, z19.s\n"
+ "fmla z28.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x24, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z17.s\n"
+ "fmla z25.s, p3/M, z2.s, z17.s\n"
+ "fmla z24.s, p3/M, z1.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z2.s, z17.s\n"
+ "fmla z26.s, p3/M, z7.s, z16.s\n"
+ "fmla z25.s, p3/M, z6.s, z16.s\n"
+ "fmla z23.s, p3/M, z4.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z18.s\n"
+ "fmla z28.s, p3/M, z1.s, z17.s\n"
+ "fmax z28.s, p3/M, z28.s, z31.s\n"
+ "fmin z28.s, p3/M, z28.s, z30.s\n"
+ "fmla z27.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x15, LSL #2]\n"
+ "fmla z29.s, p3/M, z6.s, z16.s\n"
+ "fmax z29.s, p3/M, z29.s, z31.s\n"
+ "fmla z24.s, p3/M, z7.s, z18.s\n"
+ "fmla z21.s, p3/M, z5.s, z18.s\n"
+ "fmin z29.s, p3/M, z29.s, z30.s\n"
+ "st1w { z29.s }, p0, [x10, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "fmla z22.s, p3/M, z2.s, z17.s\n"
+ "ldr x20, [x13, #0x20]\n"
+ "st1w { z28.s }, p0, [x9, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z8.s, z18.s\n"
+ "fmla z26.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x15, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z31.s\n"
+ "fmla z27.s, p3/M, z8.s, z17.s\n"
+ "fmla z24.s, p3/M, z5.s, z17.s\n"
+ "fmax z27.s, p3/M, z27.s, z31.s\n"
+ "fmax z25.s, p3/M, z25.s, z31.s\n"
+ "fmla z23.s, p3/M, z8.s, z16.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "fmin z27.s, p3/M, z27.s, z30.s\n"
+ "fmin z26.s, p3/M, z26.s, z30.s\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmin z25.s, p3/M, z25.s, z30.s\n"
+ "fmax z24.s, p3/M, z24.s, z31.s\n"
+ "st1w { z27.s }, p0, [x28, x14, LSL #2]\n"
+ "fmax z23.s, p3/M, z23.s, z31.s\n"
+ "fmax z21.s, p3/M, z21.s, z31.s\n"
+ "st1w { z26.s }, p0, [x27, x14, LSL #2]\n"
+ "ldr x23, [x13, #0x28]\n"
+ "fmax z22.s, p3/M, z22.s, z31.s\n"
+ "st1w { z25.s }, p0, [x20, x14, LSL #2]\n"
+ "ldr x22, [x13, #0x30]\n"
+ "ldr x21, [x13, #0x38]\n"
+ "ldr x20, [x13, #0x40]\n"
+ "fmin z24.s, p3/M, z24.s, z30.s\n"
+ "fmin z23.s, p3/M, z23.s, z30.s\n"
+ "st1w { z24.s }, p0, [x23, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z30.s\n"
+ "fmin z22.s, p3/M, z22.s, z30.s\n"
+ "st1w { z23.s }, p0, [x22, x14, LSL #2]\n"
+ "st1w { z21.s }, p0, [x21, x14, LSL #2]\n"
+ "st1w { z22.s }, p0, [x20, x14, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..38b377509e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
+ sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..35445595f8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,656 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x16, #0x0\n"
+ "mov x4, #0x0\n"
+ "1:" // Tile loop
+ "str x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x24, #0x4\n"
+ "str x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "mul x21, x16, x23\n" // offset = tile_i * ld_input_row
+ "ldr x5, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x20, x16, x22\n" // offset = tile_i * ld_output_row
+ "add x7, x5, x5\n"
+ "madd x21, x4, x5, x21\n" // offset += tile_j * ld_input_col
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "cntw x16\n"
+ "madd x20, x4, x6, x20\n" // offset += tile_j * ld_output_col
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x14, x7, x5\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "mul x21, x21, x25\n" // offset *= kernel_stride * output_size
+ "add x8, x8, x21, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x13, x8, x23, LSL #2\n"
+ "ld1w { z19.s }, p3/Z, [x17]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "add x12, x13, x23, LSL #2\n"
+ "add x15, x15, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "add x11, x12, x23, LSL #2\n"
+ "add x10, x14, x5\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "add x9, x15, x22, LSL #2\n"
+ "add x28, x11, x23, LSL #2\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "add x27, x10, x5\n"
+ "add x26, x9, x22, LSL #2\n"
+ "add x25, x6, x6\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "cmp x16, %x[n_channels]\n"
+ "add x24, x28, x23, LSL #2\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "add x23, x26, x22, LSL #2\n"
+ "add x22, x25, x6\n"
+ "ld1w { z9.s }, p2/Z, [x12, x7, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x8]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x16\n"
+ "ld1w { z11.s }, p2/Z, [x8, x27, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x14, LSL #2]\n"
+ "addvl x17, x17, #-6\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z14, z19\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z8.s, z9.s\n"
+ "whilelt p1.s, x16, %x[n_channels]\n"
+ "incw x21\n"
+ "movprfx z21, z19\n fmla z21.s, p3/M, z3.s, z9.s\n"
+ "movprfx z22, z19\n fmla z22.s, p3/M, z1.s, z9.s\n"
+ "incw x16\n"
+ "mov p0.b, p2.b\n"
+ "movprfx z20, z19\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "incw x20\n"
+ "movprfx z13, z19\n fmla z13.s, p3/M, z7.s, z9.s\n"
+ "movprfx z17, z19\n fmla z17.s, p3/M, z6.s, z9.s\n"
+ "movprfx z27, z19\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "movprfx z18, z19\n fmla z18.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z29.s }, p2/Z, [x24]\n"
+ "ld1w { z11.s }, p2/Z, [x24, x27, LSL #2]\n"
+ "fmla z21.s, p3/M, z4.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z12.s\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "movprfx z23, z19\n fmla z23.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z10.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z14.s, p3/M, z7.s, z9.s\n"
+ "fmla z13.s, p3/M, z8.s, z12.s\n"
+ "fmla z17.s, p3/M, z7.s, z12.s\n"
+ "fmla z30.s, p3/M, z6.s, z12.s\n"
+ "movprfx z26, z19\n fmla z26.s, p3/M, z3.s, z12.s\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x8, x5, LSL #2]\n"
+ "movprfx z24, z19\n fmla z24.s, p3/M, z8.s, z11.s\n"
+ "fmla z21.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x8, x10, LSL #2]\n"
+ "fmla z22.s, p3/M, z4.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "movprfx z25, z19\n fmla z25.s, p3/M, z1.s, z9.s\n"
+ "movprfx z29, z19\n fmla z29.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z19.s }, p3/Z, [x17]\n"
+ "fmla z27.s, p3/M, z8.s, z9.s\n"
+ "fmla z18.s, p3/M, z5.s, z9.s\n"
+ "fmla z23.s, p3/M, z2.s, z9.s\n"
+ "fmla z14.s, p3/M, z8.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x13]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z13.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z17.s, p3/M, z2.s, z11.s\n"
+ "fmla z30.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28]\n"
+ "fmla z21.s, p3/M, z7.s, z10.s\n"
+ "fmla z26.s, p3/M, z6.s, z10.s\n"
+ "fmla z22.s, p3/M, z5.s, z10.s\n"
+ "fmla z20.s, p3/M, z4.s, z10.s\n"
+ "fmla z28.s, p3/M, z3.s, z10.s\n"
+ "fmla z25.s, p3/M, z2.s, z10.s\n"
+ "fmla z29.s, p3/M, z1.s, z10.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z27.s, p3/M, z0.s, z9.s\n"
+ "fmla z18.s, p3/M, z6.s, z11.s\n"
+ "fmla z23.s, p3/M, z3.s, z11.s\n"
+ "fmla z14.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "fmla z30.s, p3/M, z5.s, z12.s\n"
+ "fmla z26.s, p3/M, z2.s, z12.s\n"
+ "fmla z13.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z9.s }, p2/Z, [x13, x14, LSL #2]\n"
+ "fmla z17.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z0.s, z10.s\n"
+ "fmla z28.s, p3/M, z8.s, z11.s\n"
+ "fmla z24.s, p3/M, z5.s, z11.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x5, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z10.s\n"
+ "fmla z14.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z5.s, z10.s\n"
+ "fmla z13.s, p3/M, z5.s, z9.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z9.s\n"
+ "fmla z30.s, p3/M, z3.s, z9.s\n"
+ "fmla z21.s, p3/M, z1.s, z9.s\n"
+ "fmla z26.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z10.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z12.s\n"
+ "fmla z25.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z4.s, z11.s\n"
+ "fmla z14.s, p3/M, z3.s, z11.s\n"
+ "fmla z18.s, p3/M, z1.s, z11.s\n"
+ "fmla z22.s, p3/M, z0.s, z11.s\n"
+ "fmla z31.s, p3/M, z7.s, z11.s\n"
+ "fmla z13.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z9.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z8.s, z12.s\n"
+ "fmla z24.s, p3/M, z7.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z8.s, z10.s\n"
+ "fmla z30.s, p3/M, z7.s, z10.s\n"
+ "fmla z21.s, p3/M, z5.s, z10.s\n"
+ "fmla z26.s, p3/M, z4.s, z10.s\n"
+ "fmla z20.s, p3/M, z2.s, z10.s\n"
+ "fmla z28.s, p3/M, z1.s, z10.s\n"
+ "ld1w { z11.s }, p2/Z, [x8, x14, LSL #2]\n"
+ "addvl x8, x8, #1\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "fmla z14.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmla z22.s, p3/M, z3.s, z12.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z9.s\n"
+ "fmla z13.s, p3/M, z1.s, z9.s\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x12]\n"
+ "fmla z29.s, p3/M, z2.s, z12.s\n"
+ "fmla z30.s, p3/M, z0.s, z11.s\n"
+ "fmla z27.s, p3/M, z3.s, z9.s\n"
+ "fmla z18.s, p3/M, z0.s, z9.s\n"
+ "fmla z21.s, p3/M, z8.s, z12.s\n"
+ "fmla z26.s, p3/M, z7.s, z12.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z13.s, p3/M, z2.s, z11.s\n"
+ "fmla z17.s, p3/M, z1.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "addvl x12, x12, #1\n"
+ "fmla z31.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z12.s }, p2/Z, [x11]\n"
+ "fmla z25.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z9.s }, p1/Z, [x12, x7, LSL #2]\n"
+ "fmla z29.s, p3/M, z3.s, z10.s\n"
+ "fmla z30.s, p3/M, z8.s, z11.s\n"
+ "fmla z26.s, p3/M, z5.s, z11.s\n"
+ "fmla z28.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z3.s, z12.s\n"
+ "fmla z23.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z11.s\n"
+ "fmla z25.s, p3/M, z7.s, z12.s\n"
+ "fmla z29.s, p3/M, z6.s, z12.s\n"
+ "fmla z18.s, p3/M, z8.s, z10.s\n"
+ "fmla z22.s, p3/M, z7.s, z10.s\n"
+ "fmla z20.s, p3/M, z6.s, z10.s\n"
+ "fmla z23.s, p3/M, z5.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z5.s, z11.s\n"
+ "fmla z25.s, p3/M, z5.s, z10.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
+ "fmla z24.s, p3/M, z3.s, z10.s\n"
+ "fmla z26.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x13, x5, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z10.s\n"
+ "fmla z20.s, p3/M, z7.s, z10.s\n"
+ "addvl x24, x24, #1\n"
+ "fmla z28.s, p3/M, z6.s, z10.s\n"
+ "fmla z25.s, p3/M, z8.s, z11.s\n"
+ "ld1w { z10.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "addvl x13, x13, #1\n"
+ "fmla z29.s, p3/M, z7.s, z11.s\n"
+ "fmla z24.s, p3/M, z6.s, z11.s\n"
+ "ld1w { z11.s }, p2/Z, [x28, x5, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z12.s\n"
+ "fmla z13.s, p3/M, z3.s, z12.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmax z13.s, p3/M, z13.s, z15.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "fmla z14.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z0.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmla z17.s, p3/M, z5.s, z10.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmla z21.s, p3/M, z2.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z10.s\n"
+ "fmax z14.s, p3/M, z14.s, z15.s\n"
+ "fmax z21.s, p3/M, z21.s, z15.s\n"
+ "fmla z18.s, p3/M, z7.s, z11.s\n"
+ "fmla z22.s, p3/M, z6.s, z11.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z11.s\n"
+ "fmla z25.s, p3/M, z3.s, z11.s\n"
+ "fmax z22.s, p3/M, z22.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmla z20.s, p3/M, z8.s, z0.s\n"
+ "fmla z28.s, p3/M, z7.s, z0.s\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmla z29.s, p3/M, z5.s, z0.s\n"
+ "fmla z24.s, p3/M, z4.s, z0.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "ld1w { z0.s }, p3/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x17, #2, MUL VL]\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "ld1w { z2.s }, p3/Z, [x17, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x17, #4, MUL VL]\n"
+ "cmp x16, %x[n_channels]\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "ld1w { z4.s }, p3/Z, [x17, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x17, #6, MUL VL]\n"
+ "fmin z13.s, p3/M, z13.s, z16.s\n"
+ "fmin z17.s, p3/M, z17.s, z16.s\n"
+ "ld1w { z6.s }, p3/Z, [x17, #7, MUL VL]\n"
+ "addvl x17, x17, #16\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "ld1w { z10.s }, p1/Z, [x8]\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmin z14.s, p3/M, z14.s, z16.s\n"
+ "ld1w { z11.s }, p1/Z, [x8, x27, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x12, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z31.s }, p0, [x15]\n"
+ "ld1w { z7.s }, p3/Z, [x17, #-8, MUL VL]\n"
+ "fmin z18.s, p3/M, z18.s, z16.s\n"
+ "fmin z22.s, p3/M, z22.s, z16.s\n"
+ "st1w { z13.s }, p0, [x15, x6, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x17, #-7, MUL VL]\n"
+ "fmin z20.s, p3/M, z20.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z17.s }, p0, [x15, x25, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "st1w { z30.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z29.s, p3/M, z29.s, z16.s\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z27.s }, p0, [x9]\n"
+ "addvl x28, x28, #1\n"
+ "st1w { z14.s }, p0, [x9, x6, LSL #2]\n"
+ "addvl x15, x15, #1\n"
+ "st1w { z21.s }, p0, [x9, x25, LSL #2]\n"
+ "addvl x17, x17, #-6\n"
+ "st1w { z26.s }, p0, [x9, x22, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z18.s }, p0, [x26]\n"
+ "st1w { z22.s }, p0, [x26, x6, LSL #2]\n"
+ "st1w { z20.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z28.s }, p0, [x26, x22, LSL #2]\n"
+ "addvl x26, x26, #1\n"
+ "st1w { z23.s }, p0, [x23]\n"
+ "st1w { z25.s }, p0, [x23, x6, LSL #2]\n"
+ "st1w { z29.s }, p0, [x23, x25, LSL #2]\n"
+ "st1w { z24.s }, p0, [x23, x22, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z14, z19\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z31, z19\n fmla z31.s, p3/M, z8.s, z9.s\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "movprfx z30, z19\n fmla z30.s, p3/M, z3.s, z9.s\n"
+ "movprfx z13, z19\n fmla z13.s, p3/M, z1.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "add x4, x4, #0x1\n"
+ "movprfx z20, z19\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "cmp x4, x20\n"
+ "add x21, x16, #0x1\n"
+ "movprfx z18, z19\n fmla z18.s, p3/M, z7.s, z9.s\n"
+ "movprfx z28, z19\n fmla z28.s, p3/M, z6.s, z9.s\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "csel x16, x16, x21, LT\n"
+ "movprfx z17, z19\n fmla z17.s, p3/M, z5.s, z9.s\n"
+ "movprfx z26, z19\n fmla z26.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x7, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z31.s, p3/M, z0.s, z10.s\n"
+ "movprfx z27, z19\n fmla z27.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z29.s }, p2/Z, [x24]\n"
+ "ld1w { z21.s }, p2/Z, [x24, x27, LSL #2]\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z13.s, p3/M, z2.s, z12.s\n"
+ "csel x4, x4, XZR, LT\n"
+ "cmp x16, x20\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "movprfx z10, z19\n fmla z10.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x11, x14, LSL #2]\n"
+ "fmla z14.s, p3/M, z7.s, z9.s\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmla z28.s, p3/M, z7.s, z12.s\n"
+ "fmla z27.s, p3/M, z6.s, z12.s\n"
+ "movprfx z11, z19\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z25, z19\n fmla z25.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z22.s }, p2/Z, [x8, x5, LSL #2]\n"
+ "movprfx z24, z19\n fmla z24.s, p3/M, z8.s, z21.s\n"
+ "fmla z30.s, p3/M, z6.s, z9.s\n"
+ "ld1w { z21.s }, p2/Z, [x8, x10, LSL #2]\n"
+ "fmla z13.s, p3/M, z4.s, z9.s\n"
+ "fmla z20.s, p3/M, z3.s, z9.s\n"
+ "movprfx z12, z19\n fmla z12.s, p3/M, z1.s, z9.s\n"
+ "movprfx z23, z19\n fmla z23.s, p3/M, z0.s, z9.s\n"
+ "fmla z17.s, p3/M, z8.s, z9.s\n"
+ "fmla z26.s, p3/M, z5.s, z9.s\n"
+ "fmla z10.s, p3/M, z2.s, z9.s\n"
+ "fmla z14.s, p3/M, z8.s, z29.s\n"
+ "ld1w { z9.s }, p2/Z, [x13]\n"
+ "fmla z31.s, p3/M, z1.s, z22.s\n"
+ "fmla z18.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x13, x27, LSL #2]\n"
+ "fmla z28.s, p3/M, z2.s, z21.s\n"
+ "fmla z27.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28]\n"
+ "fmla z30.s, p3/M, z7.s, z29.s\n"
+ "fmla z11.s, p3/M, z6.s, z29.s\n"
+ "fmla z13.s, p3/M, z5.s, z29.s\n"
+ "fmla z20.s, p3/M, z4.s, z29.s\n"
+ "fmla z25.s, p3/M, z3.s, z29.s\n"
+ "fmla z12.s, p3/M, z2.s, z29.s\n"
+ "fmla z23.s, p3/M, z1.s, z29.s\n"
+ "fmla z24.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x13, x7, LSL #2]\n"
+ "fmla z17.s, p3/M, z0.s, z9.s\n"
+ "fmla z26.s, p3/M, z6.s, z19.s\n"
+ "fmla z10.s, p3/M, z3.s, z19.s\n"
+ "fmla z14.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z9.s\n"
+ "fmla z27.s, p3/M, z5.s, z22.s\n"
+ "fmla z11.s, p3/M, z2.s, z22.s\n"
+ "fmla z18.s, p3/M, z4.s, z21.s\n"
+ "ld1w { z29.s }, p2/Z, [x13, x14, LSL #2]\n"
+ "fmla z28.s, p3/M, z3.s, z21.s\n"
+ "fmla z30.s, p3/M, z0.s, z21.s\n"
+ "fmla z25.s, p3/M, z8.s, z19.s\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x5, LSL #2]\n"
+ "fmla z17.s, p3/M, z2.s, z21.s\n"
+ "fmla z14.s, p3/M, z2.s, z29.s\n"
+ "fmla z31.s, p3/M, z5.s, z21.s\n"
+ "fmla z18.s, p3/M, z5.s, z29.s\n"
+ "ld1w { z22.s }, p2/Z, [x12, x5, LSL #2]\n"
+ "fmla z28.s, p3/M, z4.s, z29.s\n"
+ "fmla z27.s, p3/M, z3.s, z29.s\n"
+ "fmla z30.s, p3/M, z1.s, z29.s\n"
+ "fmla z11.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "fmla z10.s, p3/M, z7.s, z19.s\n"
+ "fmla z12.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x10, LSL #2]\n"
+ "fmla z17.s, p3/M, z4.s, z22.s\n"
+ "fmla z14.s, p3/M, z3.s, z22.s\n"
+ "fmla z26.s, p3/M, z1.s, z22.s\n"
+ "fmla z13.s, p3/M, z0.s, z22.s\n"
+ "fmla z31.s, p3/M, z7.s, z22.s\n"
+ "fmla z18.s, p3/M, z6.s, z22.s\n"
+ "ld1w { z29.s }, p2/Z, [x8, x7, LSL #2]\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z19.s\n"
+ "ld1w { z19.s }, p2/Z, [x11, x5, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z21.s\n"
+ "fmla z27.s, p3/M, z7.s, z21.s\n"
+ "fmla z30.s, p3/M, z5.s, z21.s\n"
+ "fmla z11.s, p3/M, z4.s, z21.s\n"
+ "fmla z20.s, p3/M, z2.s, z21.s\n"
+ "fmla z25.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z22.s }, p2/Z, [x8, x14, LSL #2]\n"
+ "fmla z17.s, p3/M, z7.s, z19.s\n"
+ "fmla z14.s, p3/M, z6.s, z19.s\n"
+ "fmla z26.s, p3/M, z4.s, z19.s\n"
+ "fmla z13.s, p3/M, z3.s, z19.s\n"
+ "fmla z10.s, p3/M, z1.s, z19.s\n"
+ "fmla z12.s, p3/M, z0.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x11, x10, LSL #2]\n"
+ "fmla z31.s, p3/M, z2.s, z29.s\n"
+ "fmla z18.s, p3/M, z1.s, z29.s\n"
+ "fmla z28.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x12]\n"
+ "fmla z23.s, p3/M, z2.s, z21.s\n"
+ "fmla z27.s, p3/M, z0.s, z22.s\n"
+ "fmla z17.s, p3/M, z3.s, z29.s\n"
+ "fmla z26.s, p3/M, z0.s, z29.s\n"
+ "fmla z30.s, p3/M, z8.s, z21.s\n"
+ "fmla z11.s, p3/M, z7.s, z21.s\n"
+ "fmla z20.s, p3/M, z5.s, z21.s\n"
+ "fmla z25.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z19.s }, p2/Z, [x28, x7, LSL #2]\n"
+ "fmla z18.s, p3/M, z2.s, z22.s\n"
+ "fmla z28.s, p3/M, z1.s, z22.s\n"
+ "ld1w { z21.s }, p2/Z, [x12, x27, LSL #2]\n"
+ "fmla z31.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x11]\n"
+ "fmla z12.s, p3/M, z4.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "fmla z27.s, p3/M, z8.s, z21.s\n"
+ "fmla z11.s, p3/M, z5.s, z21.s\n"
+ "fmla z25.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z9.s }, p2/Z, [x11, x27, LSL #2]\n"
+ "fmla z17.s, p3/M, z6.s, z29.s\n"
+ "fmla z26.s, p3/M, z3.s, z29.s\n"
+ "fmla z10.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z22.s }, p2/Z, [x24, x7, LSL #2]\n"
+ "fmla z24.s, p3/M, z2.s, z9.s\n"
+ "fmla z12.s, p3/M, z7.s, z22.s\n"
+ "fmla z23.s, p3/M, z6.s, z22.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "fmla z13.s, p3/M, z7.s, z19.s\n"
+ "fmla z20.s, p3/M, z6.s, z19.s\n"
+ "fmla z10.s, p3/M, z5.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x14, LSL #2]\n"
+ "fmla z25.s, p3/M, z5.s, z9.s\n"
+ "fmla z12.s, p3/M, z5.s, z21.s\n"
+ "fmla z23.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z3.s, z21.s\n"
+ "fmla z11.s, p3/M, z8.s, z9.s\n"
+ "ld1w { z19.s }, p2/Z, [x24, x14, LSL #2]\n"
+ "fmla z10.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z22.s }, p2/Z, [x13, x5, LSL #2]\n"
+ "fmla z13.s, p3/M, z8.s, z21.s\n"
+ "fmla z20.s, p3/M, z7.s, z21.s\n"
+ "fmla z25.s, p3/M, z6.s, z21.s\n"
+ "fmla z12.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z29.s }, p2/Z, [x13, x10, LSL #2]\n"
+ "fmla z23.s, p3/M, z7.s, z19.s\n"
+ "fmla z24.s, p3/M, z6.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x5, LSL #2]\n"
+ "fmla z31.s, p3/M, z4.s, z22.s\n"
+ "fmla z18.s, p3/M, z3.s, z22.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmax z18.s, p3/M, z18.s, z15.s\n"
+ "fmla z17.s, p3/M, z1.s, z22.s\n"
+ "fmla z14.s, p3/M, z0.s, z22.s\n"
+ "ld1w { z9.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmax z17.s, p3/M, z17.s, z15.s\n"
+ "fmla z28.s, p3/M, z5.s, z29.s\n"
+ "fmla z27.s, p3/M, z4.s, z29.s\n"
+ "fmax z28.s, p3/M, z28.s, z15.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmla z30.s, p3/M, z2.s, z29.s\n"
+ "fmla z11.s, p3/M, z1.s, z29.s\n"
+ "fmax z14.s, p3/M, z14.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmla z26.s, p3/M, z7.s, z21.s\n"
+ "fmla z13.s, p3/M, z6.s, z21.s\n"
+ "fmax z11.s, p3/M, z11.s, z15.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmla z10.s, p3/M, z4.s, z21.s\n"
+ "fmla z12.s, p3/M, z3.s, z21.s\n"
+ "fmax z13.s, p3/M, z13.s, z15.s\n"
+ "fmax z10.s, p3/M, z10.s, z15.s\n"
+ "fmla z20.s, p3/M, z8.s, z9.s\n"
+ "fmla z25.s, p3/M, z7.s, z9.s\n"
+ "fmax z20.s, p3/M, z20.s, z15.s\n"
+ "fmax z25.s, p3/M, z25.s, z15.s\n"
+ "fmla z23.s, p3/M, z5.s, z9.s\n"
+ "fmla z24.s, p3/M, z4.s, z9.s\n"
+ "fmax z12.s, p3/M, z12.s, z15.s\n"
+ "fmax z23.s, p3/M, z23.s, z15.s\n"
+ "fmax z24.s, p3/M, z24.s, z15.s\n"
+ "fmin z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z31.s }, p0, [x15]\n"
+ "fmin z18.s, p3/M, z18.s, z16.s\n"
+ "fmin z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z18.s }, p0, [x15, x6, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z16.s\n"
+ "fmin z17.s, p3/M, z17.s, z16.s\n"
+ "st1w { z28.s }, p0, [x15, x25, LSL #2]\n"
+ "fmin z14.s, p3/M, z14.s, z16.s\n"
+ "fmin z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z27.s }, p0, [x15, x22, LSL #2]\n"
+ "fmin z11.s, p3/M, z11.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z17.s }, p0, [x9]\n"
+ "fmin z13.s, p3/M, z13.s, z16.s\n"
+ "fmin z20.s, p3/M, z20.s, z16.s\n"
+ "st1w { z14.s }, p0, [x9, x6, LSL #2]\n"
+ "fmin z25.s, p3/M, z25.s, z16.s\n"
+ "fmin z10.s, p3/M, z10.s, z16.s\n"
+ "st1w { z30.s }, p0, [x9, x25, LSL #2]\n"
+ "fmin z12.s, p3/M, z12.s, z16.s\n"
+ "fmin z23.s, p3/M, z23.s, z16.s\n"
+ "st1w { z11.s }, p0, [x9, x22, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z26.s }, p0, [x26]\n"
+ "st1w { z13.s }, p0, [x26, x6, LSL #2]\n"
+ "st1w { z20.s }, p0, [x26, x25, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x22, LSL #2]\n"
+ "st1w { z10.s }, p0, [x23]\n"
+ "st1w { z12.s }, p0, [x23, x6, LSL #2]\n"
+ "st1w { z23.s }, p0, [x23, x25, LSL #2]\n"
+ "st1w { z24.s }, p0, [x23, x22, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..3db248924f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x8, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ld1w { z17.s }, p3/Z, [x7]\n"
+ "cntw x17\n"
+ "mov x16, #0x0\n"
+ "ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x7, #2, MUL VL]\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z2.s }, p3/Z, [x7, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x7, #4, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "ld1w { z4.s }, p3/Z, [x7, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x7, #6, MUL VL]\n"
+ "sub x15, XZR, x17\n"
+ "ld1w { z6.s }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "ldp x23, x22, [x8, #0x0]\n"
+ "ldp x21, x20, [x8, #0x10]\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ld1rw { z16.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z19.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "ld1w { z9.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z20, z17\n fmla z20.s, p3/M, z4.s, z9.s\n"
+ "movprfx z26, z17\n fmla z26.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z24, z17\n fmla z24.s, p3/M, z3.s, z9.s\n"
+ "movprfx z30, z17\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z31, z17\n fmla z31.s, p3/M, z0.s, z9.s\n"
+ "movprfx z22, z17\n fmla z22.s, p3/M, z7.s, z9.s\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z27, z17\n fmla z27.s, p3/M, z6.s, z9.s\n"
+ "fmla z20.s, p3/M, z5.s, z12.s\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z14, z17\n fmla z14.s, p3/M, z5.s, z9.s\n"
+ "movprfx z23, z17\n fmla z23.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z25.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z26.s, p3/M, z0.s, z10.s\n"
+ "movprfx z9, z17\n fmla z9.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z28.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z24.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z31.s, p3/M, z1.s, z12.s\n"
+ "fmla z22.s, p3/M, z8.s, z12.s\n"
+ "incw x15\n"
+ "mov p1.b, p2.b\n"
+ "fmla z27.s, p3/M, z7.s, z12.s\n"
+ "movprfx z15, z17\n fmla z15.s, p3/M, z6.s, z28.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z20.s, p3/M, z7.s, z25.s\n"
+ "fmla z9.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z13, z17\n fmla z13.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z28, z17\n fmla z28.s, p3/M, z8.s, z21.s\n"
+ "fmla z24.s, p3/M, z6.s, z25.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.s, p3/M, z4.s, z25.s\n"
+ "fmla z31.s, p3/M, z3.s, z25.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z18, z17\n fmla z18.s, p3/M, z1.s, z25.s\n"
+ "movprfx z21, z17\n fmla z21.s, p3/M, z0.s, z25.s\n"
+ "whilelt p0.s, x17, %x[n_channels]\n"
+ "ld1w { z17.s }, p3/Z, [x7]\n"
+ "fmla z14.s, p3/M, z8.s, z25.s\n"
+ "fmla z23.s, p3/M, z5.s, z25.s\n"
+ "fmla z15.s, p3/M, z2.s, z25.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z22.s, p3/M, z0.s, z12.s\n"
+ "fmla z27.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z20.s, p3/M, z8.s, z10.s\n"
+ "fmla z9.s, p3/M, z1.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z24.s, p3/M, z7.s, z10.s\n"
+ "fmla z11.s, p3/M, z6.s, z10.s\n"
+ "fmla z30.s, p3/M, z5.s, z10.s\n"
+ "fmla z31.s, p3/M, z4.s, z10.s\n"
+ "fmla z13.s, p3/M, z3.s, z10.s\n"
+ "fmla z18.s, p3/M, z2.s, z10.s\n"
+ "fmla z21.s, p3/M, z1.s, z10.s\n"
+ "fmla z28.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z26.s, p3/M, z3.s, z25.s\n"
+ "fmla z14.s, p3/M, z0.s, z25.s\n"
+ "fmla z23.s, p3/M, z6.s, z29.s\n"
+ "fmla z15.s, p3/M, z3.s, z29.s\n"
+ "ld1w { z25.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z22.s, p3/M, z4.s, z10.s\n"
+ "fmla z27.s, p3/M, z3.s, z10.s\n"
+ "fmla z20.s, p3/M, z1.s, z10.s\n"
+ "fmla z9.s, p3/M, z5.s, z12.s\n"
+ "fmla z11.s, p3/M, z2.s, z12.s\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z13.s, p3/M, z8.s, z25.s\n"
+ "fmla z28.s, p3/M, z5.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z26.s, p3/M, z5.s, z10.s\n"
+ "fmla z14.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z29.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z22.s, p3/M, z5.s, z12.s\n"
+ "fmla z27.s, p3/M, z4.s, z12.s\n"
+ "fmla z20.s, p3/M, z2.s, z12.s\n"
+ "fmla z9.s, p3/M, z3.s, z12.s\n"
+ "fmla z24.s, p3/M, z1.s, z12.s\n"
+ "fmla z11.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z15.s, p3/M, z7.s, z25.s\n"
+ "fmla z18.s, p3/M, z6.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z26.s, p3/M, z7.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z29.s\n"
+ "fmla z14.s, p3/M, z4.s, z29.s\n"
+ "fmla z20.s, p3/M, z3.s, z29.s\n"
+ "fmla z23.s, p3/M, z1.s, z29.s\n"
+ "fmla z30.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z27.s, p3/M, z8.s, z10.s\n"
+ "fmla z21.s, p3/M, z8.s, z25.s\n"
+ "fmla z28.s, p3/M, z7.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z1.s, z10.s\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.s, p3/M, z7.s, z10.s\n"
+ "fmla z24.s, p3/M, z5.s, z10.s\n"
+ "fmla z11.s, p3/M, z4.s, z10.s\n"
+ "fmla z31.s, p3/M, z2.s, z10.s\n"
+ "ld1w { z10.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z26.s, p3/M, z2.s, z29.s\n"
+ "fmla z22.s, p3/M, z1.s, z29.s\n"
+ "fmla z27.s, p3/M, z0.s, z29.s\n"
+ "fmla z14.s, p3/M, z7.s, z25.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z20.s, p3/M, z6.s, z25.s\n"
+ "fmla z23.s, p3/M, z4.s, z25.s\n"
+ "fmla z30.s, p3/M, z3.s, z25.s\n"
+ "fmla z15.s, p3/M, z1.s, z25.s\n"
+ "fmla z18.s, p3/M, z0.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z4.s, z25.s\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z21.s, p3/M, z2.s, z25.s\n"
+ "fmla z22.s, p3/M, z2.s, z10.s\n"
+ "fmla z27.s, p3/M, z1.s, z10.s\n"
+ "fmla z9.s, p3/M, z0.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z26.s, p3/M, z6.s, z29.s\n"
+ "fmla z14.s, p3/M, z3.s, z29.s\n"
+ "fmla z23.s, p3/M, z0.s, z29.s\n"
+ "fmla z24.s, p3/M, z8.s, z25.s\n"
+ "ld1w { z10.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.s, p3/M, z7.s, z25.s\n"
+ "fmla z31.s, p3/M, z5.s, z25.s\n"
+ "fmla z28.s, p3/M, z1.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z13.s, p3/M, z2.s, z12.s\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z15.s, p3/M, z0.s, z10.s\n"
+ "fmla z18.s, p3/M, z4.s, z25.s\n"
+ "fmla z21.s, p3/M, z3.s, z25.s\n"
+ "fmla z9.s, p3/M, z8.s, z12.s\n"
+ "fmla z11.s, p3/M, z5.s, z12.s\n"
+ "fmla z14.s, p3/M, z6.s, z10.s\n"
+ "ld1w { z12.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z23.s, p3/M, z3.s, z10.s\n"
+ "ld1w { z29.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z25.s\n"
+ "fmla z31.s, p3/M, z6.s, z25.s\n"
+ "fmla z15.s, p3/M, z5.s, z25.s\n"
+ "fmla z13.s, p3/M, z5.s, z12.s\n"
+ "fmla z28.s, p3/M, z2.s, z12.s\n"
+ "fmla z18.s, p3/M, z7.s, z29.s\n"
+ "fmla z21.s, p3/M, z6.s, z29.s\n"
+ "fmla z23.s, p3/M, z8.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z15.s, p3/M, z8.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z25.s\n"
+ "fmla z31.s, p3/M, z7.s, z25.s\n"
+ "fmla z13.s, p3/M, z6.s, z25.s\n"
+ "fmla z18.s, p3/M, z5.s, z25.s\n"
+ "fmla z21.s, p3/M, z4.s, z25.s\n"
+ "fmla z28.s, p3/M, z3.s, z25.s\n"
+ "ld1w { z25.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldp x27, x26, [x8, #0x0]\n"
+ "fmla z11.s, p3/M, z8.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z29.s\n"
+ "fmax z26.s, p3/M, z26.s, z16.s\n"
+ "fmla z22.s, p3/M, z3.s, z29.s\n"
+ "fmla z27.s, p3/M, z5.s, z25.s\n"
+ "fmax z22.s, p3/M, z22.s, z16.s\n"
+ "fmax z27.s, p3/M, z27.s, z16.s\n"
+ "fmla z9.s, p3/M, z4.s, z25.s\n"
+ "fmla z18.s, p3/M, z8.s, z12.s\n"
+ "fmax z9.s, p3/M, z9.s, z16.s\n"
+ "fmin z26.s, p3/M, z26.s, z19.s\n"
+ "fmla z21.s, p3/M, z7.s, z12.s\n"
+ "fmla z28.s, p3/M, z6.s, z12.s\n"
+ "ld1w { z10.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z19.s\n"
+ "fmla z14.s, p3/M, z1.s, z29.s\n"
+ "fmla z20.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmin z27.s, p3/M, z27.s, z19.s\n"
+ "fmla z24.s, p3/M, z2.s, z25.s\n"
+ "fmla z11.s, p3/M, z1.s, z25.s\n"
+ "fmin z9.s, p3/M, z9.s, z19.s\n"
+ "fmax z14.s, p3/M, z14.s, z16.s\n"
+ "fmla z23.s, p3/M, z7.s, z10.s\n"
+ "fmla z30.s, p3/M, z6.s, z10.s\n"
+ "fmax z20.s, p3/M, z20.s, z16.s\n"
+ "fmax z24.s, p3/M, z24.s, z16.s\n"
+ "fmla z31.s, p3/M, z8.s, z12.s\n"
+ "fmla z13.s, p3/M, z7.s, z12.s\n"
+ "fmax z11.s, p3/M, z11.s, z16.s\n"
+ "st1w { z26.s }, p1, [x12, x15, LSL #2]\n"
+ "st1w { z22.s }, p1, [x11, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z15.s, p3/M, z4.s, z10.s\n"
+ "st1w { z27.s }, p1, [x10, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z18.s, p3/M, z3.s, z10.s\n"
+ "fmla z21.s, p3/M, z5.s, z12.s\n"
+ "st1w { z9.s }, p1, [x9, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z28.s, p3/M, z4.s, z12.s\n"
+ "ldp x25, x24, [x8, #0x10]\n"
+ "fmin z14.s, p3/M, z14.s, z19.s\n"
+ "fmin z20.s, p3/M, z20.s, z19.s\n"
+ "st1w { z14.s }, p1, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z24.s, p3/M, z24.s, z19.s\n"
+ "fmin z11.s, p3/M, z11.s, z19.s\n"
+ "st1w { z20.s }, p1, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z23.s, p3/M, z23.s, z16.s\n"
+ "fmax z30.s, p3/M, z30.s, z16.s\n"
+ "st1w { z24.s }, p1, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z31.s, p3/M, z31.s, z16.s\n"
+ "fmax z13.s, p3/M, z13.s, z16.s\n"
+ "st1w { z11.s }, p1, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "incw x16\n"
+ "ld1w { z9.s }, p0/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z10.s }, p0/Z, [x26, x17, LSL #2]\n"
+ "fmin z23.s, p3/M, z23.s, z19.s\n"
+ "ld1w { z11.s }, p0/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z12.s }, p0/Z, [x24, x17, LSL #2]\n"
+ "incw x17\n"
+ "fmin z30.s, p3/M, z30.s, z19.s\n"
+ "fmin z31.s, p3/M, z31.s, z19.s\n"
+ "fmin z13.s, p3/M, z13.s, z19.s\n"
+ "st1w { z23.s }, p1, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmax z15.s, p3/M, z15.s, z16.s\n"
+ "fmax z18.s, p3/M, z18.s, z16.s\n"
+ "st1w { z30.s }, p1, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z21.s, p3/M, z21.s, z16.s\n"
+ "fmax z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z31.s }, p1, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "st1w { z13.s }, p1, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "ld1w { z0.s }, p3/Z, [x7, #1, MUL VL]\n"
+ "whilelt p2.s, x16, %x[n_channels]\n"
+ "ld1w { z1.s }, p3/Z, [x7, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x7, #3, MUL VL]\n"
+ "cmp x17, %x[n_channels]\n"
+ "fmin z15.s, p3/M, z15.s, z19.s\n"
+ "ld1w { z3.s }, p3/Z, [x7, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x7, #5, MUL VL]\n"
+ "fmin z18.s, p3/M, z18.s, z19.s\n"
+ "fmin z21.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z5.s }, p3/Z, [x7, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x7, #7, MUL VL]\n"
+ "addvl x7, x7, #16\n"
+ "fmin z28.s, p3/M, z28.s, z19.s\n"
+ "st1w { z15.s }, p1, [x23, x15, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x7, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x7, #-7, MUL VL]\n"
+ "addvl x7, x7, #-6\n"
+ "st1w { z18.s }, p1, [x22, x15, LSL #2]\n"
+ "st1w { z21.s }, p1, [x21, x15, LSL #2]\n"
+ "st1w { z28.s }, p1, [x20, x15, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z14, z17\n fmla z14.s, p3/M, z4.s, z9.s\n"
+ "movprfx z18, z17\n fmla z18.s, p3/M, z8.s, z9.s\n"
+ "ldr x27, [x8, #0x20]\n"
+ "ldr x24, [x8, #0x30]\n"
+ "movprfx z15, z17\n fmla z15.s, p3/M, z3.s, z9.s\n"
+ "movprfx z30, z17\n fmla z30.s, p3/M, z1.s, z9.s\n"
+ "ldr x23, [x8, #0x28]\n"
+ "ldr x22, [x8, #0x38]\n"
+ "movprfx z20, z17\n fmla z20.s, p3/M, z0.s, z9.s\n"
+ "movprfx z13, z17\n fmla z13.s, p3/M, z7.s, z9.s\n"
+ "ldr x26, [x8, #0x40]\n"
+ "ldr x21, [x8, #0x48]\n"
+ "movprfx z22, z17\n fmla z22.s, p3/M, z6.s, z9.s\n"
+ "fmla z14.s, p3/M, z5.s, z12.s\n"
+ "ldr x25, [x8, #0x50]\n"
+ "ldr x20, [x8, #0x58]\n"
+ "movprfx z27, z17\n fmla z27.s, p3/M, z5.s, z9.s\n"
+ "movprfx z31, z17\n fmla z31.s, p3/M, z2.s, z9.s\n"
+ "ld1w { z23.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x13, [x8, #0x70]\n"
+ "fmla z18.s, p3/M, z0.s, z10.s\n"
+ "movprfx z9, z17\n fmla z9.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z21.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ld1w { z25.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z15.s, p3/M, z4.s, z12.s\n"
+ "fmla z30.s, p3/M, z2.s, z12.s\n"
+ "ldr x24, [x8, #0x60]\n"
+ "ldr x23, [x8, #0x68]\n"
+ "fmla z20.s, p3/M, z1.s, z12.s\n"
+ "fmla z13.s, p3/M, z8.s, z12.s\n"
+ "incw x15\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.s, p3/M, z7.s, z12.s\n"
+ "movprfx z28, z17\n fmla z28.s, p3/M, z6.s, z21.s\n"
+ "ld1w { z29.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x28, [x8, #0x88]\n"
+ "fmla z14.s, p3/M, z7.s, z23.s\n"
+ "fmla z9.s, p3/M, z6.s, z12.s\n"
+ "ldr x12, [x14, #0x0]\n"
+ "ldr x11, [x14, #0x8]\n"
+ "movprfx z11, z17\n fmla z11.s, p3/M, z3.s, z12.s\n"
+ "movprfx z10, z17\n fmla z10.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z12.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x78]\n"
+ "movprfx z26, z17\n fmla z26.s, p3/M, z8.s, z25.s\n"
+ "fmla z15.s, p3/M, z6.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x80]\n"
+ "fmla z30.s, p3/M, z4.s, z23.s\n"
+ "fmla z20.s, p3/M, z3.s, z23.s\n"
+ "ldr x10, [x14, #0x10]\n"
+ "ldr x9, [x14, #0x18]\n"
+ "movprfx z25, z17\n fmla z25.s, p3/M, z1.s, z23.s\n"
+ "movprfx z24, z17\n fmla z24.s, p3/M, z0.s, z23.s\n"
+ "fmla z27.s, p3/M, z8.s, z23.s\n"
+ "fmla z31.s, p3/M, z5.s, z23.s\n"
+ "fmla z28.s, p3/M, z2.s, z23.s\n"
+ "fmla z18.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z23.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "ldr x27, [x8, #0x90]\n"
+ "fmla z13.s, p3/M, z0.s, z12.s\n"
+ "fmla z22.s, p3/M, z2.s, z21.s\n"
+ "ld1w { z12.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x20, [x8, #0x98]\n"
+ "fmla z14.s, p3/M, z8.s, z29.s\n"
+ "fmla z9.s, p3/M, z1.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x26, [x8, #0xa0]\n"
+ "fmla z15.s, p3/M, z7.s, z29.s\n"
+ "fmla z11.s, p3/M, z6.s, z29.s\n"
+ "fmla z30.s, p3/M, z5.s, z29.s\n"
+ "fmla z20.s, p3/M, z4.s, z29.s\n"
+ "fmla z10.s, p3/M, z3.s, z29.s\n"
+ "fmla z25.s, p3/M, z2.s, z29.s\n"
+ "fmla z24.s, p3/M, z1.s, z29.s\n"
+ "fmla z26.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xa8]\n"
+ "fmla z18.s, p3/M, z3.s, z23.s\n"
+ "fmla z27.s, p3/M, z0.s, z23.s\n"
+ "fmla z31.s, p3/M, z6.s, z21.s\n"
+ "fmla z28.s, p3/M, z3.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x13, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xb0]\n"
+ "fmla z13.s, p3/M, z4.s, z29.s\n"
+ "fmla z22.s, p3/M, z3.s, z29.s\n"
+ "fmla z14.s, p3/M, z1.s, z29.s\n"
+ "fmla z9.s, p3/M, z5.s, z12.s\n"
+ "fmla z11.s, p3/M, z2.s, z12.s\n"
+ "fmla z15.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "ldr x23, [x8, #0xb8]\n"
+ "fmla z10.s, p3/M, z8.s, z21.s\n"
+ "fmla z26.s, p3/M, z5.s, z21.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0xc0]\n"
+ "fmla z18.s, p3/M, z5.s, z29.s\n"
+ "fmla z27.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "ldr x21, [x8, #0xc8]\n"
+ "fmla z13.s, p3/M, z5.s, z17.s\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z14.s, p3/M, z2.s, z17.s\n"
+ "fmla z9.s, p3/M, z3.s, z17.s\n"
+ "fmla z15.s, p3/M, z1.s, z17.s\n"
+ "fmla z11.s, p3/M, z0.s, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x28, [x8, #0xd8]\n"
+ "fmla z28.s, p3/M, z7.s, z23.s\n"
+ "fmla z25.s, p3/M, z6.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "ldr x20, [x8, #0xd0]\n"
+ "fmla z18.s, p3/M, z7.s, z21.s\n"
+ "fmla z13.s, p3/M, z6.s, z21.s\n"
+ "fmla z27.s, p3/M, z4.s, z21.s\n"
+ "fmla z14.s, p3/M, z3.s, z21.s\n"
+ "fmla z31.s, p3/M, z1.s, z21.s\n"
+ "fmla z30.s, p3/M, z0.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "ldr x27, [x8, #0xe0]\n"
+ "fmla z22.s, p3/M, z8.s, z29.s\n"
+ "fmla z24.s, p3/M, z8.s, z23.s\n"
+ "fmla z26.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z23.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z1.s, z29.s\n"
+ "ldr x26, [x8, #0xe8]\n"
+ "fmla z9.s, p3/M, z7.s, z29.s\n"
+ "fmla z15.s, p3/M, z5.s, z29.s\n"
+ "fmla z11.s, p3/M, z4.s, z29.s\n"
+ "fmla z20.s, p3/M, z2.s, z29.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "ldr x25, [x8, #0xf0]\n"
+ "fmla z18.s, p3/M, z2.s, z21.s\n"
+ "fmla z13.s, p3/M, z1.s, z21.s\n"
+ "fmla z22.s, p3/M, z0.s, z21.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z21.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "ldr x24, [x8, #0xf8]\n"
+ "fmla z14.s, p3/M, z6.s, z23.s\n"
+ "fmla z31.s, p3/M, z4.s, z23.s\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "fmla z28.s, p3/M, z1.s, z23.s\n"
+ "fmla z25.s, p3/M, z0.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z4.s, z17.s\n"
+ "ldr x23, [x8, #0x100]\n"
+ "fmla z24.s, p3/M, z2.s, z17.s\n"
+ "fmla z13.s, p3/M, z2.s, z29.s\n"
+ "fmla z22.s, p3/M, z1.s, z29.s\n"
+ "fmla z9.s, p3/M, z0.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "ldr x22, [x8, #0x108]\n"
+ "fmla z18.s, p3/M, z6.s, z21.s\n"
+ "fmla z27.s, p3/M, z3.s, z21.s\n"
+ "fmla z31.s, p3/M, z0.s, z21.s\n"
+ "fmla z15.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "ldr x21, [x8, #0x110]\n"
+ "fmla z11.s, p3/M, z7.s, z17.s\n"
+ "fmla z20.s, p3/M, z5.s, z17.s\n"
+ "fmla z26.s, p3/M, z1.s, z17.s\n"
+ "ld1w { z21.s }, p2/Z, [x28, x16, LSL #2]\n"
+ "fmla z10.s, p3/M, z2.s, z23.s\n"
+ "ldr x20, [x8, #0x118]\n"
+ "fmla z28.s, p3/M, z0.s, z29.s\n"
+ "fmla z25.s, p3/M, z4.s, z21.s\n"
+ "fmla z24.s, p3/M, z3.s, z21.s\n"
+ "fmla z9.s, p3/M, z8.s, z23.s\n"
+ "fmla z11.s, p3/M, z5.s, z23.s\n"
+ "fmla z27.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x27, x16, LSL #2]\n"
+ "fmla z31.s, p3/M, z3.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x26, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z7.s, z21.s\n"
+ "fmla z20.s, p3/M, z6.s, z21.s\n"
+ "fmla z28.s, p3/M, z5.s, z21.s\n"
+ "fmla z10.s, p3/M, z5.s, z23.s\n"
+ "fmla z26.s, p3/M, z2.s, z23.s\n"
+ "fmla z25.s, p3/M, z7.s, z17.s\n"
+ "fmla z24.s, p3/M, z6.s, z17.s\n"
+ "fmla z31.s, p3/M, z8.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x25, x16, LSL #2]\n"
+ "fmla z28.s, p3/M, z8.s, z17.s\n"
+ "ld1w { z12.s }, p2/Z, [x23, x16, LSL #2]\n"
+ "fmla z30.s, p3/M, z8.s, z21.s\n"
+ "fmla z20.s, p3/M, z7.s, z21.s\n"
+ "fmla z10.s, p3/M, z6.s, z21.s\n"
+ "fmla z25.s, p3/M, z5.s, z21.s\n"
+ "fmla z24.s, p3/M, z4.s, z21.s\n"
+ "fmla z26.s, p3/M, z3.s, z21.s\n"
+ "ld1w { z21.s }, p2/Z, [x22, x16, LSL #2]\n"
+ "fmla z11.s, p3/M, z8.s, z23.s\n"
+ "ld1w { z29.s }, p2/Z, [x24, x16, LSL #2]\n"
+ "fmla z18.s, p3/M, z4.s, z12.s\n"
+ "fmax z18.s, p3/M, z18.s, z16.s\n"
+ "fmla z13.s, p3/M, z3.s, z12.s\n"
+ "fmla z22.s, p3/M, z5.s, z21.s\n"
+ "fmax z13.s, p3/M, z13.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z16.s\n"
+ "fmla z9.s, p3/M, z4.s, z21.s\n"
+ "fmla z25.s, p3/M, z8.s, z29.s\n"
+ "fmax z9.s, p3/M, z9.s, z16.s\n"
+ "fmin z18.s, p3/M, z18.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z29.s\n"
+ "fmla z26.s, p3/M, z6.s, z29.s\n"
+ "ld1w { z23.s }, p2/Z, [x21, x16, LSL #2]\n"
+ "fmin z13.s, p3/M, z13.s, z19.s\n"
+ "fmla z27.s, p3/M, z1.s, z12.s\n"
+ "fmla z14.s, p3/M, z0.s, z12.s\n"
+ "ld1w { z29.s }, p2/Z, [x20, x16, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z19.s\n"
+ "fmla z15.s, p3/M, z2.s, z21.s\n"
+ "fmla z11.s, p3/M, z1.s, z21.s\n"
+ "fmin z9.s, p3/M, z9.s, z19.s\n"
+ "fmax z27.s, p3/M, z27.s, z16.s\n"
+ "fmla z31.s, p3/M, z7.s, z23.s\n"
+ "fmla z30.s, p3/M, z6.s, z23.s\n"
+ "fmax z14.s, p3/M, z14.s, z16.s\n"
+ "fmax z15.s, p3/M, z15.s, z16.s\n"
+ "fmla z20.s, p3/M, z8.s, z29.s\n"
+ "fmla z10.s, p3/M, z7.s, z29.s\n"
+ "fmax z11.s, p3/M, z11.s, z16.s\n"
+ "st1w { z18.s }, p0, [x12, x15, LSL #2]\n"
+ "st1w { z13.s }, p0, [x11, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x20]\n"
+ "ldr x22, [x14, #0x28]\n"
+ "fmla z28.s, p3/M, z4.s, z23.s\n"
+ "st1w { z22.s }, p0, [x10, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x30]\n"
+ "fmla z25.s, p3/M, z3.s, z23.s\n"
+ "fmla z24.s, p3/M, z5.s, z29.s\n"
+ "st1w { z9.s }, p0, [x9, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x38]\n"
+ "fmla z26.s, p3/M, z4.s, z29.s\n"
+ "fmin z27.s, p3/M, z27.s, z19.s\n"
+ "fmin z14.s, p3/M, z14.s, z19.s\n"
+ "fmin z15.s, p3/M, z15.s, z19.s\n"
+ "st1w { z27.s }, p0, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x40]\n"
+ "fmin z11.s, p3/M, z11.s, z19.s\n"
+ "fmax z31.s, p3/M, z31.s, z16.s\n"
+ "st1w { z14.s }, p0, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x48]\n"
+ "fmax z30.s, p3/M, z30.s, z16.s\n"
+ "fmax z20.s, p3/M, z20.s, z16.s\n"
+ "st1w { z15.s }, p0, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x50]\n"
+ "fmax z10.s, p3/M, z10.s, z16.s\n"
+ "st1w { z11.s }, p0, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x58]\n"
+ "fmin z31.s, p3/M, z31.s, z19.s\n"
+ "fmin z30.s, p3/M, z30.s, z19.s\n"
+ "fmin z20.s, p3/M, z20.s, z19.s\n"
+ "st1w { z31.s }, p0, [x23, x15, LSL #2]\n"
+ "ldr x23, [x14, #0x60]\n"
+ "fmin z10.s, p3/M, z10.s, z19.s\n"
+ "fmax z28.s, p3/M, z28.s, z16.s\n"
+ "st1w { z30.s }, p0, [x22, x15, LSL #2]\n"
+ "ldr x22, [x14, #0x68]\n"
+ "fmax z25.s, p3/M, z25.s, z16.s\n"
+ "fmax z24.s, p3/M, z24.s, z16.s\n"
+ "st1w { z20.s }, p0, [x21, x15, LSL #2]\n"
+ "ldr x21, [x14, #0x70]\n"
+ "fmax z26.s, p3/M, z26.s, z16.s\n"
+ "st1w { z10.s }, p0, [x20, x15, LSL #2]\n"
+ "ldr x20, [x14, #0x78]\n"
+ "fmin z28.s, p3/M, z28.s, z19.s\n"
+ "fmin z25.s, p3/M, z25.s, z19.s\n"
+ "fmin z24.s, p3/M, z24.s, z19.s\n"
+ "st1w { z28.s }, p0, [x23, x15, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z19.s\n"
+ "st1w { z25.s }, p0, [x22, x15, LSL #2]\n"
+ "st1w { z24.s }, p0, [x21, x15, LSL #2]\n"
+ "st1w { z26.s }, p0, [x20, x15, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..75d62007ab
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..e6090fda94
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct-addressing SVE kernel: walks an n_tile_rows x n_tile_cols grid of
+// 2x2 output tiles, reading the input via base pointer + row/column strides
+// (so it assumes no padding), applying the 3x3 MLA per channel vector and
+// clamping results to [activation_min, activation_max].
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ // Plain argument block handed to the assembly through one pointer; field
+ // offsets are taken with offsetof() in the operand list below, so this
+ // layout must stay in sync with the [offsetof_args_*] asm operands.
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ // Current tile indices; spilled/reloaded by the assembly tile loop.
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ // Hand-scheduled SVE assembly: label 1 is the outer tile loop, label 2 the
+ // whilelt-predicated channel loop (one vector of channels per iteration),
+ // label 3 the channel tail which also advances/wraps the tile indices.
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x11, #0x0\n"
+ "mov x16, #0x0\n"
+ "1:" // Tile loop
+ "str x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "mov x24, #0x2\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x11, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x16, x15, x22\n" // offset += tile_j * ld_input_col
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "cntw x13\n"
+ "mul x20, x11, x21\n" // offset = tile_i * ld_output_row
+ "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x10, x15, x15\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x12, x12, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "ldr x9, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x28, x12, x23, LSL #2\n"
+ "madd x20, x16, x14, x20\n" // offset += tile_j * ld_output_col
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z30.s }, p3/Z, [x11]\n"
+ "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
+ "add x27, x28, x23, LSL #2\n"
+ "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
+ "add x26, x10, x15\n"
+ "add x25, x27, x23, LSL #2\n"
+ "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "add x24, x26, x15\n"
+ "add x9, x9, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "cmp x13, %x[n_channels]\n"
+ "ld1rw { z29.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x23, x25, x23, LSL #2\n"
+ "add x22, x9, x21, LSL #2\n"
+ "ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
+ "mov x21, #0x0\n"
+ "sub x20, XZR, x13\n"
+ "ld1w { z9.s }, p2/Z, [x27, x10, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x12]\n"
+ "ld1w { z11.s }, p2/Z, [x12, x15, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x12, x26, LSL #2]\n"
+ "addvl x11, x11, #-6\n"
+ "ld1w { z13.s }, p2/Z, [x12, x24, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x28]\n"
+ "ld1w { z15.s }, p2/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x12, x10, LSL #2]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "whilelt p1.s, x13, %x[n_channels]\n"
+ "incw x21\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "incw x13\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z14.s\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x25]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.s, p3/M, z4.s, z15.s\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "ld1w { z23.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "movprfx z22, z30\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z30\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "addvl x12, x12, #1\n"
+ "addvl x28, x28, #1\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x11]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z25.s\n"
+ "fmla z21.s, p3/M, z1.s, z24.s\n"
+ "ld1w { z0.s }, p3/Z, [x11, #1, MUL VL]\n"
+ "incw x20\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z25.s\n"
+ "fmla z22.s, p3/M, z1.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x23]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x10, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x11, #2, MUL VL]\n"
+ "fmla z22.s, p3/M, z7.s, z20.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "ld1w { z2.s }, p3/Z, [x11, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x11, #4, MUL VL]\n"
+ "fmla z26.s, p3/M, z7.s, z24.s\n"
+ "fmla z22.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z4.s }, p3/Z, [x11, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x11, #6, MUL VL]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z29.s\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z29.s\n"
+ "fmax z21.s, p3/M, z21.s, z29.s\n"
+ "ld1w { z6.s }, p3/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "ld1w { z9.s }, p1/Z, [x27, x10, LSL #2]\n"
+ "cmp x13, %x[n_channels]\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "ld1w { z10.s }, p1/Z, [x12]\n"
+ "ld1w { z11.s }, p1/Z, [x12, x15, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z22.s, p3/M, z22.s, z28.s\n"
+ "ld1w { z12.s }, p1/Z, [x12, x26, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x12, x24, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z28.s\n"
+ "addvl x25, x25, #1\n"
+ "ld1w { z14.s }, p1/Z, [x28]\n"
+ "ld1w { z15.s }, p1/Z, [x28, x15, LSL #2]\n"
+ "addvl x23, x23, #1\n"
+ "ld1w { z16.s }, p1/Z, [x12, x10, LSL #2]\n"
+ "st1w { z27.s }, p0, [x9]\n"
+ "ld1w { z7.s }, p3/Z, [x11, #-8, MUL VL]\n"
+ "st1w { z26.s }, p0, [x9, x14, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "ld1w { z8.s }, p3/Z, [x11, #-7, MUL VL]\n"
+ "addvl x11, x11, #-6\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x14, LSL #2]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "movprfx z27, z30\n fmla z27.s, p3/M, z8.s, z9.s\n"
+ "movprfx z26, z30\n fmla z26.s, p3/M, z6.s, z9.s\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x11, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z27.s, p3/M, z0.s, z10.s\n"
+ "fmla z26.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z20.s }, p2/Z, [x28, x24, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z27.s, p3/M, z1.s, z11.s\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x28, x26, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x28, x10, LSL #2]\n"
+ "fmla z27.s, p3/M, z3.s, z14.s\n"
+ "fmla z26.s, p3/M, z0.s, z16.s\n"
+ "ld1w { z18.s }, p2/Z, [x25]\n"
+ "add x16, x16, #0x1\n"
+ "fmla z27.s, p3/M, z4.s, z15.s\n"
+ "fmla z26.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z17.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z27.s, p3/M, z2.s, z16.s\n"
+ "fmla z26.s, p3/M, z5.s, z20.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "ld1w { z23.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "movprfx z22, z30\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z30\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "cmp x16, x20\n"
+ "add x21, x11, #0x1\n"
+ "fmla z27.s, p3/M, z5.s, z19.s\n"
+ "fmla z26.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z22.s, p3/M, z3.s, z18.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "ld1w { z20.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z25.s\n"
+ "fmla z21.s, p3/M, z1.s, z24.s\n"
+ "csel x11, x11, x21, LT\n"
+ "mov p0.b, p2.b\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z27.s, p3/M, z6.s, z25.s\n"
+ "fmla z22.s, p3/M, z1.s, z23.s\n"
+ "ld1w { z17.s }, p2/Z, [x23]\n"
+ "csel x16, x16, XZR, LT\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z27.s, p3/M, z7.s, z23.s\n"
+ "ld1w { z16.s }, p2/Z, [x25, x10, LSL #2]\n"
+ "fmax z27.s, p3/M, z27.s, z29.s\n"
+ "fmla z22.s, p3/M, z6.s, z17.s\n"
+ "fmla z21.s, p3/M, z3.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x10, LSL #2]\n"
+ "cmp x11, x20\n"
+ "fmla z22.s, p3/M, z7.s, z20.s\n"
+ "fmla z21.s, p3/M, z7.s, z18.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "st1w { z27.s }, p0, [x9]\n"
+ "fmla z26.s, p3/M, z7.s, z24.s\n"
+ "fmla z22.s, p3/M, z5.s, z16.s\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z26.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmax z26.s, p3/M, z26.s, z29.s\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "fmax z22.s, p3/M, z22.s, z29.s\n"
+ "fmax z21.s, p3/M, z21.s, z29.s\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z22.s, p3/M, z22.s, z28.s\n"
+ "st1w { z26.s }, p0, [x9, x14, LSL #2]\n"
+ "fmin z21.s, p3/M, z21.s, z28.s\n"
+ "st1w { z22.s }, p0, [x22]\n"
+ "st1w { z21.s }, p0, [x22, x14, LSL #2]\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..98427701fa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Indirect-addressing SVE kernel: the caller supplies one pointer per input
+// point of the 5x5 receptive field (25 pointers, so padding can be redirected
+// to a dummy row), and four output pointers for the 2x2 output tile.  The
+// channel loop is whilelt-predicated; results are clamped to [min, max].
+void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ // Argument block read by the assembly via offsetof(); the constructor
+ // permutes the caller's 25 input pointers into the order the assembly
+ // consumes them in, so this mapping must match the asm's load schedule.
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[25];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ // Hand-scheduled SVE assembly: label 1 is the whilelt-predicated channel
+ // loop (one vector of channels per iteration), label 2 the channel tail.
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x15, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "cntw x14\n"
+ "ldp x13, x12, [x20, #0x0]\n"
+ "ldp x11, x10, [x20, #0x10]\n"
+ "mov x9, #0x0\n"
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "sub x28, XZR, x14\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ld1rw { z26.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z25.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "ld1w { z9.s }, p2/Z, [x27, x9, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x26, x9, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x25, x9, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x24, x9, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z15.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z24, z20\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z20.s\n"
+ "ldr x21, [x15, #0x70]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.s, p3/M, z1.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "whilelt p1.s, x14, %x[n_channels]\n"
+ "ldp x27, x26, [x15, #0x0]\n"
+ "ldp x25, x24, [x15, #0x10]\n"
+ "ldp x23, x22, [x15, #0x20]\n"
+ "incw x9\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "ldp x21, x20, [x15, #0x30]\n"
+ "ld1w { z9.s }, p1/Z, [x27, x14, LSL #2]\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "ld1w { z10.s }, p1/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x25, x14, LSL #2]\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "incw x28\n"
+ "ld1w { z12.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "mov p0.b, p2.b\n"
+ "whilelt p2.s, x9, %x[n_channels]\n"
+ "ld1w { z14.s }, p1/Z, [x22, x14, LSL #2]\n"
+ "ld1w { z15.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "incw x14\n"
+ "ld1w { z20.s }, p3/Z, [x16]\n"
+ "cmp x14, %x[n_channels]\n"
+ "ld1w { z0.s }, p3/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z1.s }, p3/Z, [x16, #2, MUL VL]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "ld1w { z2.s }, p3/Z, [x16, #3, MUL VL]\n"
+ "ld1w { z3.s }, p3/Z, [x16, #4, MUL VL]\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "ld1w { z4.s }, p3/Z, [x16, #5, MUL VL]\n"
+ "ld1w { z5.s }, p3/Z, [x16, #6, MUL VL]\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+ "ld1w { z6.s }, p3/Z, [x16, #7, MUL VL]\n"
+ "addvl x16, x16, #16\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+ "ld1w { z7.s }, p3/Z, [x16, #-8, MUL VL]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+ "ld1w { z8.s }, p3/Z, [x16, #-7, MUL VL]\n"
+ "addvl x16, x16, #-6\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z24, z20\n fmla z24.s, p3/M, z8.s, z9.s\n"
+ "movprfx z23, z20\n fmla z23.s, p3/M, z6.s, z9.s\n"
+ "ldr x21, [x15, #0x40]\n"
+ "ldr x20, [x15, #0x48]\n"
+ "fmla z24.s, p3/M, z0.s, z10.s\n"
+ "fmla z23.s, p3/M, z1.s, z12.s\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x50]\n"
+ "fmla z24.s, p3/M, z1.s, z11.s\n"
+ "fmla z23.s, p3/M, z2.s, z13.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z3.s, z14.s\n"
+ "fmla z23.s, p3/M, z0.s, z16.s\n"
+ "ldr x20, [x15, #0x58]\n"
+ "ldr x22, [x15, #0x78]\n"
+ "fmla z24.s, p3/M, z4.s, z15.s\n"
+ "fmla z23.s, p3/M, z4.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x60]\n"
+ "fmla z24.s, p3/M, z2.s, z16.s\n"
+ "fmla z23.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0x80]\n"
+ "ld1w { z18.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "movprfx z22, z20\n fmla z22.s, p3/M, z2.s, z9.s\n"
+ "movprfx z21, z20\n fmla z21.s, p3/M, z0.s, z9.s\n"
+ "ld1w { z20.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0x68]\n"
+ "fmla z24.s, p3/M, z5.s, z19.s\n"
+ "fmla z23.s, p3/M, z3.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x22, x9, LSL #2]\n"
+ "ldr x20, [x15, #0x88]\n"
+ "fmla z22.s, p3/M, z3.s, z17.s\n"
+ "fmla z21.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z0.s, z18.s\n"
+ "fmla z21.s, p3/M, z1.s, z20.s\n"
+ "ldr x21, [x15, #0x70]\n"
+ "ldr x20, [x15, #0x98]\n"
+ "fmla z22.s, p3/M, z4.s, z17.s\n"
+ "fmla z21.s, p3/M, z5.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z24.s, p3/M, z6.s, z18.s\n"
+ "ldr x21, [x15, #0x90]\n"
+ "ldr x20, [x15, #0xa8]\n"
+ "fmla z22.s, p3/M, z1.s, z16.s\n"
+ "fmla z21.s, p3/M, z2.s, z19.s\n"
+ "fmla z24.s, p3/M, z7.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "ldr x21, [x15, #0xa0]\n"
+ "ldr x20, [x15, #0xb0]\n"
+ "fmla z22.s, p3/M, z6.s, z16.s\n"
+ "fmla z21.s, p3/M, z3.s, z18.s\n"
+ "ld1w { z17.s }, p2/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z7.s, z17.s\n"
+ "fmla z21.s, p3/M, z7.s, z16.s\n"
+ "ldr x20, [x15, #0xb8]\n"
+ "fmla z23.s, p3/M, z7.s, z20.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z5.s, z18.s\n"
+ "ldr x20, [x15, #0xc0]\n"
+ "fmla z21.s, p3/M, z6.s, z17.s\n"
+ "fmla z23.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x9, LSL #2]\n"
+ "fmla z22.s, p3/M, z8.s, z17.s\n"
+ "fmla z21.s, p3/M, z8.s, z16.s\n"
+ "incw x28\n"
+ "mov p0.b, p2.b\n"
+ "fmax z24.s, p3/M, z24.s, z26.s\n"
+ "fmax z23.s, p3/M, z23.s, z26.s\n"
+ "fmax z22.s, p3/M, z22.s, z26.s\n"
+ "fmax z21.s, p3/M, z21.s, z26.s\n"
+ "fmin z24.s, p3/M, z24.s, z25.s\n"
+ "fmin z23.s, p3/M, z23.s, z25.s\n"
+ "st1w { z24.s }, p0, [x13, x28, LSL #2]\n"
+ "fmin z22.s, p3/M, z22.s, z25.s\n"
+ "fmin z21.s, p3/M, z21.s, z25.s\n"
+ "st1w { z23.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z22.s }, p0, [x11, x28, LSL #2]\n"
+ "st1w { z21.s }, p0, [x10, x28, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..ae89a64c6b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const input_ptrs, float *const *const outptrs, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int n_tile_rows, const unsigned int n_tile_cols, const float *inptr, int64_t ld_input_row, int64_t ld_input_col, float *outptr, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, const float activation_min, const float activation_max);
+
+// Depthfirst strategy descriptor for the SVE FP32 NHWC 5x5 stride-1 depthwise
+// convolution kernel that produces a 2x2 output tile per call.  It publishes
+// the two hand-written kernel entry points (indirect- and direct-addressing
+// variants) plus the compile-time tile geometry consumed by the driver.
+class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
+{
+ private:
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ // Variant addressing the input through an array of per-point pointers,
+ // which lets the caller substitute padding rows/columns.
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ // Variant addressing the input directly via row/column strides (no padding).
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ public:
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
+
+ // 5x5 convolution window.
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ // Stride of 1 in both spatial dimensions.
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ // Each kernel invocation computes a 2x2 spatial tile of outputs.
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
+
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 0000000000..075181a488
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,523 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Direct ("tiled") variant of the SVE FP32 NHWC 5x5 stride-1 depthwise
+// kernel producing a 2x2 output tile per tile iteration.  Walks an
+// n_tile_rows x n_tile_cols grid over the input, applying the 5x5
+// multiply-accumulate stencil one SVE vector of channels at a time and
+// clamping results to [activation_min, activation_max].  The ld_* values
+// are element strides (scaled by sizeof(float) inside the asm); `params`
+// points at the packed bias + weight vectors that the kernel consumes
+// sequentially.  NOTE: machine-generated inline assembly -- regenerate
+// rather than hand-edit the asm body.
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ // Marshals the kernel arguments into a single struct so the asm can
+ // address them with compile-time offsetof() offsets; tile_i/tile_j are
+ // the tile-grid counters kept live across asm iterations.
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const float *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ float *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const float min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const float *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ float *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ // z15/z28 hold the activation min/max broadcasts, z29 the bias, and
+ // z0-z4 (reloaded in groups) the weights; p2/p3 are the active-channel
+ // and all-true predicates respectively.
+ __asm__ __volatile__(
+ "ptrue p3.b\n"
+ "mov x12, #0x0\n"
+ "mov x8, #0x0\n"
+ // Label 1: outer loop over the (tile_i, tile_j) output-tile grid;
+ // computes input/output base pointers for the current tile, then
+ // preloads the first input rows and weight vectors.
+ "1:" // Tile loop
+ "str x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "mov x24, #0x2\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mul x22, x12, x23\n" // offset = tile_i * ld_input_row
+ "ldr x21, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x22, x8, x17, x22\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "add x15, x17, x17\n"
+ "mul x20, x12, x21\n" // offset = tile_i * ld_output_row
+ "ldr x14, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "cntw x12\n"
+ "mul x22, x22, x25\n" // offset *= kernel_stride * output_size
+ "add x14, x14, x22, LSL #2\n" // inptr[0] += offset * sizeof(float)
+ "add x11, x14, x23, LSL #2\n"
+ "ldr x10, [%x[params_struct], %[offsetof_args_params]]\n"
+ "madd x20, x8, x16, x20\n" // offset += tile_j * ld_output_col
+ "add x9, x11, x23, LSL #2\n"
+ "add x28, x15, x17\n"
+ "ld1rw { z15.s }, p3/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "mul x20, x20, x24\n" // offset *= output_tile_size
+ "whilelt p2.s, XZR, %x[n_channels]\n"
+ "add x27, x9, x23, LSL #2\n"
+ "ld1rw { z28.s }, p3/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "add x26, x28, x17\n"
+ "add x25, x27, x23, LSL #2\n"
+ "ld1w { z29.s }, p3/Z, [x10]\n"
+ "ld1w { z0.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x24, x26, x17\n"
+ "add x13, x13, x20, LSL #2\n" // outptrs[0] += offset * sizeof(float)
+ "ld1w { z1.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "ld1w { z2.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "cmp x12, %x[n_channels]\n"
+ "add x23, x25, x23, LSL #2\n"
+ "ld1w { z3.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "add x22, x13, x21, LSL #2\n"
+ "mov x21, #0x0\n"
+ "ld1w { z5.s }, p2/Z, [x14]\n"
+ "ld1w { z6.s }, p2/Z, [x14, x17, LSL #2]\n"
+ "sub x20, XZR, x12\n"
+ "ld1w { z7.s }, p2/Z, [x11]\n"
+ "ld1w { z8.s }, p2/Z, [x11, x17, LSL #2]\n"
+ "addvl x10, x10, #6\n"
+ "ld1w { z9.s }, p2/Z, [x14, x15, LSL #2]\n"
+ "ld1w { z13.s }, p2/Z, [x11, x15, LSL #2]\n"
+ "ld1w { z11.s }, p2/Z, [x14, x28, LSL #2]\n"
+ "ld1w { z12.s }, p2/Z, [x14, x26, LSL #2]\n"
+ "ld1w { z10.s }, p2/Z, [x11, x24, LSL #2]\n"
+ "ld1w { z14.s }, p2/Z, [x9]\n"
+ "bge 3f\n"
+ // Label 2: steady-state channel loop -- one full SVE vector of
+ // channels per iteration, accumulating the 25 taps into the four
+ // output accumulators (one per output pixel of the 2x2 tile) while
+ // prefetching the next iteration's inputs under predicate p1.
+ "2:" // Tile loop: Channel loop
+ "movprfx z27, z29\n fmla z27.s, p3/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z24.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "movprfx z26, z29\n fmla z26.s, p3/M, z0.s, z7.s\n"
+ "movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z18.s }, p3/Z, [x10]\n"
+ "incw x21\n"
+ "fmla z27.s, p3/M, z1.s, z6.s\n"
+ "fmla z31.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z23.s }, p2/Z, [x11, x26, LSL #2]\n"
+ "incw x12\n"
+ "fmla z26.s, p3/M, z1.s, z8.s\n"
+ "fmla z30.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z22.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z27.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "addvl x14, x14, #1\n"
+ "fmla z26.s, p3/M, z2.s, z13.s\n"
+ "fmla z30.s, p3/M, z2.s, z24.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "addvl x11, x11, #1\n"
+ "fmla z27.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "incw x20\n"
+ "fmla z26.s, p3/M, z3.s, z24.s\n"
+ "fmla z30.s, p3/M, z3.s, z23.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "ld1w { z5.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z4.s, z23.s\n"
+ "fmla z30.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z27.s, p3/M, z18.s, z7.s\n"
+ "fmla z31.s, p3/M, z18.s, z8.s\n"
+ "ld1w { z7.s }, p1/Z, [x11]\n"
+ "fmla z26.s, p3/M, z18.s, z14.s\n"
+ "fmla z30.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.s, p3/M, z22.s, z8.s\n"
+ "fmla z31.s, p3/M, z22.s, z13.s\n"
+ "ld1w { z3.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z22.s, z0.s\n"
+ "fmla z30.s, p3/M, z22.s, z19.s\n"
+ "ld1w { z8.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z13.s\n"
+ "fmla z31.s, p3/M, z20.s, z24.s\n"
+ "ld1w { z2.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "addvl x9, x9, #1\n"
+ "fmla z26.s, p3/M, z20.s, z19.s\n"
+ "fmla z30.s, p3/M, z20.s, z5.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z27.s, p3/M, z17.s, z24.s\n"
+ "fmla z31.s, p3/M, z17.s, z23.s\n"
+ "ld1w { z25.s }, p2/Z, [x27]\n"
+ "ld1w { z29.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "fmla z26.s, p3/M, z17.s, z5.s\n"
+ "fmla z30.s, p3/M, z17.s, z2.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z23.s\n"
+ "fmla z31.s, p3/M, z21.s, z10.s\n"
+ "ld1w { z24.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z21.s, z2.s\n"
+ "fmla z30.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z27.s, p3/M, z18.s, z14.s\n"
+ "fmla z31.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z26.s, p3/M, z18.s, z25.s\n"
+ "fmla z30.s, p3/M, z18.s, z24.s\n"
+ "ld1w { z23.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z27.s, p3/M, z8.s, z0.s\n"
+ "fmla z31.s, p3/M, z8.s, z19.s\n"
+ "ld1w { z0.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z8.s, z24.s\n"
+ "fmla z30.s, p3/M, z8.s, z22.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z27.s, p3/M, z16.s, z19.s\n"
+ "fmla z31.s, p3/M, z16.s, z5.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "addvl x27, x27, #1\n"
+ "fmla z26.s, p3/M, z16.s, z22.s\n"
+ "fmla z30.s, p3/M, z16.s, z0.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z27.s, p3/M, z17.s, z5.s\n"
+ "fmla z31.s, p3/M, z17.s, z2.s\n"
+ "ld1w { z16.s }, p2/Z, [x25]\n"
+ "fmla z26.s, p3/M, z17.s, z0.s\n"
+ "fmla z30.s, p3/M, z17.s, z19.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z2.s\n"
+ "fmla z31.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z4.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z8.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "fmla z26.s, p3/M, z21.s, z19.s\n"
+ "fmla z30.s, p3/M, z21.s, z1.s\n"
+ "ld1w { z13.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z27.s, p3/M, z23.s, z25.s\n"
+ "fmla z31.s, p3/M, z23.s, z24.s\n"
+ "ld1w { z25.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z23.s, z16.s\n"
+ "fmla z30.s, p3/M, z23.s, z4.s\n"
+ "ld1w { z5.s }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z22.s\n"
+ "ld1w { z24.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "fmla z26.s, p3/M, z20.s, z4.s\n"
+ "fmla z30.s, p3/M, z20.s, z25.s\n"
+ "ld1w { z23.s }, p3/Z, [x10]\n"
+ "fmla z27.s, p3/M, z18.s, z22.s\n"
+ "fmla z31.s, p3/M, z18.s, z0.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "addvl x25, x25, #1\n"
+ "fmla z26.s, p3/M, z18.s, z25.s\n"
+ "fmla z30.s, p3/M, z18.s, z24.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z27.s, p3/M, z17.s, z0.s\n"
+ "fmla z31.s, p3/M, z17.s, z19.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "fmla z26.s, p3/M, z17.s, z24.s\n"
+ "fmla z30.s, p3/M, z17.s, z8.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z27.s, p3/M, z13.s, z19.s\n"
+ "fmla z31.s, p3/M, z13.s, z1.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "ld1w { z14.s }, p1/Z, [x9]\n"
+ "fmla z26.s, p3/M, z13.s, z8.s\n"
+ "fmla z30.s, p3/M, z13.s, z22.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z27.s, p3/M, z5.s, z16.s\n"
+ "fmla z31.s, p3/M, z5.s, z4.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z5.s, z18.s\n"
+ "fmla z30.s, p3/M, z5.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z27.s, p3/M, z23.s, z4.s\n"
+ "fmla z31.s, p3/M, z23.s, z25.s\n"
+ "ld1w { z13.s }, p1/Z, [x11, x15, LSL #2]\n"
+ "fmla z26.s, p3/M, z23.s, z17.s\n"
+ "fmla z30.s, p3/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z27.s, p3/M, z21.s, z25.s\n"
+ "fmla z31.s, p3/M, z21.s, z24.s\n"
+ "ld1w { z5.s }, p1/Z, [x14]\n"
+ "fmla z26.s, p3/M, z21.s, z16.s\n"
+ "fmla z30.s, p3/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "fmla z27.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z8.s\n"
+ "addvl x10, x10, #16\n"
+ "whilelt p2.s, x21, %x[n_channels]\n"
+ "fmla z26.s, p3/M, z20.s, z18.s\n"
+ "fmla z30.s, p3/M, z20.s, z17.s\n"
+ "cmp x12, %x[n_channels]\n"
+ "addvl x23, x23, #1\n"
+ "fmla z27.s, p3/M, z19.s, z8.s\n"
+ "fmla z31.s, p3/M, z19.s, z22.s\n"
+ "fmax z27.s, p3/M, z27.s, z15.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmla z26.s, p3/M, z19.s, z17.s\n"
+ "fmla z30.s, p3/M, z19.s, z16.s\n"
+ "fmax z26.s, p3/M, z26.s, z15.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmin z27.s, p3/M, z27.s, z28.s\n"
+ "fmin z31.s, p3/M, z31.s, z28.s\n"
+ "ld1w { z6.s }, p1/Z, [x14, x17, LSL #2]\n"
+ "ld1w { z8.s }, p1/Z, [x11, x17, LSL #2]\n"
+ "fmin z26.s, p3/M, z26.s, z28.s\n"
+ "fmin z30.s, p3/M, z30.s, z28.s\n"
+ "ld1w { z9.s }, p1/Z, [x14, x15, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x14, x28, LSL #2]\n"
+ "ld1w { z12.s }, p1/Z, [x14, x26, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x11, x24, LSL #2]\n"
+ "st1w { z27.s }, p0, [x13]\n"
+ "st1w { z31.s }, p0, [x13, x16, LSL #2]\n"
+ "addvl x13, x13, #1\n"
+ "ld1w { z3.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "st1w { z26.s }, p0, [x22]\n"
+ "addvl x10, x10, #-6\n"
+ "st1w { z30.s }, p0, [x22, x16, LSL #2]\n"
+ "addvl x22, x22, #1\n"
+ "blt 2b\n"
+ // Label 3: channel tail -- final (possibly partial) vector of
+ // channels; also advances tile_j/tile_i with csel wrap-around and
+ // falls through to branch back to the tile loop (label 1) while
+ // tiles remain.
+ "3:" // Tile loop: Channel tail
+ "movprfx z30, z29\n fmla z30.s, p3/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p3/M, z0.s, z6.s\n"
+ "ld1w { z22.s }, p2/Z, [x11, x28, LSL #2]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "movprfx z5, z29\n fmla z5.s, p3/M, z0.s, z7.s\n"
+ "fmla z29.s, p3/M, z0.s, z8.s\n"
+ "ld1w { z20.s }, p3/Z, [x10]\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "fmla z30.s, p3/M, z1.s, z6.s\n"
+ "fmla z31.s, p3/M, z1.s, z9.s\n"
+ "ld1w { z6.s }, p2/Z, [x11, x26, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "fmla z5.s, p3/M, z1.s, z8.s\n"
+ "fmla z29.s, p3/M, z1.s, z13.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "add x8, x8, #0x1\n"
+ "fmla z30.s, p3/M, z2.s, z9.s\n"
+ "fmla z31.s, p3/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p2/Z, [x14, x24, LSL #2]\n"
+ "cmp x8, x20\n"
+ "fmla z5.s, p3/M, z2.s, z13.s\n"
+ "fmla z29.s, p3/M, z2.s, z22.s\n"
+ "ld1w { z18.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "add x21, x12, #0x1\n"
+ "fmla z30.s, p3/M, z3.s, z11.s\n"
+ "fmla z31.s, p3/M, z3.s, z12.s\n"
+ "ld1w { z1.s }, p2/Z, [x9, x17, LSL #2]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "fmla z5.s, p3/M, z3.s, z22.s\n"
+ "fmla z29.s, p3/M, z3.s, z6.s\n"
+ "ld1w { z17.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "csel x12, x12, x21, LT\n"
+ "fmla z30.s, p3/M, z4.s, z12.s\n"
+ "fmla z31.s, p3/M, z4.s, z16.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, x15, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x9, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z4.s, z6.s\n"
+ "fmla z29.s, p3/M, z4.s, z10.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #4, MUL VL]\n"
+ "mov p0.b, p2.b\n"
+ "fmla z30.s, p3/M, z20.s, z7.s\n"
+ "fmla z31.s, p3/M, z20.s, z8.s\n"
+ "csel x8, x8, XZR, LT\n"
+ "cmp x12, x20\n"
+ "fmla z5.s, p3/M, z20.s, z14.s\n"
+ "fmla z29.s, p3/M, z20.s, z1.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #5, MUL VL]\n"
+ "fmla z30.s, p3/M, z19.s, z8.s\n"
+ "fmla z31.s, p3/M, z19.s, z13.s\n"
+ "ld1w { z26.s }, p2/Z, [x9, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z19.s, z1.s\n"
+ "fmla z29.s, p3/M, z19.s, z0.s\n"
+ "ld1w { z25.s }, p3/Z, [x10, #6, MUL VL]\n"
+ "fmla z30.s, p3/M, z18.s, z13.s\n"
+ "fmla z31.s, p3/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p2/Z, [x9, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z18.s, z0.s\n"
+ "fmla z29.s, p3/M, z18.s, z27.s\n"
+ "ld1w { z23.s }, p3/Z, [x10, #7, MUL VL]\n"
+ "addvl x10, x10, #16\n"
+ "fmla z30.s, p3/M, z17.s, z22.s\n"
+ "fmla z31.s, p3/M, z17.s, z6.s\n"
+ "ld1w { z22.s }, p2/Z, [x27]\n"
+ "fmla z5.s, p3/M, z17.s, z27.s\n"
+ "fmla z29.s, p3/M, z17.s, z24.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #-8, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z6.s\n"
+ "fmla z31.s, p3/M, z16.s, z10.s\n"
+ "ld1w { z19.s }, p2/Z, [x27, x17, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x27, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z24.s\n"
+ "fmla z29.s, p3/M, z16.s, z26.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #-7, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z14.s\n"
+ "fmla z31.s, p3/M, z21.s, z1.s\n"
+ "ld1w { z17.s }, p2/Z, [x27, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z21.s, z22.s\n"
+ "fmla z29.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #-6, MUL VL]\n"
+ "fmla z30.s, p3/M, z25.s, z1.s\n"
+ "fmla z31.s, p3/M, z25.s, z0.s\n"
+ "ld1w { z7.s }, p2/Z, [x27, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z25.s, z19.s\n"
+ "fmla z29.s, p3/M, z25.s, z18.s\n"
+ "ld1w { z10.s }, p3/Z, [x10, #-5, MUL VL]\n"
+ "fmla z30.s, p3/M, z23.s, z0.s\n"
+ "fmla z31.s, p3/M, z23.s, z27.s\n"
+ "ld1w { z11.s }, p2/Z, [x27, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z23.s, z18.s\n"
+ "fmla z29.s, p3/M, z23.s, z7.s\n"
+ "ld1w { z6.s }, p3/Z, [x10, #-4, MUL VL]\n"
+ "fmla z30.s, p3/M, z20.s, z27.s\n"
+ "fmla z31.s, p3/M, z20.s, z24.s\n"
+ "ld1w { z0.s }, p2/Z, [x25]\n"
+ "fmla z5.s, p3/M, z20.s, z7.s\n"
+ "fmla z29.s, p3/M, z20.s, z11.s\n"
+ "ld1w { z9.s }, p3/Z, [x10, #-3, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z24.s\n"
+ "fmla z31.s, p3/M, z16.s, z26.s\n"
+ "ld1w { z3.s }, p2/Z, [x25, x17, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x25, x26, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z11.s\n"
+ "fmla z29.s, p3/M, z16.s, z17.s\n"
+ "ld1w { z16.s }, p3/Z, [x10, #-2, MUL VL]\n"
+ "fmla z30.s, p3/M, z21.s, z22.s\n"
+ "fmla z31.s, p3/M, z21.s, z19.s\n"
+ "ld1w { z26.s }, p2/Z, [x25, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z21.s, z0.s\n"
+ "fmla z29.s, p3/M, z21.s, z3.s\n"
+ "ld1w { z25.s }, p3/Z, [x10, #-1, MUL VL]\n"
+ "fmla z30.s, p3/M, z10.s, z19.s\n"
+ "fmla z31.s, p3/M, z10.s, z18.s\n"
+ "ld1w { z24.s }, p2/Z, [x25, x28, LSL #2]\n"
+ "fmla z5.s, p3/M, z10.s, z3.s\n"
+ "fmla z29.s, p3/M, z10.s, z26.s\n"
+ "ld1w { z23.s }, p3/Z, [x10]\n"
+ "fmla z30.s, p3/M, z6.s, z18.s\n"
+ "fmla z31.s, p3/M, z6.s, z7.s\n"
+ "ld1w { z22.s }, p2/Z, [x25, x24, LSL #2]\n"
+ "fmla z5.s, p3/M, z6.s, z26.s\n"
+ "fmla z29.s, p3/M, z6.s, z24.s\n"
+ "ld1w { z21.s }, p3/Z, [x10, #1, MUL VL]\n"
+ "fmla z30.s, p3/M, z9.s, z7.s\n"
+ "fmla z31.s, p3/M, z9.s, z11.s\n"
+ "ld1w { z18.s }, p2/Z, [x23]\n"
+ "fmla z5.s, p3/M, z9.s, z24.s\n"
+ "fmla z29.s, p3/M, z9.s, z27.s\n"
+ "ld1w { z20.s }, p3/Z, [x10, #2, MUL VL]\n"
+ "fmla z30.s, p3/M, z16.s, z11.s\n"
+ "fmla z31.s, p3/M, z16.s, z17.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x17, LSL #2]\n"
+ "fmla z5.s, p3/M, z16.s, z27.s\n"
+ "fmla z29.s, p3/M, z16.s, z22.s\n"
+ "ld1w { z19.s }, p3/Z, [x10, #3, MUL VL]\n"
+ "fmla z30.s, p3/M, z25.s, z0.s\n"
+ "fmla z31.s, p3/M, z25.s, z3.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x15, LSL #2]\n"
+ "fmla z5.s, p3/M, z25.s, z18.s\n"
+ "fmla z29.s, p3/M, z25.s, z17.s\n"
+ "ld1w { z18.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "fmla z30.s, p3/M, z23.s, z3.s\n"
+ "fmla z31.s, p3/M, z23.s, z26.s\n"
+ "fmla z5.s, p3/M, z23.s, z17.s\n"
+ "fmla z29.s, p3/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x23, x26, LSL #2]\n"
+ "fmla z30.s, p3/M, z21.s, z26.s\n"
+ "fmla z31.s, p3/M, z21.s, z24.s\n"
+ "fmla z5.s, p3/M, z21.s, z16.s\n"
+ "fmla z29.s, p3/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p2/Z, [x23, x24, LSL #2]\n"
+ "fmla z30.s, p3/M, z20.s, z24.s\n"
+ "fmla z31.s, p3/M, z20.s, z27.s\n"
+ "fmla z5.s, p3/M, z20.s, z18.s\n"
+ "fmla z29.s, p3/M, z20.s, z17.s\n"
+ "fmla z30.s, p3/M, z19.s, z27.s\n"
+ "fmla z31.s, p3/M, z19.s, z22.s\n"
+ "fmax z30.s, p3/M, z30.s, z15.s\n"
+ "fmax z31.s, p3/M, z31.s, z15.s\n"
+ "fmla z5.s, p3/M, z19.s, z17.s\n"
+ "fmla z29.s, p3/M, z19.s, z16.s\n"
+ "fmax z5.s, p3/M, z5.s, z15.s\n"
+ "fmax z29.s, p3/M, z29.s, z15.s\n"
+ "fmin z30.s, p3/M, z30.s, z28.s\n"
+ "fmin z31.s, p3/M, z31.s, z28.s\n"
+ "st1w { z30.s }, p0, [x13]\n"
+ "fmin z5.s, p3/M, z5.s, z28.s\n"
+ "fmin z29.s, p3/M, z29.s, z28.s\n"
+ "st1w { z31.s }, p0, [x13, x16, LSL #2]\n"
+ "st1w { z5.s }, p0, [x22]\n"
+ "st1w { z29.s }, p0, [x22, x16, LSL #2]\n"
+ "blt 1b\n"
+ // No outputs; the kernel reads/writes memory through the pointers in
+ // params_struct, hence the "memory" clobber.
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 0000000000..bf65e04d32
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ struct Args
+ {
+ float *const *outptrs;
+ const void *params;
+ const float min, max;
+ const float *inptrs[36];
+
+ Args(
+ const float *const *const input_ptrs,
+ float *const *const outptrs,
+ const void *const params,
+ const float min,
+ const float max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x20, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldp x15, x14, [x20, #0x0]\n"
+ "mov x13, #0x0\n"
+ "ldp x12, x11, [x20, #0x10]\n"
+ "whilelt p3.s, XZR, %x[n_channels]\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "cntw x10\n"
+ "ptrue p2.b\n"
+ "ldr x9, [%x[params_struct], %[offsetof_args_params]]\n"
+ "ld1w { z5.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1w { z6.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "sub x28, XZR, x10\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[params_struct], %[offsetof_args_min]]\n"
+ "ld1rw { z28.s }, p2/Z, [%x[params_struct], %[offsetof_args_max]]\n"
+ "ld1w { z29.s }, p2/Z, [x9]\n"
+ "ld1w { z0.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ld1w { z7.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "addvl x9, x9, #6\n"
+ "ld1w { z8.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ld1w { z13.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ld1w { z11.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z12.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z10.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z14.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "movprfx z30, z29\n fmla z30.s, p2/M, z0.s, z5.s\n"
+ "movprfx z27, z29\n fmla z27.s, p2/M, z0.s, z6.s\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1w { z5.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z7.s\n"
+ "movprfx z26, z29\n fmla z26.s, p2/M, z0.s, z8.s\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
+ "fmla z30.s, p2/M, z1.s, z6.s\n"
+ "fmla z27.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z22.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z31.s, p2/M, z1.s, z8.s\n"
+ "fmla z26.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z21.s }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
+ "fmla z30.s, p2/M, z2.s, z9.s\n"
+ "fmla z27.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z20.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z31.s, p2/M, z2.s, z13.s\n"
+ "fmla z26.s, p2/M, z2.s, z5.s\n"
+ "ldr x22, [x16, #0x78]\n"
+ "ld1w { z17.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z3.s, z11.s\n"
+ "fmla z27.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z11.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x80]\n"
+ "fmla z31.s, p2/M, z3.s, z5.s\n"
+ "fmla z26.s, p2/M, z3.s, z22.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
+ "fmla z30.s, p2/M, z4.s, z12.s\n"
+ "fmla z27.s, p2/M, z4.s, z20.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z29.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z4.s, z22.s\n"
+ "fmla z26.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x23, [x16, #0x90]\n"
+ "fmla z30.s, p2/M, z21.s, z7.s\n"
+ "fmla z27.s, p2/M, z21.s, z8.s\n"
+ "ldr x26, [x16, #0x98]\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla z31.s, p2/M, z21.s, z14.s\n"
+ "fmla z26.s, p2/M, z21.s, z11.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.s, p2/M, z18.s, z8.s\n"
+ "fmla z27.s, p2/M, z18.s, z13.s\n"
+ "ld1w { z24.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z31.s, p2/M, z18.s, z11.s\n"
+ "fmla z26.s, p2/M, z18.s, z0.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.s, p2/M, z17.s, z13.s\n"
+ "fmla z27.s, p2/M, z17.s, z5.s\n"
+ "ld1w { z3.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0xc0]\n"
+ "fmla z31.s, p2/M, z17.s, z0.s\n"
+ "fmla z26.s, p2/M, z17.s, z29.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.s, p2/M, z16.s, z5.s\n"
+ "fmla z27.s, p2/M, z16.s, z22.s\n"
+ "ld1w { z6.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x27, [x16, #0xc8]\n"
+ "fmla z31.s, p2/M, z16.s, z29.s\n"
+ "fmla z26.s, p2/M, z16.s, z3.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x23, [x16, #0xd0]\n"
+ "fmla z30.s, p2/M, z19.s, z22.s\n"
+ "fmla z27.s, p2/M, z19.s, z10.s\n"
+ "ld1w { z23.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z19.s, z3.s\n"
+ "fmla z26.s, p2/M, z19.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x22, [x16, #0xd8]\n"
+ "fmla z30.s, p2/M, z25.s, z14.s\n"
+ "fmla z27.s, p2/M, z25.s, z11.s\n"
+ "ld1w { z1.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z31.s, p2/M, z25.s, z6.s\n"
+ "fmla z26.s, p2/M, z25.s, z23.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.s, p2/M, z18.s, z11.s\n"
+ "fmla z27.s, p2/M, z18.s, z0.s\n"
+ "ld1w { z7.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z31.s, p2/M, z18.s, z23.s\n"
+ "fmla z26.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z18.s }, p2/Z, [x9, #-5, MUL VL]\n"
+ "whilelt p1.s, x10, %x[n_channels]\n"
+ "fmla z30.s, p2/M, z17.s, z0.s\n"
+ "fmla z27.s, p2/M, z17.s, z29.s\n"
+ "ld1w { z19.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z31.s, p2/M, z17.s, z22.s\n"
+ "fmla z26.s, p2/M, z17.s, z7.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #-4, MUL VL]\n"
+ "incw x28\n"
+ "fmla z30.s, p2/M, z16.s, z29.s\n"
+ "fmla z27.s, p2/M, z16.s, z3.s\n"
+ "ld1w { z0.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x100]\n"
+ "fmla z31.s, p2/M, z16.s, z7.s\n"
+ "fmla z26.s, p2/M, z16.s, z19.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-3, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.s, p2/M, z21.s, z3.s\n"
+ "fmla z27.s, p2/M, z21.s, z24.s\n"
+ "ld1w { z11.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ld1w { z13.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z21.s, z19.s\n"
+ "fmla z26.s, p2/M, z21.s, z1.s\n"
+ "ld1w { z10.s }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x20, [x16, #0x108]\n"
+ "fmla z30.s, p2/M, z20.s, z6.s\n"
+ "fmla z27.s, p2/M, z20.s, z23.s\n"
+ "ld1w { z25.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x23, [x16, #0x110]\n"
+ "fmla z31.s, p2/M, z20.s, z0.s\n"
+ "fmla z26.s, p2/M, z20.s, z11.s\n"
+ "ld1w { z8.s }, p2/Z, [x9, #-1, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "fmla z30.s, p2/M, z18.s, z23.s\n"
+ "fmla z27.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x118]\n"
+ "fmla z31.s, p2/M, z18.s, z11.s\n"
+ "fmla z26.s, p2/M, z18.s, z25.s\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "fmla z30.s, p2/M, z17.s, z22.s\n"
+ "fmla z27.s, p2/M, z17.s, z7.s\n"
+ "ld1w { z22.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z17.s, z25.s\n"
+ "fmla z26.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z7.s\n"
+ "fmla z27.s, p2/M, z16.s, z19.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z16.s, z24.s\n"
+ "fmla z26.s, p2/M, z16.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z10.s, z19.s\n"
+ "fmla z27.s, p2/M, z10.s, z1.s\n"
+ "ld1w { z17.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z10.s, z13.s\n"
+ "fmla z26.s, p2/M, z10.s, z22.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.s, p2/M, z8.s, z0.s\n"
+ "fmla z27.s, p2/M, z8.s, z11.s\n"
+ "ld1w { z16.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z31.s, p2/M, z8.s, z18.s\n"
+ "fmla z26.s, p2/M, z8.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldp x21, x20, [x16, #0x0]\n"
+ "fmla z30.s, p2/M, z23.s, z11.s\n"
+ "fmla z27.s, p2/M, z23.s, z25.s\n"
+ "ld1w { z0.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "fmla z31.s, p2/M, z23.s, z17.s\n"
+ "fmla z26.s, p2/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z1.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "fmla z30.s, p2/M, z21.s, z25.s\n"
+ "fmla z27.s, p2/M, z21.s, z24.s\n"
+ "ld1w { z5.s }, p1/Z, [x21, x10, LSL #2]\n"
+ "fmla z31.s, p2/M, z21.s, z16.s\n"
+ "fmla z26.s, p2/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldp x27, x26, [x16, #0x10]\n"
+ "fmla z30.s, p2/M, z20.s, z24.s\n"
+ "fmla z27.s, p2/M, z20.s, z13.s\n"
+ "ld1w { z6.s }, p1/Z, [x20, x10, LSL #2]\n"
+ "ldp x25, x24, [x16, #0x20]\n"
+ "fmla z31.s, p2/M, z20.s, z18.s\n"
+ "fmla z26.s, p2/M, z20.s, z17.s\n"
+ "ldp x23, x22, [x16, #0x30]\n"
+ "ldp x21, x20, [x16, #0x40]\n"
+ "fmla z30.s, p2/M, z19.s, z13.s\n"
+ "fmla z27.s, p2/M, z19.s, z22.s\n"
+ "incw x13\n"
+ "ld1w { z7.s }, p1/Z, [x27, x10, LSL #2]\n"
+ "fmla z31.s, p2/M, z19.s, z17.s\n"
+ "fmla z26.s, p2/M, z19.s, z16.s\n"
+ "ld1w { z8.s }, p1/Z, [x26, x10, LSL #2]\n"
+ "ld1w { z9.s }, p1/Z, [x25, x10, LSL #2]\n"
+ "ld1w { z13.s }, p1/Z, [x24, x10, LSL #2]\n"
+ "ld1w { z11.s }, p1/Z, [x23, x10, LSL #2]\n"
+ "fmax z30.s, p2/M, z30.s, z15.s\n"
+ "fmax z27.s, p2/M, z27.s, z15.s\n"
+ "ld1w { z12.s }, p1/Z, [x22, x10, LSL #2]\n"
+ "ld1w { z10.s }, p1/Z, [x21, x10, LSL #2]\n"
+ "fmax z31.s, p2/M, z31.s, z15.s\n"
+ "fmax z26.s, p2/M, z26.s, z15.s\n"
+ "ld1w { z14.s }, p1/Z, [x20, x10, LSL #2]\n"
+ "incw x10\n"
+ "ld1w { z2.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "whilelt p3.s, x13, %x[n_channels]\n"
+ "cmp x10, %x[n_channels]\n"
+ "ld1w { z3.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "fmin z30.s, p2/M, z30.s, z28.s\n"
+ "fmin z27.s, p2/M, z27.s, z28.s\n"
+ "st1w { z30.s }, p0, [x15, x28, LSL #2]\n"
+ "fmin z31.s, p2/M, z31.s, z28.s\n"
+ "fmin z26.s, p2/M, z26.s, z28.s\n"
+ "st1w { z27.s }, p0, [x14, x28, LSL #2]\n"
+ "st1w { z31.s }, p0, [x12, x28, LSL #2]\n"
+ "addvl x9, x9, #-6\n"
+ "st1w { z26.s }, p0, [x11, x28, LSL #2]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "movprfx z30, z29\n fmla z30.s, p2/M, z0.s, z5.s\n"
+ "movprfx z31, z29\n fmla z31.s, p2/M, z0.s, z6.s\n"
+ "ldr x20, [x16, #0x50]\n"
+ "ld1w { z22.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "movprfx z5, z29\n fmla z5.s, p2/M, z0.s, z7.s\n"
+ "fmla z29.s, p2/M, z0.s, z8.s\n"
+ "ldr x20, [x16, #0x58]\n"
+ "ldr x21, [x16, #0x60]\n"
+ "fmla z30.s, p2/M, z1.s, z6.s\n"
+ "fmla z31.s, p2/M, z1.s, z9.s\n"
+ "ld1w { z6.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x68]\n"
+ "fmla z5.s, p2/M, z1.s, z8.s\n"
+ "fmla z29.s, p2/M, z1.s, z13.s\n"
+ "ld1w { z20.s }, p2/Z, [x9]\n"
+ "ldr x23, [x16, #0x70]\n"
+ "fmla z30.s, p2/M, z2.s, z9.s\n"
+ "fmla z31.s, p2/M, z2.s, z11.s\n"
+ "ld1w { z16.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ld1w { z19.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z5.s, p2/M, z2.s, z13.s\n"
+ "fmla z29.s, p2/M, z2.s, z22.s\n"
+ "ldr x21, [x16, #0x78]\n"
+ "ld1w { z18.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z3.s, z11.s\n"
+ "fmla z31.s, p2/M, z3.s, z12.s\n"
+ "ld1w { z1.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x22, [x16, #0x80]\n"
+ "fmla z5.s, p2/M, z3.s, z22.s\n"
+ "fmla z29.s, p2/M, z3.s, z6.s\n"
+ "ld1w { z17.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "ldr x20, [x16, #0x88]\n"
+ "fmla z30.s, p2/M, z4.s, z12.s\n"
+ "fmla z31.s, p2/M, z4.s, z16.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ld1w { z27.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z4.s, z6.s\n"
+ "fmla z29.s, p2/M, z4.s, z10.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #4, MUL VL]\n"
+ "ldr x21, [x16, #0x90]\n"
+ "fmla z30.s, p2/M, z20.s, z7.s\n"
+ "fmla z31.s, p2/M, z20.s, z8.s\n"
+ "ldr x27, [x16, #0x98]\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla z5.s, p2/M, z20.s, z14.s\n"
+ "fmla z29.s, p2/M, z20.s, z1.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #5, MUL VL]\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla z30.s, p2/M, z19.s, z8.s\n"
+ "fmla z31.s, p2/M, z19.s, z13.s\n"
+ "ld1w { z26.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla z5.s, p2/M, z19.s, z1.s\n"
+ "fmla z29.s, p2/M, z19.s, z0.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #6, MUL VL]\n"
+ "ldr x20, [x16, #0xb8]\n"
+ "fmla z30.s, p2/M, z18.s, z13.s\n"
+ "fmla z31.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z24.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ldr x23, [x16, #0xc0]\n"
+ "fmla z5.s, p2/M, z18.s, z0.s\n"
+ "fmla z29.s, p2/M, z18.s, z27.s\n"
+ "ld1w { z23.s }, p2/Z, [x9, #7, MUL VL]\n"
+ "addvl x9, x9, #16\n"
+ "fmla z30.s, p2/M, z17.s, z22.s\n"
+ "fmla z31.s, p2/M, z17.s, z6.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x22, [x16, #0xc8]\n"
+ "fmla z5.s, p2/M, z17.s, z27.s\n"
+ "fmla z29.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #-8, MUL VL]\n"
+ "ldr x21, [x16, #0xd0]\n"
+ "fmla z30.s, p2/M, z16.s, z6.s\n"
+ "fmla z31.s, p2/M, z16.s, z10.s\n"
+ "ld1w { z19.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z24.s\n"
+ "fmla z29.s, p2/M, z16.s, z26.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-7, MUL VL]\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla z30.s, p2/M, z21.s, z14.s\n"
+ "fmla z31.s, p2/M, z21.s, z1.s\n"
+ "ld1w { z17.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "ldr x20, [x16, #0xe0]\n"
+ "fmla z5.s, p2/M, z21.s, z22.s\n"
+ "fmla z29.s, p2/M, z21.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #-6, MUL VL]\n"
+ "ldr x26, [x16, #0xf8]\n"
+ "fmla z30.s, p2/M, z25.s, z1.s\n"
+ "fmla z31.s, p2/M, z25.s, z0.s\n"
+ "ld1w { z9.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla z5.s, p2/M, z25.s, z19.s\n"
+ "fmla z29.s, p2/M, z25.s, z18.s\n"
+ "ld1w { z4.s }, p2/Z, [x9, #-5, MUL VL]\n"
+ "incw x28\n"
+ "fmla z30.s, p2/M, z23.s, z0.s\n"
+ "fmla z31.s, p2/M, z23.s, z27.s\n"
+ "ld1w { z8.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla z5.s, p2/M, z23.s, z18.s\n"
+ "fmla z29.s, p2/M, z23.s, z9.s\n"
+ "ld1w { z6.s }, p2/Z, [x9, #-4, MUL VL]\n"
+ "mov p0.b, p3.b\n"
+ "fmla z30.s, p2/M, z20.s, z27.s\n"
+ "fmla z31.s, p2/M, z20.s, z24.s\n"
+ "ld1w { z10.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "ldr x23, [x16, #0x100]\n"
+ "fmla z5.s, p2/M, z20.s, z9.s\n"
+ "fmla z29.s, p2/M, z20.s, z8.s\n"
+ "ld1w { z11.s }, p2/Z, [x9, #-3, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z24.s\n"
+ "fmla z31.s, p2/M, z16.s, z26.s\n"
+ "ld1w { z0.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "ld1w { z27.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z8.s\n"
+ "fmla z29.s, p2/M, z16.s, z17.s\n"
+ "ld1w { z16.s }, p2/Z, [x9, #-2, MUL VL]\n"
+ "ldr x22, [x16, #0x108]\n"
+ "fmla z30.s, p2/M, z21.s, z22.s\n"
+ "fmla z31.s, p2/M, z21.s, z19.s\n"
+ "ld1w { z26.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "ldr x21, [x16, #0x110]\n"
+ "fmla z5.s, p2/M, z21.s, z10.s\n"
+ "fmla z29.s, p2/M, z21.s, z0.s\n"
+ "ld1w { z25.s }, p2/Z, [x9, #-1, MUL VL]\n"
+ "fmla z30.s, p2/M, z4.s, z19.s\n"
+ "fmla z31.s, p2/M, z4.s, z18.s\n"
+ "ld1w { z24.s }, p3/Z, [x27, x13, LSL #2]\n"
+ "ldr x20, [x16, #0x118]\n"
+ "fmla z5.s, p2/M, z4.s, z0.s\n"
+ "fmla z29.s, p2/M, z4.s, z26.s\n"
+ "ld1w { z23.s }, p2/Z, [x9]\n"
+ "fmla z30.s, p2/M, z6.s, z18.s\n"
+ "fmla z31.s, p2/M, z6.s, z9.s\n"
+ "ld1w { z22.s }, p3/Z, [x25, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z6.s, z26.s\n"
+ "fmla z29.s, p2/M, z6.s, z24.s\n"
+ "ld1w { z21.s }, p2/Z, [x9, #1, MUL VL]\n"
+ "fmla z30.s, p2/M, z11.s, z9.s\n"
+ "fmla z31.s, p2/M, z11.s, z8.s\n"
+ "ld1w { z18.s }, p3/Z, [x24, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z11.s, z24.s\n"
+ "fmla z29.s, p2/M, z11.s, z27.s\n"
+ "ld1w { z20.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "fmla z30.s, p2/M, z16.s, z8.s\n"
+ "fmla z31.s, p2/M, z16.s, z17.s\n"
+ "ld1w { z17.s }, p3/Z, [x26, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z16.s, z27.s\n"
+ "fmla z29.s, p2/M, z16.s, z22.s\n"
+ "ld1w { z19.s }, p2/Z, [x9, #3, MUL VL]\n"
+ "fmla z30.s, p2/M, z25.s, z10.s\n"
+ "fmla z31.s, p2/M, z25.s, z0.s\n"
+ "ld1w { z16.s }, p3/Z, [x23, x13, LSL #2]\n"
+ "fmla z5.s, p2/M, z25.s, z18.s\n"
+ "fmla z29.s, p2/M, z25.s, z17.s\n"
+ "ld1w { z18.s }, p3/Z, [x22, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z23.s, z0.s\n"
+ "fmla z31.s, p2/M, z23.s, z26.s\n"
+ "fmla z5.s, p2/M, z23.s, z17.s\n"
+ "fmla z29.s, p2/M, z23.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x21, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z21.s, z26.s\n"
+ "fmla z31.s, p2/M, z21.s, z24.s\n"
+ "fmla z5.s, p2/M, z21.s, z16.s\n"
+ "fmla z29.s, p2/M, z21.s, z18.s\n"
+ "ld1w { z16.s }, p3/Z, [x20, x13, LSL #2]\n"
+ "fmla z30.s, p2/M, z20.s, z24.s\n"
+ "fmla z31.s, p2/M, z20.s, z27.s\n"
+ "fmla z5.s, p2/M, z20.s, z18.s\n"
+ "fmla z29.s, p2/M, z20.s, z17.s\n"
+ "fmla z30.s, p2/M, z19.s, z27.s\n"
+ "fmla z31.s, p2/M, z19.s, z22.s\n"
+ "fmax z30.s, p2/M, z30.s, z15.s\n"
+ "fmax z31.s, p2/M, z31.s, z15.s\n"
+ "fmla z5.s, p2/M, z19.s, z17.s\n"
+ "fmla z29.s, p2/M, z19.s, z16.s\n"
+ "fmax z5.s, p2/M, z5.s, z15.s\n"
+ "fmax z29.s, p2/M, z29.s, z15.s\n"
+ "fmin z30.s, p2/M, z30.s, z28.s\n"
+ "fmin z31.s, p2/M, z31.s, z28.s\n"
+ "st1w { z30.s }, p0, [x15, x28, LSL #2]\n"
+ "fmin z5.s, p2/M, z5.s, z28.s\n"
+ "fmin z29.s, p2/M, z29.s, z28.s\n"
+ "st1w { z31.s }, p0, [x14, x28, LSL #2]\n"
+ "st1w { z5.s }, p0, [x12, x28, LSL #2]\n"
+ "st1w { z29.s }, p0, [x11, x28, LSL #2]\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 0000000000..6b155fc855
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+
+class sve_fp32_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<float, float, float, float>
+{
+ KernelType kernel = sve_fp32_nhwc_generic_output9_mla_depthfirst_impl;
+
+ public:
+ sve_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<float, float, float, float>(9, arm_gemm::VLType::SVE) {}
+
+ KernelType get_kernel() const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d53daaa8a0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const void *bias,
+ const unsigned int n_points,
+ const unsigned int n_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "mov x11, #0x0\n"
+ "ld1rw { z2.s }, p1/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z1.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "whilelt p0.s, x11, %x[n_channels]\n"
+ "1:" // Channel loop
+ "mov z23.b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ld1w { z23.s }, p0/Z, [%x[bias], x11, LSL #2]\n"
+ "2:" // Channel loop: Load bias: Done
+ "mov x10, %x[inptrs]\n"
+ "ldp x28, x27, [x10], #0x10\n"
+ "ldp x26, x25, [x10], #0x10\n"
+ "subs x9, %x[n_points], #0x1\n"
+ "ldp x24, x23, [x10], #0x10\n"
+ "ldp x22, x21, [x10], #0x10\n"
+ "mov z24.d, z23.d\n"
+ "mov z25.d, z23.d\n"
+ "ldr x20, [x10], #0x8\n"
+ "mov z26.d, z23.d\n"
+ "mov z27.d, z23.d\n"
+ "ld1w { z0.s }, p1/Z, [%x[params]]\n"
+ "mov z28.d, z23.d\n"
+ "mov z29.d, z23.d\n"
+ "ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
+ "ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
+ "mov z30.d, z23.d\n"
+ "mov z31.d, z23.d\n"
+ "ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
+ "ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
+ "addvl %x[params], %x[params], #1\n"
+ "ld1w { z20.s }, p0/Z, [x22, x11, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21, x11, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "ldp x28, x27, [x10], #0x10\n"
+ "ldp x26, x25, [x10], #0x10\n"
+ "subs x9, x9, #0x1\n"
+ "fmla z23.s, p1/M, z14.s, z0.s\n"
+ "ldp x24, x23, [x10], #0x10\n"
+ "ldp x22, x21, [x10], #0x10\n"
+ "fmla z24.s, p1/M, z15.s, z0.s\n"
+ "fmla z25.s, p1/M, z16.s, z0.s\n"
+ "ldr x20, [x10], #0x8\n"
+ "fmla z26.s, p1/M, z17.s, z0.s\n"
+ "fmla z27.s, p1/M, z18.s, z0.s\n"
+ "ld1w { z14.s }, p0/Z, [x28, x11, LSL #2]\n"
+ "fmla z28.s, p1/M, z19.s, z0.s\n"
+ "fmla z29.s, p1/M, z20.s, z0.s\n"
+ "ld1w { z15.s }, p0/Z, [x27, x11, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x26, x11, LSL #2]\n"
+ "fmla z30.s, p1/M, z21.s, z0.s\n"
+ "fmla z31.s, p1/M, z22.s, z0.s\n"
+ "ld1w { z0.s }, p1/Z, [%x[params]]\n"
+ "ld1w { z17.s }, p0/Z, [x25, x11, LSL #2]\n"
+ "ld1w { z18.s }, p0/Z, [x24, x11, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x23, x11, LSL #2]\n"
+ "addvl %x[params], %x[params], #1\n"
+ "ld1w { z20.s }, p0/Z, [x22, x11, LSL #2]\n"
+ "ld1w { z21.s }, p0/Z, [x21, x11, LSL #2]\n"
+ "ld1w { z22.s }, p0/Z, [x20, x11, LSL #2]\n"
+ "bgt 3b\n"
+ "4:" // Channel loop: Planar tail
+ "fmla z23.s, p1/M, z14.s, z0.s\n"
+ "fmla z24.s, p1/M, z15.s, z0.s\n"
+ "fmax z23.s, p1/M, z23.s, z2.s\n"
+ "fmax z24.s, p1/M, z24.s, z2.s\n"
+ "fmla z25.s, p1/M, z16.s, z0.s\n"
+ "fmla z26.s, p1/M, z17.s, z0.s\n"
+ "fmax z25.s, p1/M, z25.s, z2.s\n"
+ "fmax z26.s, p1/M, z26.s, z2.s\n"
+ "fmla z27.s, p1/M, z18.s, z0.s\n"
+ "fmla z28.s, p1/M, z19.s, z0.s\n"
+ "fmax z27.s, p1/M, z27.s, z2.s\n"
+ "fmax z28.s, p1/M, z28.s, z2.s\n"
+ "fmla z29.s, p1/M, z20.s, z0.s\n"
+ "fmla z30.s, p1/M, z21.s, z0.s\n"
+ "fmax z29.s, p1/M, z29.s, z2.s\n"
+ "fmax z30.s, p1/M, z30.s, z2.s\n"
+ "fmla z31.s, p1/M, z22.s, z0.s\n"
+ "fmax z31.s, p1/M, z31.s, z2.s\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "fmin z23.s, p1/M, z23.s, z1.s\n"
+ "fmin z24.s, p1/M, z24.s, z1.s\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "fmin z25.s, p1/M, z25.s, z1.s\n"
+ "fmin z26.s, p1/M, z26.s, z1.s\n"
+ "st1w { z23.s }, p0, [x28, x11, LSL #2]\n"
+ "fmin z27.s, p1/M, z27.s, z1.s\n"
+ "fmin z28.s, p1/M, z28.s, z1.s\n"
+ "st1w { z24.s }, p0, [x27, x11, LSL #2]\n"
+ "fmin z29.s, p1/M, z29.s, z1.s\n"
+ "fmin z30.s, p1/M, z30.s, z1.s\n"
+ "st1w { z25.s }, p0, [x26, x11, LSL #2]\n"
+ "fmin z31.s, p1/M, z31.s, z1.s\n"
+ "st1w { z26.s }, p0, [x25, x11, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x11, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x11, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x11, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x11, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x11, LSL #2]\n"
+ "incw x11\n"
+ "whilelt p0.s, x11, %x[n_channels]\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [minmax_vals] "r" (minmax_vals), [n_channels] "r" ((uint64_t) n_channels), [n_points] "r" ((uint64_t) n_points), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "x9", "x10", "x11", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
new file mode 100644
index 0000000000..eb1b111c36
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
+{
+ using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(3, 3, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..3a71baaf61
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const void *params,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "mov x17, #0x0\n"
+ "whilelt p2.s, x17, %x[channel_multiplier]\n"
+ "ldr x16, [%x[inptrs], #0x0]\n"
+ "ldr x15, [%x[inptrs], #0x8]\n"
+ "ptrue p1.b\n"
+ "ldr x14, [%x[inptrs], #0x10]\n"
+ "ldr x13, [%x[inptrs], #0x18]\n"
+ "mov x12, #0x0\n"
+ "ldr x11, [%x[inptrs], #0x20]\n"
+ "ldr x10, [%x[inptrs], #0x28]\n"
+ "ldr x9, [%x[inptrs], #0x30]\n"
+ "ld1w { z24.s }, p2/Z, [%x[params]]\n"
+ "mov z21.d, z24.d\n"
+ "mov z25.d, z24.d\n"
+ "ldp x28, x27, [%x[outptrs], #0x0]\n"
+ "ldp x26, x25, [%x[outptrs], #0x10]\n"
+ "mov z27.d, z24.d\n"
+ "mov z26.d, z24.d\n"
+ "ldp x24, x23, [%x[outptrs], #0x20]\n"
+ "ldp x22, x21, [%x[outptrs], #0x30]\n"
+ "mov z28.d, z24.d\n"
+ "mov z20.d, z24.d\n"
+ "ldr x20, [%x[outptrs], #0x40]\n"
+ "ld1rqw { z2.s }, p1/Z, [x16]\n"
+ "mov z23.d, z24.d\n"
+ "mov z19.d, z24.d\n"
+ "ld1rqw { z3.s }, p1/Z, [x16, #16]\n"
+ "ld1rqw { z4.s }, p1/Z, [x15]\n"
+ "ld1rqw { z5.s }, p1/Z, [x15, #16]\n"
+ "ld1rqw { z6.s }, p1/Z, [x14]\n"
+ "ld1rqw { z7.s }, p1/Z, [x14, #16]\n"
+ "ld1rqw { z8.s }, p1/Z, [x13]\n"
+ "ld1rqw { z9.s }, p1/Z, [x13, #16]\n"
+ "ld1rqw { z10.s }, p1/Z, [x11]\n"
+ "ld1rqw { z11.s }, p1/Z, [x11, #16]\n"
+ "ld1rqw { z12.s }, p1/Z, [x10]\n"
+ "ld1rqw { z13.s }, p1/Z, [x10, #16]\n"
+ "ld1rqw { z14.s }, p1/Z, [x9]\n"
+ "ld1rqw { z15.s }, p1/Z, [x9, #16]\n"
+ "ld1rw { z22.s }, p1/Z, [%x[clamps]]\n"
+ "ld1rw { z16.s }, p1/Z, [%x[clamps], #4]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "addvl %x[params], %x[params], #4\n"
+ "1:" // Output channel complete vector loop
+ "fmla z24.s, z31.s, z2.s[0]\n"
+ "fmla z27.s, z31.s, z6.s[0]\n"
+ "mov z1.d, z10.d\n"
+ "incw x17\n"
+ "fmla z26.s, z31.s, z6.s[2]\n"
+ "fmla z28.s, z31.s, z7.s[0]\n"
+ "mov z0.d, z11.d\n"
+ "mov p0.b, p2.b\n"
+ "fmla z21.s, z31.s, z2.s[2]\n"
+ "fmla z25.s, z31.s, z3.s[0]\n"
+ "whilelt p2.s, x17, %x[channel_multiplier]\n"
+ "fmla z20.s, z31.s, z1.s[0]\n"
+ "fmla z23.s, z31.s, z1.s[2]\n"
+ "fmla z19.s, z31.s, z0.s[0]\n"
+ "fmla z24.s, z30.s, z2.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params]]\n"
+ "fmla z27.s, z30.s, z6.s[1]\n"
+ "fmla z26.s, z30.s, z6.s[3]\n"
+ "fmla z28.s, z30.s, z7.s[1]\n"
+ "fmla z21.s, z30.s, z2.s[3]\n"
+ "fmla z25.s, z30.s, z3.s[1]\n"
+ "fmla z20.s, z30.s, z1.s[1]\n"
+ "fmla z23.s, z30.s, z1.s[3]\n"
+ "fmla z19.s, z30.s, z0.s[1]\n"
+ "ld1w { z17.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z24.s, z29.s, z2.s[2]\n"
+ "fmla z27.s, z29.s, z6.s[2]\n"
+ "fmla z26.s, z29.s, z7.s[0]\n"
+ "fmla z28.s, z29.s, z7.s[2]\n"
+ "fmla z21.s, z29.s, z3.s[0]\n"
+ "fmla z25.s, z29.s, z3.s[2]\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z23.s, z29.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z18.s, z4.s[0]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z27.s, z18.s, z1.s[0]\n"
+ "fmla z26.s, z18.s, z1.s[2]\n"
+ "mov z1.d, z12.d\n"
+ "fmla z28.s, z18.s, z0.s[0]\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z18.s, z4.s[2]\n"
+ "fmla z25.s, z18.s, z5.s[0]\n"
+ "fmla z20.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[2]\n"
+ "fmla z19.s, z18.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z17.s, z4.s[1]\n"
+ "fmla z27.s, z17.s, z1.s[1]\n"
+ "fmla z26.s, z17.s, z1.s[3]\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z17.s, z4.s[3]\n"
+ "fmla z25.s, z17.s, z5.s[1]\n"
+ "fmla z20.s, z17.s, z1.s[1]\n"
+ "fmla z23.s, z17.s, z1.s[3]\n"
+ "mov z1.d, z8.d\n"
+ "fmla z19.s, z17.s, z0.s[1]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z24.s, z31.s, z4.s[2]\n"
+ "ld1w { z17.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z27.s, z31.s, z1.s[2]\n"
+ "fmla z26.s, z31.s, z0.s[0]\n"
+ "mov z1.d, z12.d\n"
+ "fmla z28.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z13.d\n"
+ "fmla z21.s, z31.s, z5.s[0]\n"
+ "fmla z25.s, z31.s, z5.s[2]\n"
+ "fmla z20.s, z31.s, z1.s[2]\n"
+ "mov z1.d, z10.d\n"
+ "fmla z23.s, z31.s, z0.s[0]\n"
+ "fmla z19.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z11.d\n"
+ "ld1w { z29.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z24.s, z18.s, z6.s[0]\n"
+ "fmla z27.s, z18.s, z1.s[0]\n"
+ "fmla z26.s, z18.s, z1.s[2]\n"
+ "fmla z28.s, z18.s, z0.s[0]\n"
+ "mov z1.d, z14.d\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z18.s, z6.s[2]\n"
+ "fmla z25.s, z18.s, z7.s[0]\n"
+ "fmla z20.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[2]\n"
+ "mov z1.d, z10.d\n"
+ "fmla z19.s, z18.s, z0.s[0]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z24.s, z17.s, z6.s[1]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "fmla z27.s, z17.s, z1.s[1]\n"
+ "fmla z26.s, z17.s, z1.s[3]\n"
+ "mov z1.d, z14.d\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z17.s, z6.s[3]\n"
+ "fmla z25.s, z17.s, z7.s[1]\n"
+ "fmla z20.s, z17.s, z1.s[1]\n"
+ "fmla z23.s, z17.s, z1.s[3]\n"
+ "fmla z19.s, z17.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z24.s, z29.s, z6.s[2]\n"
+ "fmla z27.s, z29.s, z1.s[2]\n"
+ "fmin z24.s, p1/M, z24.s, z16.s\n"
+ "fmla z26.s, z29.s, z0.s[0]\n"
+ "fmla z28.s, z29.s, z0.s[2]\n"
+ "mov z1.d, z14.d\n"
+ "fmax z24.s, p1/M, z24.s, z22.s\n"
+ "mov z0.d, z15.d\n"
+ "fmla z21.s, z29.s, z7.s[0]\n"
+ "fmla z25.s, z29.s, z7.s[2]\n"
+ "fmin z21.s, p1/M, z21.s, z16.s\n"
+ "fmla z20.s, z29.s, z1.s[2]\n"
+ "fmla z23.s, z29.s, z0.s[0]\n"
+ "fmin z25.s, p1/M, z25.s, z16.s\n"
+ "fmin z27.s, p1/M, z27.s, z16.s\n"
+ "fmla z19.s, z29.s, z0.s[2]\n"
+ "fmin z26.s, p1/M, z26.s, z16.s\n"
+ "fmin z28.s, p1/M, z28.s, z16.s\n"
+ "st1w { z24.s }, p0, [x28, x12, LSL #2]\n"
+ "fmin z20.s, p1/M, z20.s, z16.s\n"
+ "fmin z23.s, p1/M, z23.s, z16.s\n"
+ "ld1w { z24.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "fmin z19.s, p1/M, z19.s, z16.s\n"
+ "addvl %x[params], %x[params], #16\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "fmax z21.s, p1/M, z21.s, z22.s\n"
+ "fmax z25.s, p1/M, z25.s, z22.s\n"
+ "st1w { z21.s }, p0, [x27, x12, LSL #2]\n"
+ "mov z21.d, z24.d\n"
+ "fmax z27.s, p1/M, z27.s, z22.s\n"
+ "fmax z26.s, p1/M, z26.s, z22.s\n"
+ "st1w { z25.s }, p0, [x26, x12, LSL #2]\n"
+ "mov z25.d, z24.d\n"
+ "fmax z28.s, p1/M, z28.s, z22.s\n"
+ "fmax z20.s, p1/M, z20.s, z22.s\n"
+ "st1w { z27.s }, p0, [x25, x12, LSL #2]\n"
+ "mov z27.d, z24.d\n"
+ "fmax z23.s, p1/M, z23.s, z22.s\n"
+ "fmax z19.s, p1/M, z19.s, z22.s\n"
+ "st1w { z26.s }, p0, [x24, x12, LSL #2]\n"
+ "mov z26.d, z24.d\n"
+ "st1w { z28.s }, p0, [x23, x12, LSL #2]\n"
+ "mov z28.d, z24.d\n"
+ "addvl %x[params], %x[params], #-6\n"
+ "st1w { z20.s }, p0, [x22, x12, LSL #2]\n"
+ "mov z20.d, z24.d\n"
+ "st1w { z23.s }, p0, [x21, x12, LSL #2]\n"
+ "mov z23.d, z24.d\n"
+ "st1w { z19.s }, p0, [x20, x12, LSL #2]\n"
+ "incw x12\n"
+ "mov z19.d, z24.d\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
new file mode 100644
index 0000000000..cc0c4236a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
+
+struct sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>  // Strategy descriptor binding the hand-written 5x5, stride-1, channel-multiplier FP32 SVE kernel into the depthfirst driver.
+{
+ using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;  // input, weight, output and accumulator types are all fp32
+ constexpr static unsigned int kernel_rows = 5;  // 5x5 convolution window
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;  // unit stride in both spatial dimensions
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *)  // CPUInfo argument is unused here; the parent records the 2x4 output tile and the kernel geometry
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }  // vector length is determined by the SVE hardware at run time
+
+ Parent::KernelType kernel = sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;  // assembly implementation (defined in the matching generic.cpp)
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..84ab4b5035
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp
@@ -0,0 +1,392 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(  // Hand-scheduled SVE kernel: 5x5, stride-1 depthwise convolution with channel multiplier, producing a 2x4 spatial output tile per call.
+ const float *const *const inptrs,  // input row pointers; the asm reads six of them (offsets 0x0..0x28) and loads two 128-bit quadwords from each
+ float *const *const outptrs,  // output pointers; the asm reads eight of them (offsets 0x0..0x38), one per output element of the 2x4 tile
+ const void *params,  // packed per-channel weights (and, presumably, bias in the leading vector — confirm against the packing routine); advanced in place via addvl
+ const unsigned int n_output_channels,  // bound for the per-channel whilelt predicate loop
+ const float activation_min,  // lower activation clamp bound
+ const float activation_max  // upper activation clamp bound
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };  // packed pair so both clamp bounds are loadable from one base pointer in the asm
+
+ __asm__ __volatile__(
+ "mov x15, #0x0\n"
+ "whilelt p2.s, x15, %x[channel_multiplier]\n"
+ "ldr x14, [%x[inptrs], #0x0]\n"
+ "ldr x13, [%x[inptrs], #0x8]\n"
+ "ptrue p1.b\n"
+ "ldr x12, [%x[inptrs], #0x10]\n"
+ "ldr x11, [%x[inptrs], #0x18]\n"
+ "mov x10, #0x0\n"
+ "ldr x9, [%x[inptrs], #0x20]\n"
+ "ldr x28, [%x[inptrs], #0x28]\n"
+ "ld1w { z16.s }, p2/Z, [%x[params]]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "mov z25.d, z16.d\n"
+ "mov z15.d, z16.d\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "mov z24.d, z16.d\n"
+ "mov z14.d, z16.d\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "ld1rqw { z2.s }, p1/Z, [x14]\n"
+ "mov z26.d, z16.d\n"
+ "mov z17.d, z16.d\n"
+ "ld1rqw { z3.s }, p1/Z, [x14, #16]\n"
+ "ld1rqw { z4.s }, p1/Z, [x13]\n"
+ "mov z23.d, z16.d\n"
+ "ld1rqw { z5.s }, p1/Z, [x13, #16]\n"
+ "ld1rqw { z6.s }, p1/Z, [x12]\n"
+ "ld1rqw { z7.s }, p1/Z, [x12, #16]\n"
+ "ld1rqw { z8.s }, p1/Z, [x11]\n"
+ "ld1rqw { z9.s }, p1/Z, [x11, #16]\n"
+ "ld1rqw { z10.s }, p1/Z, [x9]\n"
+ "ld1rqw { z11.s }, p1/Z, [x9, #16]\n"
+ "ld1rqw { z12.s }, p1/Z, [x28]\n"
+ "ld1rqw { z13.s }, p1/Z, [x28, #16]\n"
+ "ld1rw { z21.s }, p1/Z, [%x[clamps]]\n"
+ "ld1rw { z22.s }, p1/Z, [%x[clamps], #4]\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #6\n"
+ "1:" // Output channel complete vector loop
+ "fmla z16.s, z31.s, z2.s[0]\n"
+ "fmla z25.s, z31.s, z2.s[1]\n"
+ "mov z0.d, z8.d\n"
+ "incw x15\n"
+ "fmla z15.s, z31.s, z2.s[2]\n"
+ "fmla z24.s, z31.s, z2.s[3]\n"
+ "mov z1.d, z9.d\n"
+ "mov p0.b, p2.b\n"
+ "fmla z14.s, z31.s, z4.s[0]\n"
+ "fmla z26.s, z31.s, z4.s[1]\n"
+ "whilelt p2.s, x15, %x[channel_multiplier]\n"
+ "fmla z17.s, z31.s, z4.s[2]\n"
+ "fmla z23.s, z31.s, z4.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params]]\n"
+ "fmla z16.s, z30.s, z2.s[1]\n"
+ "fmla z25.s, z30.s, z2.s[2]\n"
+ "fmla z15.s, z30.s, z2.s[3]\n"
+ "fmla z24.s, z30.s, z3.s[0]\n"
+ "fmla z14.s, z30.s, z4.s[1]\n"
+ "fmla z26.s, z30.s, z4.s[2]\n"
+ "fmla z17.s, z30.s, z4.s[3]\n"
+ "fmla z23.s, z30.s, z5.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "fmla z16.s, z29.s, z2.s[2]\n"
+ "fmla z25.s, z29.s, z2.s[3]\n"
+ "fmla z15.s, z29.s, z3.s[0]\n"
+ "fmla z24.s, z29.s, z3.s[1]\n"
+ "fmla z14.s, z29.s, z4.s[2]\n"
+ "fmla z26.s, z29.s, z4.s[3]\n"
+ "fmla z17.s, z29.s, z5.s[0]\n"
+ "fmla z23.s, z29.s, z5.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z16.s, z28.s, z2.s[3]\n"
+ "fmla z25.s, z28.s, z3.s[0]\n"
+ "fmla z15.s, z28.s, z3.s[1]\n"
+ "fmla z24.s, z28.s, z3.s[2]\n"
+ "fmla z14.s, z28.s, z4.s[3]\n"
+ "fmla z26.s, z28.s, z5.s[0]\n"
+ "fmla z17.s, z28.s, z5.s[1]\n"
+ "fmla z23.s, z28.s, z5.s[2]\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z16.s, z27.s, z3.s[0]\n"
+ "fmla z25.s, z27.s, z3.s[1]\n"
+ "fmla z15.s, z27.s, z3.s[2]\n"
+ "fmla z24.s, z27.s, z3.s[3]\n"
+ "fmla z14.s, z27.s, z5.s[0]\n"
+ "fmla z26.s, z27.s, z5.s[1]\n"
+ "fmla z17.s, z27.s, z5.s[2]\n"
+ "fmla z23.s, z27.s, z5.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #4, MUL VL]\n"
+ "fmla z16.s, z20.s, z4.s[0]\n"
+ "fmla z25.s, z20.s, z4.s[1]\n"
+ "fmla z15.s, z20.s, z4.s[2]\n"
+ "fmla z24.s, z20.s, z4.s[3]\n"
+ "fmla z14.s, z20.s, z6.s[0]\n"
+ "fmla z26.s, z20.s, z6.s[1]\n"
+ "fmla z17.s, z20.s, z6.s[2]\n"
+ "fmla z23.s, z20.s, z6.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z16.s, z19.s, z4.s[1]\n"
+ "fmla z25.s, z19.s, z4.s[2]\n"
+ "fmla z15.s, z19.s, z4.s[3]\n"
+ "fmla z24.s, z19.s, z5.s[0]\n"
+ "fmla z14.s, z19.s, z6.s[1]\n"
+ "fmla z26.s, z19.s, z6.s[2]\n"
+ "fmla z17.s, z19.s, z6.s[3]\n"
+ "fmla z23.s, z19.s, z7.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #6, MUL VL]\n"
+ "fmla z16.s, z18.s, z4.s[2]\n"
+ "fmla z25.s, z18.s, z4.s[3]\n"
+ "fmla z15.s, z18.s, z5.s[0]\n"
+ "fmla z24.s, z18.s, z5.s[1]\n"
+ "fmla z14.s, z18.s, z6.s[2]\n"
+ "fmla z26.s, z18.s, z6.s[3]\n"
+ "fmla z17.s, z18.s, z7.s[0]\n"
+ "fmla z23.s, z18.s, z7.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "fmla z16.s, z28.s, z4.s[3]\n"
+ "fmla z25.s, z28.s, z5.s[0]\n"
+ "fmla z15.s, z28.s, z5.s[1]\n"
+ "fmla z24.s, z28.s, z5.s[2]\n"
+ "fmla z14.s, z28.s, z6.s[3]\n"
+ "fmla z26.s, z28.s, z7.s[0]\n"
+ "fmla z17.s, z28.s, z7.s[1]\n"
+ "fmla z23.s, z28.s, z7.s[2]\n"
+ "ld1w { z30.s }, p1/Z, [%x[params], #-8, MUL VL]\n"
+ "fmla z16.s, z27.s, z5.s[0]\n"
+ "fmla z25.s, z27.s, z5.s[1]\n"
+ "fmla z15.s, z27.s, z5.s[2]\n"
+ "fmla z24.s, z27.s, z5.s[3]\n"
+ "fmla z14.s, z27.s, z7.s[0]\n"
+ "fmla z26.s, z27.s, z7.s[1]\n"
+ "fmla z17.s, z27.s, z7.s[2]\n"
+ "fmla z23.s, z27.s, z7.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #-7, MUL VL]\n"
+ "fmla z16.s, z20.s, z6.s[0]\n"
+ "fmla z25.s, z20.s, z6.s[1]\n"
+ "fmla z15.s, z20.s, z6.s[2]\n"
+ "fmla z24.s, z20.s, z6.s[3]\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #-6, MUL VL]\n"
+ "fmla z16.s, z19.s, z6.s[1]\n"
+ "fmla z25.s, z19.s, z6.s[2]\n"
+ "fmla z15.s, z19.s, z6.s[3]\n"
+ "fmla z24.s, z19.s, z7.s[0]\n"
+ "fmla z14.s, z19.s, z0.s[1]\n"
+ "fmla z26.s, z19.s, z0.s[2]\n"
+ "fmla z17.s, z19.s, z0.s[3]\n"
+ "fmla z23.s, z19.s, z1.s[0]\n"
+ "ld1w { z19.s }, p1/Z, [%x[params], #-5, MUL VL]\n"
+ "fmla z16.s, z18.s, z6.s[2]\n"
+ "fmla z25.s, z18.s, z6.s[3]\n"
+ "fmla z15.s, z18.s, z7.s[0]\n"
+ "fmla z24.s, z18.s, z7.s[1]\n"
+ "fmla z14.s, z18.s, z0.s[2]\n"
+ "fmla z26.s, z18.s, z0.s[3]\n"
+ "fmla z17.s, z18.s, z1.s[0]\n"
+ "fmla z23.s, z18.s, z1.s[1]\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #-4, MUL VL]\n"
+ "fmla z16.s, z30.s, z6.s[3]\n"
+ "fmla z25.s, z30.s, z7.s[0]\n"
+ "fmla z15.s, z30.s, z7.s[1]\n"
+ "fmla z24.s, z30.s, z7.s[2]\n"
+ "fmla z14.s, z30.s, z0.s[3]\n"
+ "fmla z26.s, z30.s, z1.s[0]\n"
+ "fmla z17.s, z30.s, z1.s[1]\n"
+ "fmla z23.s, z30.s, z1.s[2]\n"
+ "ld1w { z31.s }, p1/Z, [%x[params], #-3, MUL VL]\n"
+ "fmla z16.s, z27.s, z7.s[0]\n"
+ "fmla z25.s, z27.s, z7.s[1]\n"
+ "fmla z15.s, z27.s, z7.s[2]\n"
+ "fmla z24.s, z27.s, z7.s[3]\n"
+ "fmla z14.s, z27.s, z1.s[0]\n"
+ "fmla z26.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z23.s, z27.s, z1.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #-2, MUL VL]\n"
+ "fmla z16.s, z20.s, z0.s[0]\n"
+ "fmla z25.s, z20.s, z0.s[1]\n"
+ "fmla z15.s, z20.s, z0.s[2]\n"
+ "fmla z24.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z8.d\n"
+ "ld1w { z20.s }, p1/Z, [%x[params], #-1, MUL VL]\n"
+ "fmla z16.s, z19.s, z0.s[1]\n"
+ "fmla z25.s, z19.s, z0.s[2]\n"
+ "fmla z15.s, z19.s, z0.s[3]\n"
+ "fmla z24.s, z19.s, z1.s[0]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z19.s, z1.s[1]\n"
+ "fmla z26.s, z19.s, z1.s[2]\n"
+ "fmla z17.s, z19.s, z1.s[3]\n"
+ "fmla z23.s, z19.s, z0.s[0]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z19.s }, p1/Z, [%x[params]]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z16.s, z18.s, z1.s[2]\n"
+ "fmla z25.s, z18.s, z1.s[3]\n"
+ "fmla z15.s, z18.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z18.s, z1.s[2]\n"
+ "fmla z26.s, z18.s, z1.s[3]\n"
+ "fmla z17.s, z18.s, z0.s[0]\n"
+ "fmla z23.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z8.d\n"
+ "ld1w { z18.s }, p1/Z, [%x[params], #1, MUL VL]\n"
+ "mov z0.d, z9.d\n"
+ "fmla z16.s, z31.s, z1.s[3]\n"
+ "fmla z25.s, z31.s, z0.s[0]\n"
+ "fmla z15.s, z31.s, z0.s[1]\n"
+ "fmla z24.s, z31.s, z0.s[2]\n"
+ "mov z0.d, z10.d\n"
+ "mov z1.d, z11.d\n"
+ "fmla z14.s, z31.s, z0.s[3]\n"
+ "fmla z26.s, z31.s, z1.s[0]\n"
+ "fmla z17.s, z31.s, z1.s[1]\n"
+ "fmla z23.s, z31.s, z1.s[2]\n"
+ "mov z1.d, z9.d\n"
+ "ld1w { z28.s }, p1/Z, [%x[params], #2, MUL VL]\n"
+ "fmla z16.s, z27.s, z1.s[0]\n"
+ "fmla z25.s, z27.s, z1.s[1]\n"
+ "fmla z15.s, z27.s, z1.s[2]\n"
+ "fmla z24.s, z27.s, z1.s[3]\n"
+ "mov z1.d, z11.d\n"
+ "fmla z14.s, z27.s, z1.s[0]\n"
+ "fmla z26.s, z27.s, z1.s[1]\n"
+ "fmla z17.s, z27.s, z1.s[2]\n"
+ "fmla z23.s, z27.s, z1.s[3]\n"
+ "ld1w { z27.s }, p1/Z, [%x[params], #3, MUL VL]\n"
+ "fmla z16.s, z20.s, z0.s[0]\n"
+ "fmla z25.s, z20.s, z0.s[1]\n"
+ "fmla z15.s, z20.s, z0.s[2]\n"
+ "fmla z24.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z12.d\n"
+ "fmla z14.s, z20.s, z0.s[0]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "fmla z17.s, z20.s, z0.s[2]\n"
+ "fmla z23.s, z20.s, z0.s[3]\n"
+ "mov z0.d, z10.d\n"
+ "ld1w { z31.s }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "fmla z16.s, z19.s, z0.s[1]\n"
+ "fmla z25.s, z19.s, z0.s[2]\n"
+ "fmla z15.s, z19.s, z0.s[3]\n"
+ "fmla z24.s, z19.s, z1.s[0]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z14.s, z19.s, z1.s[1]\n"
+ "fmla z26.s, z19.s, z1.s[2]\n"
+ "fmla z17.s, z19.s, z1.s[3]\n"
+ "fmla z23.s, z19.s, z0.s[0]\n"
+ "mov z1.d, z10.d\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z16.s, z18.s, z1.s[2]\n"
+ "fmla z25.s, z18.s, z1.s[3]\n"
+ "fmla z15.s, z18.s, z0.s[0]\n"
+ "fmla z24.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z12.d\n"
+ "mov z0.d, z13.d\n"
+ "fmla z14.s, z18.s, z1.s[2]\n"
+ "fmla z26.s, z18.s, z1.s[3]\n"
+ "fmla z17.s, z18.s, z0.s[0]\n"
+ "fmla z23.s, z18.s, z0.s[1]\n"
+ "mov z1.d, z10.d\n"
+ "ld1w { z29.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z16.s, z28.s, z1.s[3]\n"
+ "fmla z25.s, z28.s, z0.s[0]\n"
+ "fmla z15.s, z28.s, z0.s[1]\n"
+ "fmla z24.s, z28.s, z0.s[2]\n"
+ "mov z0.d, z13.d\n"
+ "mov z1.d, z12.d\n"
+ "fmla z26.s, z28.s, z0.s[0]\n"
+ "fmla z17.s, z28.s, z0.s[1]\n"
+ "fmla z23.s, z28.s, z0.s[2]\n"
+ "mov z0.d, z11.d\n"
+ "fmla z14.s, z28.s, z1.s[3]\n"
+ "fmla z16.s, z27.s, z0.s[0]\n"
+ "fmla z25.s, z27.s, z0.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z22.s\n" // activation clamp begins: z22 holds activation_max, z21 holds activation_min
+ "fmax z16.s, p1/M, z16.s, z21.s\n"
+ "fmla z15.s, z27.s, z0.s[2]\n"
+ "fmla z24.s, z27.s, z0.s[3]\n"
+ "mov z0.d, z13.d\n"
+ "fmin z25.s, p1/M, z25.s, z22.s\n"
+ "fmla z14.s, z27.s, z0.s[0]\n"
+ "fmla z26.s, z27.s, z0.s[1]\n"
+ "fmin z15.s, p1/M, z15.s, z22.s\n"
+ "fmin z24.s, p1/M, z24.s, z22.s\n"
+ "fmla z17.s, z27.s, z0.s[2]\n"
+ "fmla z23.s, z27.s, z0.s[3]\n"
+ "fmin z14.s, p1/M, z14.s, z22.s\n"
+ "fmin z26.s, p1/M, z26.s, z22.s\n"
+ "fmin z17.s, p1/M, z17.s, z22.s\n"
+ "fmin z23.s, p1/M, z23.s, z22.s\n"
+ "st1w { z16.s }, p0, [x27, x10, LSL #2]\n" // store the 2x4 tile, one output pointer per element, then re-seed accumulators from the next channel's leading params vector
+ "ld1w { z16.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "fmax z25.s, p1/M, z25.s, z21.s\n"
+ "st1w { z25.s }, p0, [x26, x10, LSL #2]\n"
+ "mov z25.d, z16.d\n"
+ "fmax z15.s, p1/M, z15.s, z21.s\n"
+ "fmax z24.s, p1/M, z24.s, z21.s\n"
+ "st1w { z15.s }, p0, [x25, x10, LSL #2]\n"
+ "mov z15.d, z16.d\n"
+ "fmax z14.s, p1/M, z14.s, z21.s\n"
+ "fmax z26.s, p1/M, z26.s, z21.s\n"
+ "st1w { z24.s }, p0, [x24, x10, LSL #2]\n"
+ "mov z24.d, z16.d\n"
+ "fmax z17.s, p1/M, z17.s, z21.s\n"
+ "fmax z23.s, p1/M, z23.s, z21.s\n"
+ "st1w { z14.s }, p0, [x23, x10, LSL #2]\n"
+ "mov z14.d, z16.d\n"
+ "st1w { z26.s }, p0, [x22, x10, LSL #2]\n"
+ "mov z26.d, z16.d\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "st1w { z17.s }, p0, [x21, x10, LSL #2]\n"
+ "mov z17.d, z16.d\n"
+ "addvl %x[params], %x[params], #-6\n"
+ "st1w { z23.s }, p0, [x20, x10, LSL #2]\n"
+ "incw x10\n" // x10 indexes the output channel vector; whilelt above keeps lanes past n_output_channels inactive
+ "mov z23.d, z16.d\n"
+ "b.any 1b\n" // loop while whilelt left any channel lane active
+ : [params] "+&r" (params)
+ : [channel_multiplier] "r" (n_output_channels), [clamps] "r" (minmax_vals), [inptrs] "r" (inptrs), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
new file mode 100644
index 0000000000..f83767d8ae
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
+
+struct sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>  // Strategy descriptor for the generic (arbitrary kernel-point count) channel-multiplier FP32 SVE kernel producing a 2x8 output tile.
+{
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>;  // input, weight, output and accumulator types are all fp32
+ sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)  // CPUInfo argument is unused; the parent records the 2x8 output tile and the SVE vector-length type
+ : Parent(2, 8, arm_gemm::VLType::SVE)
+ {
+ }
+ Parent::KernelType kernel = sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;  // assembly implementation (defined in the matching generic.cpp)
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..1770ec182c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -0,0 +1,454 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const float *weights,
+ const float *bias,
+ const unsigned int kernel_points,
+ const unsigned int n_output_channels,
+ const float activation_min,
+ const float activation_max
+)
+{
+ const float minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ptrue p1.b\n"
+ "mov x9, #0x0\n"
+ "ld1rw { z15.s }, p1/Z, [%x[minmax_vals]]\n"
+ "ld1rw { z14.s }, p1/Z, [%x[minmax_vals], #4]\n"
+ "whilelt p0.s, x9, %x[n_output_channels]\n"
+ "1:" // Output channel loop
+ "mov z31.b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ld1w { z31.s }, p0/Z, [%x[bias], x9, LSL #2]\n"
+ "2:" // Output channel loop: Load bias: Done
+ "mov x23, %x[inptrs]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "lsr x22, %x[kernel_points], #0x1\n"
+ "mov z16.d, z31.d\n"
+ "mov z17.d, z31.d\n"
+ "mov z18.d, z31.d\n"
+ "ld1rqw { z6.s }, p1/Z, [x21]\n"
+ "ld1rqw { z5.s }, p1/Z, [x21, #16]\n"
+ "mov z19.d, z31.d\n"
+ "mov z20.d, z31.d\n"
+ "ld1rqw { z1.s }, p1/Z, [x20]\n"
+ "ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
+ "mov z21.d, z31.d\n"
+ "mov z22.d, z31.d\n"
+ "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+ "addvl %x[weights], %x[weights], #1\n"
+ "mov z23.d, z31.d\n"
+ "mov z24.d, z31.d\n"
+ "mov z25.d, z31.d\n"
+ "mov z26.d, z31.d\n"
+ "mov z27.d, z31.d\n"
+ "mov z28.d, z31.d\n"
+ "mov z29.d, z31.d\n"
+ "mov z30.d, z31.d\n"
+ "mov z31.d, z31.d\n"
+ "cbz x22, 6f\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "subs x22, x22, #0x1\n"
+ "ld1rqw { z0.s }, p1/Z, [x21]\n"
+ "ld1rqw { z4.s }, p1/Z, [x21, #16]\n"
+ "ld1rqw { z7.s }, p1/Z, [x20]\n"
+ "ld1rqw { z3.s }, p1/Z, [x20, #16]\n"
+ "ld1w { z11.s }, p1/Z, [%x[weights]]\n"
+ "addvl %x[weights], %x[weights], #1\n"
+ "beq 4f\n"
+ "3:" // Output channel loop: Kernel loop
+ "ldp x21, x20, [x23], #0x10\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "subs x22, x22, #0x1\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x21]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x21, #16]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x20]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ld1rqw { z2.s }, p1/Z, [x20, #16]\n"
+ "ldp x21, x20, [x23], #0x10\n"
+ "ld1w { z8.s }, p1/Z, [%x[weights]]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "ld1rqw { z0.s }, p1/Z, [x21]\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "ld1rqw { z4.s }, p1/Z, [x21, #16]\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "ld1rqw { z7.s }, p1/Z, [x20]\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "ld1rqw { z3.s }, p1/Z, [x20, #16]\n"
+ "ld1w { z11.s }, p1/Z, [%x[weights], #1, MUL VL]\n"
+ "addvl %x[weights], %x[weights], #2\n"
+ "bgt 3b\n"
+ "4:" // Output channel loop: Kernel loop tail
+ "tbnz %x[kernel_points], #0, 5f\n"
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
+ "b 7f\n"
+ "5:" // Output channel loop: Odd tail
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "ldp x20, x28, [x23], #0x10\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "ld1rqw { z6.s }, p1/Z, [x20]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "ld1rqw { z5.s }, p1/Z, [x20, #16]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ld1rqw { z1.s }, p1/Z, [x28]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ld1w { z10.s }, p1/Z, [%x[weights]]\n"
+ "ld1rqw { z2.s }, p1/Z, [x28, #16]\n"
+ "fmla z16.s, z11.s, z0.s[0]\n"
+ "fmla z17.s, z11.s, z0.s[1]\n"
+ "addvl %x[weights], %x[weights], #1\n"
+ "fmla z18.s, z11.s, z0.s[2]\n"
+ "fmla z19.s, z11.s, z0.s[3]\n"
+ "fmla z20.s, z11.s, z4.s[0]\n"
+ "fmla z21.s, z11.s, z4.s[1]\n"
+ "fmla z22.s, z11.s, z4.s[2]\n"
+ "fmla z23.s, z11.s, z4.s[3]\n"
+ "fmla z24.s, z11.s, z7.s[0]\n"
+ "fmla z25.s, z11.s, z7.s[1]\n"
+ "fmla z26.s, z11.s, z7.s[2]\n"
+ "fmla z27.s, z11.s, z7.s[3]\n"
+ "fmla z28.s, z11.s, z3.s[0]\n"
+ "fmla z29.s, z11.s, z3.s[1]\n"
+ "fmla z30.s, z11.s, z3.s[2]\n"
+ "fmla z31.s, z11.s, z3.s[3]\n"
+ "fmla z16.s, z10.s, z6.s[0]\n"
+ "fmla z17.s, z10.s, z6.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z10.s, z6.s[2]\n"
+ "fmla z19.s, z10.s, z6.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z10.s, z5.s[0]\n"
+ "fmla z21.s, z10.s, z5.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z10.s, z5.s[2]\n"
+ "fmla z23.s, z10.s, z5.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z10.s, z1.s[0]\n"
+ "fmla z25.s, z10.s, z1.s[1]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "fmla z26.s, z10.s, z1.s[2]\n"
+ "fmla z27.s, z10.s, z1.s[3]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "fmla z28.s, z10.s, z2.s[0]\n"
+ "fmla z29.s, z10.s, z2.s[1]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "fmla z30.s, z10.s, z2.s[2]\n"
+ "fmla z31.s, z10.s, z2.s[3]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
+ "b 7f\n"
+ "6:" // Output channel loop: Single kernel point
+ "fmla z16.s, z8.s, z6.s[0]\n"
+ "fmla z17.s, z8.s, z6.s[1]\n"
+ "fmin z16.s, p1/M, z16.s, z14.s\n"
+ "fmin z17.s, p1/M, z17.s, z14.s\n"
+ "fmla z18.s, z8.s, z6.s[2]\n"
+ "fmla z19.s, z8.s, z6.s[3]\n"
+ "fmin z18.s, p1/M, z18.s, z14.s\n"
+ "fmin z19.s, p1/M, z19.s, z14.s\n"
+ "fmla z20.s, z8.s, z5.s[0]\n"
+ "fmla z21.s, z8.s, z5.s[1]\n"
+ "fmin z20.s, p1/M, z20.s, z14.s\n"
+ "fmin z21.s, p1/M, z21.s, z14.s\n"
+ "fmla z22.s, z8.s, z5.s[2]\n"
+ "fmla z23.s, z8.s, z5.s[3]\n"
+ "fmin z22.s, p1/M, z22.s, z14.s\n"
+ "fmin z23.s, p1/M, z23.s, z14.s\n"
+ "fmla z24.s, z8.s, z1.s[0]\n"
+ "fmla z25.s, z8.s, z1.s[1]\n"
+ "ldr x27, [%x[outptrs], #0x0]\n"
+ "ldr x26, [%x[outptrs], #0x8]\n"
+ "fmla z26.s, z8.s, z1.s[2]\n"
+ "fmla z27.s, z8.s, z1.s[3]\n"
+ "ldr x25, [%x[outptrs], #0x10]\n"
+ "ldr x24, [%x[outptrs], #0x18]\n"
+ "fmla z28.s, z8.s, z2.s[0]\n"
+ "fmla z29.s, z8.s, z2.s[1]\n"
+ "ldr x23, [%x[outptrs], #0x20]\n"
+ "ldr x22, [%x[outptrs], #0x28]\n"
+ "fmla z30.s, z8.s, z2.s[2]\n"
+ "fmla z31.s, z8.s, z2.s[3]\n"
+ "ldr x21, [%x[outptrs], #0x30]\n"
+ "ldr x20, [%x[outptrs], #0x38]\n"
+ "fmax z16.s, p1/M, z16.s, z15.s\n"
+ "fmax z17.s, p1/M, z17.s, z15.s\n"
+ "st1w { z16.s }, p0, [x27, x9, LSL #2]\n"
+ "ldr x27, [%x[outptrs], #0x40]\n"
+ "fmax z18.s, p1/M, z18.s, z15.s\n"
+ "fmax z19.s, p1/M, z19.s, z15.s\n"
+ "st1w { z17.s }, p0, [x26, x9, LSL #2]\n"
+ "ldr x26, [%x[outptrs], #0x48]\n"
+ "fmax z20.s, p1/M, z20.s, z15.s\n"
+ "fmax z21.s, p1/M, z21.s, z15.s\n"
+ "st1w { z18.s }, p0, [x25, x9, LSL #2]\n"
+ "ldr x25, [%x[outptrs], #0x50]\n"
+ "fmax z22.s, p1/M, z22.s, z15.s\n"
+ "fmax z23.s, p1/M, z23.s, z15.s\n"
+ "st1w { z19.s }, p0, [x24, x9, LSL #2]\n"
+ "ldr x24, [%x[outptrs], #0x58]\n"
+ "fmin z24.s, p1/M, z24.s, z14.s\n"
+ "fmin z25.s, p1/M, z25.s, z14.s\n"
+ "st1w { z20.s }, p0, [x23, x9, LSL #2]\n"
+ "ldr x23, [%x[outptrs], #0x60]\n"
+ "fmin z26.s, p1/M, z26.s, z14.s\n"
+ "fmin z27.s, p1/M, z27.s, z14.s\n"
+ "st1w { z21.s }, p0, [x22, x9, LSL #2]\n"
+ "ldr x22, [%x[outptrs], #0x68]\n"
+ "fmin z28.s, p1/M, z28.s, z14.s\n"
+ "fmin z29.s, p1/M, z29.s, z14.s\n"
+ "st1w { z22.s }, p0, [x21, x9, LSL #2]\n"
+ "ldr x21, [%x[outptrs], #0x70]\n"
+ "fmin z30.s, p1/M, z30.s, z14.s\n"
+ "fmin z31.s, p1/M, z31.s, z14.s\n"
+ "st1w { z23.s }, p0, [x20, x9, LSL #2]\n"
+ "ldr x20, [%x[outptrs], #0x78]\n"
+ "fmax z24.s, p1/M, z24.s, z15.s\n"
+ "fmax z25.s, p1/M, z25.s, z15.s\n"
+ "st1w { z24.s }, p0, [x27, x9, LSL #2]\n"
+ "fmax z26.s, p1/M, z26.s, z15.s\n"
+ "fmax z27.s, p1/M, z27.s, z15.s\n"
+ "st1w { z25.s }, p0, [x26, x9, LSL #2]\n"
+ "fmax z28.s, p1/M, z28.s, z15.s\n"
+ "fmax z29.s, p1/M, z29.s, z15.s\n"
+ "st1w { z26.s }, p0, [x25, x9, LSL #2]\n"
+ "fmax z30.s, p1/M, z30.s, z15.s\n"
+ "fmax z31.s, p1/M, z31.s, z15.s\n"
+ "st1w { z27.s }, p0, [x24, x9, LSL #2]\n"
+ "st1w { z28.s }, p0, [x23, x9, LSL #2]\n"
+ "st1w { z29.s }, p0, [x22, x9, LSL #2]\n"
+ "st1w { z30.s }, p0, [x21, x9, LSL #2]\n"
+ "st1w { z31.s }, p0, [x20, x9, LSL #2]\n"
+ "7:" // Output channel loop: Done
+ "incw x9\n"
+ "whilelt p0.s, x9, %x[n_output_channels]\n"
+ "b.any 1b\n"
+ : [weights] "+&r" (weights)
+ : [bias] "r" (bias), [inptrs] "r" (inptrs), [kernel_points] "r" ((uint64_t) kernel_points), [minmax_vals] "r" (minmax_vals), [n_output_channels] "r" ((uint64_t) n_output_channels), [outptrs] "r" (outptrs)
+ : "cc", "memory", "p0", "p1", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z10", "z11", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..04cf0d4036
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
+
+class sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_sve_s8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_sve_s8q_3x3_dot::pack_parameters(
+ args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..0cee302c56
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,497 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
+{
+ __asm__ __volatile__(
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x13, x21, [%x[inptrs], #0x30]\n"
+ "mov x20, #0x1\n"
+ "ptrue p2.b\n"
+ "ldp x12, x11, [%x[outptrs], #0x0]\n"
+ "ldp x10, x9, [%x[outptrs], #0x10]\n"
+ "orr x20, x20, #0x100\n"
+ "orr x20, x20, #0x10000\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z21.b }, p0/Z, [x26, x14]\n"
+ "dup z25.s, w20\n"
+ "mov x28, #0x0\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x25, x14]\n"
+ "zip2 z16.b, z15.b, z31.b\n"
+ "zip1 z15.b, z15.b, z31.b\n"
+ "ld1b { z29.b }, p0/Z, [x24, x14]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z30.b, z21.b, z29.b\n"
+ "zip2 z29.b, z21.b, z29.b\n"
+ "ld1b { z9.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z20.b }, p0/Z, [x22, x14]\n"
+ "zip2 z13.b, z15.b, z30.b\n"
+ "zip1 z15.b, z15.b, z30.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z5.b }, p0/Z, [x13, x14]\n"
+ "zip1 z14.b, z16.b, z29.b\n"
+ "zip2 z29.b, z16.b, z29.b\n"
+ "ld1b { z17.b }, p0/Z, [x21, x14]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z31.b, z9.b, z5.b\n"
+ "zip1 z9.b, z9.b, z5.b\n"
+ "ld1b { z18.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x26, x14]\n"
+ "zip1 z21.b, z20.b, z17.b\n"
+ "zip2 z17.b, z20.b, z17.b\n"
+ "ld1b { z6.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x24, x14]\n"
+ "zip2 z23.b, z18.b, z6.b\n"
+ "zip1 z18.b, z18.b, z6.b\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x14]\n"
+ "zip1 z24.b, z28.b, z4.b\n"
+ "zip2 z4.b, z28.b, z4.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z22.b, z2.b, z16.b\n"
+ "zip1 z2.b, z2.b, z16.b\n"
+ "zip1 z0.b, z19.b, z5.b\n"
+ "zip2 z5.b, z19.b, z5.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "zip2 z19.b, z9.b, z21.b\n"
+ "zip1 z9.b, z9.b, z21.b\n"
+ "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "zip1 z11.b, z31.b, z17.b\n"
+ "zip2 z17.b, z31.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z12.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z20.b, z23.b, z4.b\n"
+ "zip2 z4.b, z23.b, z4.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "zip2 z24.b, z2.b, z0.b\n"
+ "zip1 z2.b, z2.b, z0.b\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "zip1 z0.b, z22.b, z5.b\n"
+ "zip2 z5.b, z22.b, z5.b\n"
+ "addvl %x[params], %x[params], #4\n"
+ "mov z22.d, z10.d\n"
+ "mov z31.d, z10.d\n"
+ "mov z21.d, z10.d\n"
+ "1:" // Loop
+ "mov z30.s, #0x0\n"
+ "sdot z30.s, z25.b, z9.b\n"
+ "sdot z10.s, z26.b, z15.b\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z30.s, z25.b, z18.b\n"
+ "sdot z31.s, z26.b, z9.b\n"
+ "mov z27.s, #0x0\n"
+ "incw x14, ALL, MUL #4\n"
+ "sdot z10.s, z3.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "movprfx z28, z30\n sdot z28.s, z25.b, z2.b\n"
+ "sdot z30.s, z25.b, z15.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "sdot z27.s, z25.b, z9.b\n"
+ "sdot z31.s, z3.b, z18.b\n"
+ "sdot z10.s, z1.b, z18.b\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "sdot z22.s, z26.b, z15.b\n"
+ "sdot z21.s, z26.b, z9.b\n"
+ "sdot z27.s, z25.b, z18.b\n"
+ "sdot z31.s, z1.b, z2.b\n"
+ "ext z2.b, z2.b, z2.b, #0x1\n"
+ "sdot z22.s, z3.b, z9.b\n"
+ "sdot z21.s, z3.b, z18.b\n"
+ "ld1w { z3.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "mls z10.s, p2/M, z30.s, z8.s\n"
+ "movprfx z26, z27\n sdot z26.s, z25.b, z2.b\n"
+ "mov z9.s, #0x0\n"
+ "sdot z27.s, z25.b, z15.b\n"
+ "ld1w { z23.s }, p2/Z, [%x[params]]\n"
+ "sdot z22.s, z1.b, z18.b\n"
+ ".inst 0x04b7754a // sqrdmulh z10.s, z10.s, z23.s\n"
+ "sdot z21.s, z1.b, z2.b\n"
+ "mls z22.s, p2/M, z27.s, z8.s\n"
+ "and z18.d, z10.d, z3.d\n"
+ "mls z31.s, p2/M, z28.s, z8.s\n"
+ "mls z21.s, p2/M, z26.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ ".inst 0x04b776d6 // sqrdmulh z22.s, z22.s, z23.s\n"
+ ".inst 0x04b777ff // sqrdmulh z31.s, z31.s, z23.s\n"
+ "sdot z9.s, z25.b, z19.b\n"
+ ".inst 0x04b776b5 // sqrdmulh z21.s, z21.s, z23.s\n"
+ "sqadd z10.s, z10.s, z18.s\n"
+ ".inst 0x4482886a // srshl z10.s, p2/M, z10.s, z3.s\n"
+ "sdot z9.s, z25.b, z12.b\n"
+ "and z28.d, z22.d, z3.d\n"
+ "and z23.d, z31.d, z3.d\n"
+ "movprfx z27, z9\n sdot z27.s, z25.b, z24.b\n"
+ "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z21.d, z3.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "sdot z9.s, z25.b, z13.b\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z28.s\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ ".inst 0x44828876 // srshl z22.s, p2/M, z22.s, z3.s\n"
+ ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
+ "sqadd z21.s, z21.s, z18.s\n"
+ "add z10.s, z10.s, z16.s\n"
+ ".inst 0x44828875 // srshl z21.s, p2/M, z21.s, z3.s\n"
+ "smax z10.s, p2/M, z10.s, z7.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z21.s, p2/M, z21.s, z7.s\n"
+ "st1b { z10.s }, p0, [x12, x28]\n"
+ "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "st1b { z22.s }, p0, [x11, x28]\n"
+ "mov z26.d, z28.d\n"
+ "ld1b { z15.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z31.d, z28.d\n"
+ "sdot z31.s, z1.b, z19.b\n"
+ "ld1b { z23.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x9, x28]\n"
+ "mov z22.d, z28.d\n"
+ "sdot z28.s, z1.b, z13.b\n"
+ "sdot z28.s, z15.b, z19.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "sdot z26.s, z1.b, z13.b\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "mov z18.s, #0x0\n"
+ "sdot z22.s, z1.b, z19.b\n"
+ "sdot z18.s, z25.b, z19.b\n"
+ "incw x28\n"
+ "sdot z31.s, z15.b, z12.b\n"
+ "sdot z28.s, z23.b, z12.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z26.s, z15.b, z19.b\n"
+ "sdot z22.s, z15.b, z12.b\n"
+ "addvl %x[params], %x[params], #16\n"
+ "sdot z18.s, z25.b, z12.b\n"
+ "sdot z31.s, z23.b, z24.b\n"
+ "ext z24.b, z24.b, z24.b, #0x1\n"
+ "mls z28.s, p2/M, z9.s, z8.s\n"
+ "sdot z26.s, z23.b, z12.b\n"
+ ".inst 0x04be779c // sqrdmulh z28.s, z28.s, z30.s\n"
+ "sdot z22.s, z23.b, z24.b\n"
+ "movprfx z12, z18\n sdot z12.s, z25.b, z24.b\n"
+ "and z2.d, z28.d, z21.d\n"
+ "sdot z18.s, z25.b, z13.b\n"
+ "mls z26.s, p2/M, z18.s, z8.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ "mls z31.s, p2/M, z27.s, z8.s\n"
+ "mls z22.s, p2/M, z12.s, z8.s\n"
+ ".inst 0x04be775a // sqrdmulh z26.s, z26.s, z30.s\n"
+ ".inst 0x04be77ff // sqrdmulh z31.s, z31.s, z30.s\n"
+ ".inst 0x04be76d6 // sqrdmulh z22.s, z22.s, z30.s\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "sqadd z28.s, z28.s, z2.s\n"
+ "and z24.d, z26.d, z21.d\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ "and z23.d, z31.d, z21.d\n"
+ "and z18.d, z22.d, z21.d\n"
+ "asr z24.s, z24.s, #0x1f\n"
+ "asr z23.s, z23.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z24.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ "ld1b { z30.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z23.s\n"
+ "sqadd z22.s, z22.s, z18.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ ".inst 0x44828ab6 // srshl z22.s, p2/M, z22.s, z21.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z25.b, z11.b\n"
+ "smax z22.s, p2/M, z22.s, z7.s\n"
+ "st1b { z28.s }, p0, [x12, x28]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "st1b { z26.s }, p0, [x11, x28]\n"
+ "mov z28.d, z23.d\n"
+ "sdot z24.s, z25.b, z20.b\n"
+ "st1b { z31.s }, p0, [x10, x28]\n"
+ "mov z27.d, z23.d\n"
+ "sdot z27.s, z19.b, z11.b\n"
+ "movprfx z13, z24\n sdot z13.s, z25.b, z0.b\n"
+ "st1b { z22.s }, p0, [x9, x28]\n"
+ "mov z26.d, z23.d\n"
+ "sdot z23.s, z19.b, z14.b\n"
+ "sdot z23.s, z30.b, z11.b\n"
+ "sdot z24.s, z25.b, z14.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "sdot z28.s, z19.b, z14.b\n"
+ "ext z11.b, z11.b, z11.b, #0x1\n"
+ "mov z12.s, #0x0\n"
+ "sdot z26.s, z19.b, z11.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z12.s, z25.b, z11.b\n"
+ "sdot z27.s, z30.b, z20.b\n"
+ "incw x28\n"
+ "whilelt p0.s, x28, %x[n_channels]\n"
+ "sdot z23.s, z21.b, z20.b\n"
+ "ext z20.b, z20.b, z20.b, #0x1\n"
+ "sdot z28.s, z30.b, z11.b\n"
+ "sdot z26.s, z30.b, z20.b\n"
+ "sdot z12.s, z25.b, z20.b\n"
+ "sdot z27.s, z21.b, z0.b\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "mls z23.s, p2/M, z24.s, z8.s\n"
+ "sdot z28.s, z21.b, z20.b\n"
+ "sdot z26.s, z21.b, z0.b\n"
+ ".inst 0x04a176f7 // sqrdmulh z23.s, z23.s, z1.s\n"
+ "movprfx z19, z12\n sdot z19.s, z25.b, z0.b\n"
+ "sdot z12.s, z25.b, z14.b\n"
+ "and z18.d, z23.d, z22.d\n"
+ "mls z28.s, p2/M, z12.s, z8.s\n"
+ "mls z27.s, p2/M, z13.s, z8.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "mls z26.s, p2/M, z19.s, z8.s\n"
+ ".inst 0x04a1779c // sqrdmulh z28.s, z28.s, z1.s\n"
+ ".inst 0x04a1777b // sqrdmulh z27.s, z27.s, z1.s\n"
+ ".inst 0x04a1775a // sqrdmulh z26.s, z26.s, z1.s\n"
+ "ld1w { z2.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sqadd z23.s, z23.s, z18.s\n"
+ "and z20.d, z28.d, z22.d\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "and z19.d, z27.d, z22.d\n"
+ "and z18.d, z26.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z20.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ "ld1b { z13.b }, p2/Z, [%x[params]]\n"
+ "sqadd z27.s, z27.s, z19.s\n"
+ "sqadd z26.s, z26.s, z18.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ ".inst 0x44828ada // srshl z26.s, p2/M, z26.s, z22.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "smax z23.s, p2/M, z23.s, z7.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "smax z28.s, p2/M, z28.s, z7.s\n"
+ "smax z27.s, p2/M, z27.s, z7.s\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z25.b, z17.b\n"
+ "smax z26.s, p2/M, z26.s, z7.s\n"
+ "st1b { z23.s }, p0, [x12, x28]\n"
+ "ld1w { z1.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z21.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "st1b { z28.s }, p0, [x11, x28]\n"
+ "mov z0.d, z1.d\n"
+ "sdot z24.s, z25.b, z4.b\n"
+ "st1b { z27.s }, p0, [x10, x28]\n"
+ "mov z31.d, z1.d\n"
+ "sdot z31.s, z21.b, z17.b\n"
+ "movprfx z23, z24\n sdot z23.s, z25.b, z5.b\n"
+ "st1b { z26.s }, p0, [x9, x28]\n"
+ "mov z30.d, z1.d\n"
+ "sdot z1.s, z21.b, z29.b\n"
+ "sdot z1.s, z13.b, z17.b\n"
+ "sdot z24.s, z25.b, z29.b\n"
+ "ext z29.b, z29.b, z29.b, #0x1\n"
+ "ld1b { z20.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z0.s, z21.b, z29.b\n"
+ "ext z17.b, z17.b, z17.b, #0x1\n"
+ "mov z19.s, #0x0\n"
+ "sdot z30.s, z21.b, z17.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z19.s, z25.b, z17.b\n"
+ "sdot z31.s, z13.b, z4.b\n"
+ "incw x28\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z1.s, z20.b, z4.b\n"
+ "ext z4.b, z4.b, z4.b, #0x1\n"
+ "sdot z0.s, z13.b, z17.b\n"
+ "whilelt p0.b, x14, %x[n_channels]\n"
+ "sdot z30.s, z13.b, z4.b\n"
+ "sdot z19.s, z25.b, z4.b\n"
+ "ld1b { z13.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "sdot z31.s, z20.b, z5.b\n"
+ "ext z5.b, z5.b, z5.b, #0x1\n"
+ "mls z1.s, p2/M, z24.s, z8.s\n"
+ "ld1b { z27.b }, p0/Z, [x22, x14]\n"
+ "sdot z0.s, z20.b, z4.b\n"
+ "sdot z30.s, z20.b, z5.b\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ "ld1b { z26.b }, p0/Z, [x21, x14]\n"
+ "movprfx z18, z19\n sdot z18.s, z25.b, z5.b\n"
+ "sdot z19.s, z25.b, z29.b\n"
+ "and z11.d, z1.d, z22.d\n"
+ "ld1b { z29.b }, p0/Z, [x23, x14]\n"
+ "mls z0.s, p2/M, z19.s, z8.s\n"
+ "mls z31.s, p2/M, z23.s, z8.s\n"
+ "asr z11.s, z11.s, #0x1f\n"
+ "ld1b { z17.b }, p0/Z, [x20, x14]\n"
+ "mls z30.s, p2/M, z18.s, z8.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "sqadd z1.s, z1.s, z11.s\n"
+ "and z21.d, z0.d, z22.d\n"
+ ".inst 0x44828ac1 // srshl z1.s, p2/M, z1.s, z22.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z20.d, z31.d, z22.d\n"
+ "and z19.d, z30.d, z22.d\n"
+ "ld1b { z18.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z11.b }, p0/Z, [x22, x14]\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "ld1b { z24.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "sqadd z0.s, z0.s, z21.s\n"
+ ".inst 0x44828ac0 // srshl z0.s, p2/M, z0.s, z22.s\n"
+ "ld1b { z3.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "sqadd z31.s, z31.s, z20.s\n"
+ "sqadd z30.s, z30.s, z19.s\n"
+ ".inst 0x44828adf // srshl z31.s, p2/M, z31.s, z22.s\n"
+ ".inst 0x44828ade // srshl z30.s, p2/M, z30.s, z22.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "smax z1.s, p2/M, z1.s, z7.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "ld1b { z9.b }, p0/Z, [x24, x14]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z1.s, p2/M, z1.s, z6.s\n"
+ "smax z0.s, p2/M, z0.s, z7.s\n"
+ "st1b { z1.s }, p1, [x12, x28]\n"
+ "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+ "smax z31.s, p2/M, z31.s, z7.s\n"
+ "smax z30.s, p2/M, z30.s, z7.s\n"
+ "ld1b { z23.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+ "zip2 z20.b, z15.b, z28.b\n"
+ "zip1 z15.b, z15.b, z28.b\n"
+ "smin z0.s, p2/M, z0.s, z6.s\n"
+ "zip1 z19.b, z13.b, z29.b\n"
+ "zip2 z29.b, z13.b, z29.b\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "st1b { z0.s }, p1, [x11, x28]\n"
+ "zip2 z13.b, z15.b, z19.b\n"
+ "zip1 z15.b, z15.b, z19.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z31.s }, p1, [x10, x28]\n"
+ "zip1 z14.b, z20.b, z29.b\n"
+ "zip2 z29.b, z20.b, z29.b\n"
+ "ld1w { z10.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z30.s }, p1, [x9, x28]\n"
+ "zip2 z21.b, z9.b, z26.b\n"
+ "zip1 z9.b, z9.b, z26.b\n"
+ "incw x28\n"
+ "zip1 z20.b, z27.b, z17.b\n"
+ "zip2 z17.b, z27.b, z17.b\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "zip2 z31.b, z18.b, z24.b\n"
+ "zip1 z18.b, z18.b, z24.b\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "zip1 z27.b, z11.b, z4.b\n"
+ "zip2 z4.b, z11.b, z4.b\n"
+ "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #8\n"
+ "zip2 z30.b, z2.b, z22.b\n"
+ "zip1 z2.b, z2.b, z22.b\n"
+ "zip1 z28.b, z23.b, z5.b\n"
+ "zip2 z5.b, z23.b, z5.b\n"
+ "zip2 z19.b, z9.b, z20.b\n"
+ "zip1 z9.b, z9.b, z20.b\n"
+ "zip1 z11.b, z21.b, z17.b\n"
+ "zip2 z17.b, z21.b, z17.b\n"
+ "zip2 z12.b, z18.b, z27.b\n"
+ "zip1 z18.b, z18.b, z27.b\n"
+ "zip1 z20.b, z31.b, z4.b\n"
+ "zip2 z4.b, z31.b, z4.b\n"
+ "zip2 z24.b, z2.b, z28.b\n"
+ "zip1 z2.b, z2.b, z28.b\n"
+ "zip1 z0.b, z30.b, z5.b\n"
+ "zip2 z5.b, z30.b, z5.b\n"
+ "mov z22.d, z10.d\n"
+ "mov z31.d, z10.d\n"
+ "mov z21.d, z10.d\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..c9b4daf334
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+class sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..8ac522dc9a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// SVE implementation of the s8q NHWC 3x3 stride-1 depthwise kernel.
+// Computes a 2x2 spatial output tile, vectorised over the channel
+// dimension (two vector-lengths of 32-bit accumulators per iteration of
+// the channel loop). The body is hand-scheduled inline assembly; many SVE2
+// quantisation instructions are emitted as raw encodings via ".inst".
+//
+//   n_channels      number of depthwise channels to process
+//   inptrs          16 input-point pointers (4x4 input patch for a 3x3
+//                   kernel producing a 2x2 stride-1 output tile)
+//   weights         packed kernel weights, consumed sequentially by the asm
+//   bias            per-channel bias, loaded into the initial accumulators
+//   qp              requantisation parameters (a/b/c offsets, min/max clamp)
+//   requant_muls    per-channel requantise multipliers (sqrdmulh operand)
+//   requant_shifts  per-channel requantise shifts (srshl operand)
+//   outptrs         four output pointers, one per output tile point
+void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ // Parameter block read by the assembly via offsetof(); the layout must
+ // stay in sync with the offsetof_Params_* operands at the bottom.
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ // Permute the caller's input pointers into the order in which the
+ // assembly visits them (presumably chosen to suit its instruction
+ // schedule — the asm simply reads inptrs[] sequentially by offset).
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ // Preamble: load requantisation constants and the first set of weights,
+ // biases and inputs; "1:" is the channel loop — widening signed
+ // multiply-accumulates (smlalb/smlalt), then sqrdmulh/srshl requantise,
+ // saturating narrow, clamp, and store; loops while lanes remain (b.any).
+ __asm__ __volatile__(
+ "mov x16, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x16\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1sb { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1sb { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
+ "1:" // Loop
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1sb { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1sb { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1sb { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c11ef // ssublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1273 // ssublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c139c // ssublb z28.h, z28.b, z12.b\n"
+ "ld1sb { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ "ld1sb { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1sb { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ "ld1sb { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1063 // ssublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c13ff // ssublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
+ "inch x16\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
+ "incw x20\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z7.d, z25.d, z1.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
+ ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1sb { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1sb { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1sb { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1000 // ssublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c13bd // ssublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1084 // ssublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c11ad // ssublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1294 // ssublb z20.h, z20.b, z12.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..7a9b8a5bde
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Entry point of the assembly kernel (defined in the accompanying
+// generic.cpp). Arguments: channel count, array of input-point pointers,
+// packed weights, per-channel bias, requantisation parameters, per-channel
+// requantise multipliers, per-channel requantise shifts, output pointers.
+void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+// Strategy wrapper exposing the SVE signed-8-bit-quantized (s8q) NHWC
+// 3x3 stride-2 depthwise kernel (2x2 output tile) to the depthfirst driver.
+class sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ // Parent(output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ // The kernel accumulates across two vector-lengths of channels per pass.
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..fc9a48bb46
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ const int8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x7\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
+ "ldp x16, x15, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x7, x8\n"
+ "ldp x14, x13, [x24, #0x10]\n"
+ "whilelt p2.s, x7, x8\n"
+ "whilelt p1.s, x23, x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "mov z18.d, z8.d\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1sb { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1sb { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1sb { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "ld1sb { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1sb { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "1:" // Loop
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1sb { z17.h }, p3/Z, [x21, x7]\n"
+ "ld1sb { z29.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1231 // ssublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1sb { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a13bd // ssublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1sb { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1sb { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1sb { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ "ld1sb { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1sb { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1231 // ssublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a13bd // ssublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1sb { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1sb { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1339 // ssublb z25.h, z25.b, z26.b\n"
+ "ld1sb { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "ld1sb { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a13de // ssublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1sb { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1339 // ssublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
+ "inch x7\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
+ "mov x20, x7\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
+ "incw x20\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "whilelt p2.s, x7, x8\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
+ "whilelt p1.s, x20, x8\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
+ "whilelt p3.h, x7, x8\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1sb { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1sb { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1sb { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1sb { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ "ld1sb { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1sb { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a12b5 // ssublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a12d6 // ssublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a116b // ssublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1294 // ssublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a137b // ssublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a139c // ssublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1210 // ssublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a13ff // ssublb z31.h, z31.b, z26.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..1f8d6c5213
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
+
+// Strategy descriptor for an SVE depthwise-convolution kernel:
+//   - s8q  : signed 8-bit quantized input/output (int32 accumulators)
+//   - nhwc : channels-last data layout
+//   - 5x5 kernel, stride 1, one 2x2 output tile per kernel invocation
+//   - mla  : implemented with multiply-accumulate instructions
+// The actual computation lives in the generated assembly routine
+// sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl (declared above).
+class sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ // Static shape/stride traits describing the convolution this kernel computes.
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ // NOTE(review): Parent(2, 2, 5, 5, 1, 1) presumably encodes (output_rows,
+ // output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) — the
+ // values match the traits above; confirm against DepthwiseDepthfirstStrategy.
+ sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+ // This kernel requires the SVE (scalable vector) instruction set.
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ // Entry point handed to the depthfirst driver; points at the generated asm.
+ Parent::KernelType kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ // The kernel processes two vector-lengths of channels per accumulator pass.
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..7ff724ddd8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *const outptrs
+)
+{
+ // Argument bundle handed to the inline-asm kernel. The assembly reads the
+ // fields through offsetof(Params, ...) operands (see the asm constraint
+ // list), so the member set and their order are part of the kernel ABI —
+ // do not reorder.
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ int8_t *const *const outptrs;
+ // 36 input-row pointers = the 6x6 input patch needed for a 5x5 kernel
+ // producing a 2x2 output at stride 1.
+ const int8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const int8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ int8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ // Entries 0-13 are a fixed permutation of the caller's pointer array
+ // (presumably the order in which the assembly loop first consumes the
+ // input rows — generated alongside the asm; verify against the kernel
+ // if this table is ever touched). Entries 14-35 are copied verbatim.
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "incw x24\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
+ "mov z6.d, z14.d\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ "ld1sb { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1sb { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ld1sb { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1273 // ssublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ "1:" // Loop
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1sb { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1sb { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ "ld1sb { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e135a // ssublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1sb { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1sb { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1sb { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1sb { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1231 // ssublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1sb { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1sb { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1sb { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1sb { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1sb { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1sb { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1210 // ssublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1sb { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e135a // ssublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1sb { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1sb { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1sb { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1sb { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1231 // ssublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1sb { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a116b // ssublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1sb { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1sb { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1sb { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e137b // ssublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1sb { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e10a5 // ssublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
+ "incw x20\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z6.d, z14.d\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1sb { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1sb { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ "ld1sb { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1sb { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ "ld1sb { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1sb { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e12d6 // ssublb z22.h, z22.b, z30.b\n"
+ "ld1sb { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1sb { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e116b // ssublb z11.h, z11.b, z30.b\n"
+ "ld1sb { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1sb { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1063 // ssublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e13bd // ssublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1084 // ssublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e13ff // ssublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1000 // ssublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1273 // ssublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e139c // ssublb z28.h, z28.b, z30.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..abc09ee5a3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..274b29dcfc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x9\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ldr x23, [%x[inptrs], #0x8]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr x22, [%x[inptrs], #0x20]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "mov z13.b, #0x1\n"
+ "lsr z13.s, z13.s, #0x8\n"
+ "ld1b { z1.b }, p0/Z, [x23]\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "mov z8.d, z1.d\n"
+ "mov z27.d, z1.d\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z31.d, z1.d\n"
+ "mov z28.d, z2.d\n"
+ "ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z30.d, z2.d\n"
+ "mov z26.d, z2.d\n"
+ "ld1b { z3.b }, p0/Z, [x20]\n"
+ "mov z22.d, z4.d\n"
+ "mov z10.d, z4.d\n"
+ "ptrue p2.b\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z18.d, z4.d\n"
+ "ext z8.b, z8.b, z8.b, #0x2\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z11.s, p2/M, z11.s\n"
+ "ext z27.b, z27.b, z27.b, #0x4\n"
+ "ext z31.b, z31.b, z31.b, #0x6\n"
+ "mov x9, #0x0\n"
+ "whilelt p0.b, x9, x10\n"
+ "ext z28.b, z28.b, z28.b, #0x2\n"
+ "ext z30.b, z30.b, z30.b, #0x4\n"
+ "ld1w { z14.s }, p0/Z, [%x[params]]\n"
+ "mov x28, #0x0\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "ext z22.b, z22.b, z22.b, #0x2\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ext z10.b, z10.b, z10.b, #0x4\n"
+ "ext z18.b, z18.b, z18.b, #0x6\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "mov z21.d, z0.d\n"
+ "mov z20.d, z0.d\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "mov z19.d, z0.d\n"
+ "mov z24.d, z3.d\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #1, MUL VL]\n"
+ "mov z17.d, z3.d\n"
+ "mov z16.d, z3.d\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "ext z21.b, z21.b, z21.b, #0x2\n"
+ "ext z20.b, z20.b, z20.b, #0x4\n"
+ "addvl %x[params], %x[params], #4\n"
+ "ext z19.b, z19.b, z19.b, #0x6\n"
+ "zip1 z1.s, z1.s, z27.s\n"
+ "zip1 z8.s, z8.s, z31.s\n"
+ "zip1 z2.s, z2.s, z30.s\n"
+ "zip1 z28.s, z28.s, z26.s\n"
+ "ext z24.b, z24.b, z24.b, #0x2\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "ext z16.b, z16.b, z16.b, #0x6\n"
+ "zip1 z4.s, z4.s, z10.s\n"
+ "zip1 z22.s, z22.s, z18.s\n"
+ "zip1 z0.s, z0.s, z20.s\n"
+ "zip1 z21.s, z21.s, z19.s\n"
+ "zip1 z1.s, z1.s, z8.s\n"
+ "zip1 z2.s, z2.s, z28.s\n"
+ "zip1 z3.s, z3.s, z17.s\n"
+ "zip1 z24.s, z24.s, z16.s\n"
+ "zip1 z4.s, z4.s, z22.s\n"
+ "zip1 z0.s, z0.s, z21.s\n"
+ "mov z1.q, z1.q[0]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z3.s, z3.s, z24.s\n"
+ "mov z4.q, z4.q[0]\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "sdot z24.s, z13.b, z1.b[0]\n"
+ "mov z23.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "sdot z25.s, z13.b, z1.b[1]\n"
+ "mov z21.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "sdot z23.s, z13.b, z1.b[2]\n"
+ "mov z10.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "sdot z22.s, z13.b, z1.b[3]\n"
+ "mov z20.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "sdot z21.s, z13.b, z2.b[0]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "sdot z19.s, z13.b, z2.b[1]\n"
+ "sdot z10.s, z13.b, z2.b[2]\n"
+ "sdot z8.s, z13.b, z2.b[3]\n"
+ "mov z0.q, z0.q[0]\n"
+ "sdot z20.s, z13.b, z4.b[0]\n"
+ "sdot z18.s, z13.b, z4.b[1]\n"
+ "mov z3.q, z3.q[0]\n"
+ "sdot z17.s, z13.b, z4.b[2]\n"
+ "sdot z16.s, z13.b, z4.b[3]\n"
+ "mov z31.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "sdot z31.s, z13.b, z0.b[0]\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "sdot z30.s, z13.b, z0.b[1]\n"
+ "mov z29.s, #0x0\n"
+ "sdot z26.s, z13.b, z0.b[2]\n"
+ "sdot z27.s, z13.b, z0.b[3]\n"
+ "sdot z28.s, z13.b, z3.b[0]\n"
+ "sdot z29.s, z13.b, z3.b[1]\n"
+ "add z24.s, z24.s, z21.s\n"
+ "add z25.s, z25.s, z19.s\n"
+ "add z23.s, z23.s, z10.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "add z21.s, z20.s, z21.s\n"
+ "mov z20.s, #0x0\n"
+ "sdot z20.s, z13.b, z3.b[2]\n"
+ "add z19.s, z18.s, z19.s\n"
+ "mov z18.s, #0x0\n"
+ "sdot z18.s, z13.b, z3.b[3]\n"
+ "add z17.s, z17.s, z10.s\n"
+ "add z16.s, z16.s, z8.s\n"
+ "add z24.s, z24.s, z31.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "mul z24.s, p2/M, z24.s, z11.s\n"
+ "mul z25.s, p2/M, z25.s, z11.s\n"
+ "add z26.s, z23.s, z26.s\n"
+ "add z27.s, z22.s, z27.s\n"
+ "mul z26.s, p2/M, z26.s, z11.s\n"
+ "mul z27.s, p2/M, z27.s, z11.s\n"
+ "add z28.s, z21.s, z28.s\n"
+ "add z29.s, z19.s, z29.s\n"
+ "mul z28.s, p2/M, z28.s, z11.s\n"
+ "mul z29.s, p2/M, z29.s, z11.s\n"
+ "add z30.s, z17.s, z20.s\n"
+ "add z31.s, z16.s, z18.s\n"
+ "mul z30.s, p2/M, z30.s, z11.s\n"
+ "mul z31.s, p2/M, z31.s, z11.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
+ "1:" // Loop
+ "sdot z24.s, z5.b, z0.b[0]\n"
+ "sdot z25.s, z5.b, z0.b[1]\n"
+ "ld1w { z8.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z26.s, z5.b, z0.b[2]\n"
+ "sdot z27.s, z5.b, z0.b[3]\n"
+ "incb x9\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z24.s, z6.b, z1.b[0]\n"
+ "sdot z25.s, z6.b, z1.b[1]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "sdot z26.s, z6.b, z1.b[2]\n"
+ "sdot z27.s, z6.b, z1.b[3]\n"
+ "sdot z28.s, z5.b, z2.b[0]\n"
+ "sdot z29.s, z5.b, z2.b[1]\n"
+ "sdot z30.s, z5.b, z2.b[2]\n"
+ "sdot z31.s, z5.b, z2.b[3]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[0]\n"
+ "sdot z25.s, z7.b, z2.b[1]\n"
+ ".inst 0x04a87718 // sqrdmulh z24.s, z24.s, z8.s\n"
+ "sdot z26.s, z7.b, z2.b[2]\n"
+ "sdot z27.s, z7.b, z2.b[3]\n"
+ ".inst 0x04a87739 // sqrdmulh z25.s, z25.s, z8.s\n"
+ "sdot z28.s, z6.b, z3.b[0]\n"
+ "sdot z29.s, z6.b, z3.b[1]\n"
+ ".inst 0x04a8775a // sqrdmulh z26.s, z26.s, z8.s\n"
+ "sdot z30.s, z6.b, z3.b[2]\n"
+ "sdot z31.s, z6.b, z3.b[3]\n"
+ ".inst 0x04a8777b // sqrdmulh z27.s, z27.s, z8.s\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z28.s, z7.b, z4.b[0]\n"
+ "sdot z29.s, z7.b, z4.b[1]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "sdot z30.s, z7.b, z4.b[2]\n"
+ "sdot z31.s, z7.b, z4.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #5, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
+ "addvl %x[params], %x[params], #6\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
+ ".inst 0x04a877bd // sqrdmulh z29.s, z29.s, z8.s\n"
+ ".inst 0x04a877de // sqrdmulh z30.s, z30.s, z8.s\n"
+ ".inst 0x04a877ff // sqrdmulh z31.s, z31.s, z8.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z9.s\n"
+ "add z25.s, z25.s, z9.s\n"
+ "smin z24.s, p2/M, z24.s, z12.s\n"
+ "smin z25.s, p2/M, z25.s, z12.s\n"
+ "add z26.s, z26.s, z9.s\n"
+ "add z27.s, z27.s, z9.s\n"
+ "smin z26.s, p2/M, z26.s, z12.s\n"
+ "smin z27.s, p2/M, z27.s, z12.s\n"
+ "add z28.s, z28.s, z9.s\n"
+ "add z29.s, z29.s, z9.s\n"
+ "smin z28.s, p2/M, z28.s, z12.s\n"
+ "smin z29.s, p2/M, z29.s, z12.s\n"
+ "add z30.s, z30.s, z9.s\n"
+ "add z31.s, z31.s, z9.s\n"
+ "smin z30.s, p2/M, z30.s, z12.s\n"
+ "smin z31.s, p2/M, z31.s, z12.s\n"
+ "smax z24.s, p2/M, z24.s, z15.s\n"
+ "smax z25.s, p2/M, z25.s, z15.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z15.s\n"
+ "smax z27.s, p2/M, z27.s, z15.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z15.s\n"
+ "smax z29.s, p2/M, z29.s, z15.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z15.s\n"
+ "smax z31.s, p2/M, z31.s, z15.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..701948f264
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..a3b2b429c0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x6\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ldr x22, [%x[inptrs], #0x18]\n"
+ "ldr x21, [%x[inptrs], #0x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1b { z3.b }, p0/Z, [x22]\n"
+ "mov z23.d, z3.d\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
+ "ld1b { z4.b }, p0/Z, [x21]\n"
+ "ldr x24, [%x[inptrs], #0x8]\n"
+ "mov z18.d, z4.d\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "ldr x23, [%x[inptrs], #0x28]\n"
+ "mov z15.d, z2.d\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "ldr x22, [%x[inptrs], #0x30]\n"
+ "ldr x21, [%x[inptrs], #0x38]\n"
+ "zip1 z3.d, z3.d, z23.d\n"
+ "zip1 z4.d, z4.d, z18.d\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1b { z1.b }, p0/Z, [x24]\n"
+ "mov z19.d, z1.d\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "ld1b { z5.b }, p0/Z, [x23]\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ "mov z18.d, z5.d\n"
+ "mov z22.d, z6.d\n"
+ "ld1b { z7.b }, p0/Z, [x21]\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "mov z8.d, z7.d\n"
+ "zip1 z2.d, z2.d, z15.d\n"
+ "mov z3.q, z3.q[0]\n"
+ "mov z4.q, z4.q[0]\n"
+ "ptrue p2.b\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ext z22.b, z22.b, z22.b, #0x1\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z23.s, p2/M, z23.s\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "mov z28.b, #0x1\n"
+ "mov x9, #0x0\n"
+ "whilelt p0.b, x9, x10\n"
+ "mov z25.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "sdot z25.s, z28.b, z3.b[0]\n"
+ "ld1w { z12.s }, p0/Z, [%x[params]]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "sdot z24.s, z28.b, z3.b[2]\n"
+ "mov x28, #0x0\n"
+ "mov z27.d, z0.d\n"
+ "sdot z17.s, z28.b, z4.b[0]\n"
+ "sdot z16.s, z28.b, z4.b[2]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "zip1 z1.d, z1.d, z19.d\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z5.d, z5.d, z18.d\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z6.d, z6.d, z22.d\n"
+ "zip1 z7.d, z7.d, z8.d\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "sdot z30.s, z28.b, z2.b[0]\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #1, MUL VL]\n"
+ "mov z29.s, #0x1\n"
+ "sdot z31.s, z28.b, z2.b[2]\n"
+ "sdot z25.s, z29.b, z3.b[1]\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z0.d, z0.d, z27.d\n"
+ "mov z1.q, z1.q[0]\n"
+ "sdot z24.s, z29.b, z3.b[3]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "mov z5.q, z5.q[0]\n"
+ "mov z6.q, z6.q[0]\n"
+ "sdot z17.s, z29.b, z4.b[1]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "mov z7.q, z7.q[0]\n"
+ "mov z22.s, #0x0\n"
+ "sdot z16.s, z29.b, z4.b[3]\n"
+ "addvl %x[params], %x[params], #5\n"
+ "mov z21.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "sdot z22.s, z28.b, z1.b[0]\n"
+ "mov z27.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "sdot z21.s, z28.b, z1.b[2]\n"
+ "mov z19.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "sdot z26.s, z28.b, z5.b[0]\n"
+ "sdot z27.s, z28.b, z5.b[2]\n"
+ "sdot z20.s, z28.b, z6.b[0]\n"
+ "mov z0.q, z0.q[0]\n"
+ "sdot z19.s, z28.b, z6.b[2]\n"
+ "sdot z18.s, z28.b, z7.b[0]\n"
+ "add z17.s, z25.s, z17.s\n"
+ "mov z25.s, #0x0\n"
+ "sdot z25.s, z28.b, z7.b[2]\n"
+ "sdot z30.s, z29.b, z2.b[1]\n"
+ "sdot z31.s, z29.b, z2.b[3]\n"
+ "add z16.s, z24.s, z16.s\n"
+ "sdot z22.s, z29.b, z1.b[1]\n"
+ "mov z24.s, #0x0\n"
+ "sdot z24.s, z28.b, z0.b[0]\n"
+ "sdot z21.s, z29.b, z1.b[3]\n"
+ "sdot z26.s, z29.b, z5.b[1]\n"
+ "sdot z27.s, z29.b, z5.b[3]\n"
+ "add z30.s, z30.s, z17.s\n"
+ "sdot z20.s, z29.b, z6.b[1]\n"
+ "sdot z19.s, z29.b, z6.b[3]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "sdot z18.s, z29.b, z7.b[1]\n"
+ "sdot z25.s, z29.b, z7.b[3]\n"
+ "add z22.s, z22.s, z30.s\n"
+ "sdot z24.s, z29.b, z0.b[1]\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z20.s, z26.s, z20.s\n"
+ "add z19.s, z27.s, z19.s\n"
+ "add z18.s, z18.s, z17.s\n"
+ "mov z17.s, #0x0\n"
+ "sdot z17.s, z28.b, z0.b[2]\n"
+ "sdot z17.s, z29.b, z0.b[3]\n"
+ "add z16.s, z25.s, z16.s\n"
+ "add z24.s, z22.s, z24.s\n"
+ "add z25.s, z21.s, z17.s\n"
+ "mul z24.s, p2/M, z24.s, z23.s\n"
+ "mul z25.s, p2/M, z25.s, z23.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "add z27.s, z27.s, z21.s\n"
+ "mul z26.s, p2/M, z26.s, z23.s\n"
+ "mul z27.s, p2/M, z27.s, z23.s\n"
+ "add z28.s, z20.s, z30.s\n"
+ "add z29.s, z19.s, z31.s\n"
+ "mul z28.s, p2/M, z28.s, z23.s\n"
+ "mul z29.s, p2/M, z29.s, z23.s\n"
+ "add z30.s, z20.s, z18.s\n"
+ "add z31.s, z19.s, z16.s\n"
+ "mul z30.s, p2/M, z30.s, z23.s\n"
+ "mul z31.s, p2/M, z31.s, z23.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z12.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "add z26.s, z26.s, z12.s\n"
+ "add z27.s, z27.s, z12.s\n"
+ "add z28.s, z28.s, z12.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "add z30.s, z30.s, z12.s\n"
+ "add z31.s, z31.s, z12.s\n"
+ "1:" // Loop
+ "sdot z24.s, z8.b, z0.b[0]\n"
+ "sdot z25.s, z8.b, z0.b[2]\n"
+ "ld1w { z12.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "sdot z26.s, z8.b, z1.b[0]\n"
+ "sdot z27.s, z8.b, z1.b[2]\n"
+ "incb x9\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "sdot z24.s, z9.b, z0.b[1]\n"
+ "sdot z25.s, z9.b, z0.b[3]\n"
+ "whilelt p0.b, x9, x10\n"
+ "sdot z26.s, z9.b, z1.b[1]\n"
+ "sdot z27.s, z9.b, z1.b[3]\n"
+ "sdot z28.s, z8.b, z2.b[0]\n"
+ "sdot z29.s, z8.b, z2.b[2]\n"
+ "sdot z30.s, z8.b, z3.b[0]\n"
+ "sdot z31.s, z8.b, z3.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
+ "sdot z24.s, z10.b, z1.b[0]\n"
+ "sdot z25.s, z10.b, z1.b[2]\n"
+ "sdot z26.s, z10.b, z2.b[0]\n"
+ "sdot z27.s, z10.b, z2.b[2]\n"
+ "sdot z28.s, z9.b, z2.b[1]\n"
+ "sdot z29.s, z9.b, z2.b[3]\n"
+ "sdot z30.s, z9.b, z3.b[1]\n"
+ "sdot z31.s, z9.b, z3.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z24.s, z11.b, z1.b[1]\n"
+ "sdot z25.s, z11.b, z1.b[3]\n"
+ "sdot z26.s, z11.b, z2.b[1]\n"
+ "sdot z27.s, z11.b, z2.b[3]\n"
+ "sdot z28.s, z10.b, z3.b[0]\n"
+ "sdot z29.s, z10.b, z3.b[2]\n"
+ "sdot z30.s, z10.b, z4.b[0]\n"
+ "sdot z31.s, z10.b, z4.b[2]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "sdot z24.s, z17.b, z2.b[0]\n"
+ "sdot z25.s, z17.b, z2.b[2]\n"
+ "sdot z26.s, z17.b, z3.b[0]\n"
+ "sdot z27.s, z17.b, z3.b[2]\n"
+ "sdot z28.s, z11.b, z3.b[1]\n"
+ "sdot z29.s, z11.b, z3.b[3]\n"
+ "sdot z30.s, z11.b, z4.b[1]\n"
+ "sdot z31.s, z11.b, z4.b[3]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z24.s, z16.b, z2.b[1]\n"
+ "sdot z25.s, z16.b, z2.b[3]\n"
+ "sdot z26.s, z16.b, z3.b[1]\n"
+ "sdot z27.s, z16.b, z3.b[3]\n"
+ "sdot z28.s, z17.b, z4.b[0]\n"
+ "sdot z29.s, z17.b, z4.b[2]\n"
+ "sdot z30.s, z17.b, z5.b[0]\n"
+ "sdot z31.s, z17.b, z5.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "sdot z24.s, z19.b, z3.b[0]\n"
+ "sdot z25.s, z19.b, z3.b[2]\n"
+ "sdot z26.s, z19.b, z4.b[0]\n"
+ "sdot z27.s, z19.b, z4.b[2]\n"
+ "sdot z28.s, z16.b, z4.b[1]\n"
+ "sdot z29.s, z16.b, z4.b[3]\n"
+ "sdot z30.s, z16.b, z5.b[1]\n"
+ "sdot z31.s, z16.b, z5.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "sdot z24.s, z18.b, z3.b[1]\n"
+ "sdot z25.s, z18.b, z3.b[3]\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
+ "sdot z26.s, z18.b, z4.b[1]\n"
+ "sdot z27.s, z18.b, z4.b[3]\n"
+ "sdot z28.s, z19.b, z5.b[0]\n"
+ "sdot z29.s, z19.b, z5.b[2]\n"
+ "sdot z30.s, z19.b, z6.b[0]\n"
+ "sdot z31.s, z19.b, z6.b[2]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
+ "sdot z24.s, z17.b, z4.b[0]\n"
+ "sdot z25.s, z17.b, z4.b[2]\n"
+ "sdot z26.s, z17.b, z5.b[0]\n"
+ "sdot z27.s, z17.b, z5.b[2]\n"
+ "sdot z28.s, z18.b, z5.b[1]\n"
+ "sdot z29.s, z18.b, z5.b[3]\n"
+ "sdot z30.s, z18.b, z6.b[1]\n"
+ "sdot z31.s, z18.b, z6.b[3]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #-4, MUL VL]\n"
+ "sdot z24.s, z16.b, z4.b[1]\n"
+ "sdot z25.s, z16.b, z4.b[3]\n"
+ ".inst 0x04ac7718 // sqrdmulh z24.s, z24.s, z12.s\n"
+ "sdot z26.s, z16.b, z5.b[1]\n"
+ "sdot z27.s, z16.b, z5.b[3]\n"
+ ".inst 0x04ac7739 // sqrdmulh z25.s, z25.s, z12.s\n"
+ "sdot z28.s, z17.b, z6.b[0]\n"
+ "sdot z29.s, z17.b, z6.b[2]\n"
+ ".inst 0x04ac775a // sqrdmulh z26.s, z26.s, z12.s\n"
+ "sdot z30.s, z17.b, z7.b[0]\n"
+ "sdot z31.s, z17.b, z7.b[2]\n"
+ ".inst 0x04ac777b // sqrdmulh z27.s, z27.s, z12.s\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #-7, MUL VL]\n"
+ "sdot z28.s, z16.b, z6.b[1]\n"
+ "sdot z29.s, z16.b, z6.b[3]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "sdot z30.s, z16.b, z7.b[1]\n"
+ "sdot z31.s, z16.b, z7.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #-6, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
+ "addvl %x[params], %x[params], #-3\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04ac779c // sqrdmulh z28.s, z28.s, z12.s\n"
+ ".inst 0x04ac77bd // sqrdmulh z29.s, z29.s, z12.s\n"
+ ".inst 0x04ac77de // sqrdmulh z30.s, z30.s, z12.s\n"
+ ".inst 0x04ac77ff // sqrdmulh z31.s, z31.s, z12.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "smin z24.s, p2/M, z24.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "smin z26.s, p2/M, z26.s, z15.s\n"
+ "smin z27.s, p2/M, z27.s, z15.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "smin z28.s, p2/M, z28.s, z15.s\n"
+ "smin z29.s, p2/M, z29.s, z15.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
+ "smin z30.s, p2/M, z30.s, z15.s\n"
+ "smin z31.s, p2/M, z31.s, z15.s\n"
+ "smax z24.s, p2/M, z24.s, z14.s\n"
+ "smax z25.s, p2/M, z25.s, z14.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z14.s\n"
+ "smax z27.s, p2/M, z27.s, z14.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z14.s\n"
+ "smax z29.s, p2/M, z29.s, z14.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z14.s\n"
+ "smax z31.s, p2/M, z31.s, z14.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..6799b10ed9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
+
+class sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_sve_s8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_sve_s8q_3x3_dot::pack_parameters(
+ args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d9c8644fc4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,436 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const int8_t *const *const inptrs, const int8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, int8_t *const *const outptrs)
+{
+ __asm__ __volatile__(
+ "mov x13, #0x0\n"
+ "whilelt p0.b, x13, %x[n_channels]\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x24, [%x[inptrs], #0x10]\n"
+ "ldp x23, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "ptrue p2.b\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [%x[outptrs], #0x0]\n"
+ "ldp x9, x28, [%x[outptrs], #0x10]\n"
+ "ld1b { z15.b }, p0/Z, [x27, x13]\n"
+ "ld1b { z18.b }, p0/Z, [x26, x13]\n"
+ "ldp x27, x26, [%x[inptrs], #0x40]\n"
+ "ld1b { z16.b }, p0/Z, [x25, x13]\n"
+ "zip2 z17.b, z15.b, z16.b\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "ld1b { z14.b }, p0/Z, [x24, x13]\n"
+ "ldp x25, x24, [%x[inptrs], #0x50]\n"
+ "zip1 z16.b, z18.b, z14.b\n"
+ "zip2 z14.b, z18.b, z14.b\n"
+ "ld1b { z13.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z18.b }, p0/Z, [x22, x13]\n"
+ "zip2 z12.b, z15.b, z16.b\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "ld1b { z16.b }, p0/Z, [x21, x13]\n"
+ "zip1 z11.b, z17.b, z14.b\n"
+ "zip2 z14.b, z17.b, z14.b\n"
+ "ld1b { z10.b }, p0/Z, [x20, x13]\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "zip2 z22.b, z13.b, z16.b\n"
+ "zip1 z13.b, z13.b, z16.b\n"
+ "ld1b { z9.b }, p0/Z, [x27, x13]\n"
+ "ld1b { z17.b }, p0/Z, [x26, x13]\n"
+ "zip1 z21.b, z18.b, z10.b\n"
+ "zip2 z10.b, z18.b, z10.b\n"
+ "ld1b { z16.b }, p0/Z, [x25, x13]\n"
+ "ld1b { z8.b }, p0/Z, [x24, x13]\n"
+ "zip2 z20.b, z9.b, z16.b\n"
+ "zip1 z9.b, z9.b, z16.b\n"
+ "ld1b { z7.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x13]\n"
+ "zip1 z18.b, z17.b, z8.b\n"
+ "zip2 z8.b, z17.b, z8.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z6.b }, p0/Z, [x20, x13]\n"
+ "zip2 z17.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "zip1 z16.b, z19.b, z6.b\n"
+ "zip2 z6.b, z19.b, z6.b\n"
+ "ld1w { z5.s }, p2/Z, [%x[params]]\n"
+ "ld1rw { z4.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z3.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1rw { z2.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip2 z1.b, z13.b, z21.b\n"
+ "zip1 z13.b, z13.b, z21.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "zip1 z0.b, z22.b, z10.b\n"
+ "zip2 z10.b, z22.b, z10.b\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip2 z31.b, z9.b, z18.b\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z30.b, z20.b, z8.b\n"
+ "zip2 z8.b, z20.b, z8.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "zip2 z27.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "addvl %x[params], %x[params], #4\n"
+ "zip1 z25.b, z17.b, z6.b\n"
+ "zip2 z6.b, z17.b, z6.b\n"
+ "mov z24.d, z5.d\n"
+ "mov z22.d, z5.d\n"
+ "mov z21.d, z5.d\n"
+ "1:" // Loop
+ "sdot z5.s, z29.b, z15.b\n"
+ "sdot z22.s, z29.b, z13.b\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z5.s, z28.b, z13.b\n"
+ "ext z13.b, z13.b, z13.b, #0x1\n"
+ "sdot z24.s, z29.b, z15.b\n"
+ "ld1w { z17.s }, p2/Z, [%x[params]]\n"
+ "sdot z21.s, z29.b, z13.b\n"
+ "sdot z22.s, z28.b, z9.b\n"
+ "incw x13, ALL, MUL #4\n"
+ "sdot z5.s, z26.b, z9.b\n"
+ "ext z9.b, z9.b, z9.b, #0x1\n"
+ "sdot z24.s, z28.b, z13.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "sdot z21.s, z28.b, z9.b\n"
+ "sdot z22.s, z26.b, z7.b\n"
+ "ext z7.b, z7.b, z7.b, #0x1\n"
+ ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
+ "sdot z24.s, z26.b, z9.b\n"
+ "sdot z21.s, z26.b, z7.b\n"
+ "and z16.d, z5.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b17718 // sqrdmulh z24.s, z24.s, z17.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ "sqadd z5.s, z5.s, z16.s\n"
+ ".inst 0x44828a85 // srshl z5.s, p2/M, z5.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z5.s, z5.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z5.s, p2/M, z5.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z5.s, p2/M, z5.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z5.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z24.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z22.d, z23.d\n"
+ "sdot z22.s, z18.b, z1.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z21.d, z23.d\n"
+ "sdot z23.s, z18.b, z12.b\n"
+ "sdot z23.s, z17.b, z1.b\n"
+ "ext z12.b, z12.b, z12.b, #0x1\n"
+ "ext z1.b, z1.b, z1.b, #0x1\n"
+ "sdot z24.s, z18.b, z12.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "sdot z21.s, z18.b, z1.b\n"
+ "sdot z22.s, z17.b, z31.b\n"
+ "incw x12\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z31.b\n"
+ "ext z31.b, z31.b, z31.b, #0x1\n"
+ "sdot z24.s, z17.b, z1.b\n"
+ "addvl %x[params], %x[params], #16\n"
+ "sdot z21.s, z17.b, z31.b\n"
+ "sdot z22.s, z16.b, z27.b\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z24.s, z16.b, z31.b\n"
+ "sdot z21.s, z16.b, z27.b\n"
+ "and z16.d, z23.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b376d6 // sqrdmulh z22.s, z22.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828a97 // srshl z23.s, p2/M, z23.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z23.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z24.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z22.d, z23.d\n"
+ "sdot z22.s, z18.b, z0.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z21.d, z23.d\n"
+ "sdot z23.s, z18.b, z11.b\n"
+ "sdot z23.s, z17.b, z0.b\n"
+ "ext z11.b, z11.b, z11.b, #0x1\n"
+ "ext z0.b, z0.b, z0.b, #0x1\n"
+ "sdot z24.s, z18.b, z11.b\n"
+ "ld1w { z20.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+ "sdot z21.s, z18.b, z0.b\n"
+ "sdot z22.s, z17.b, z30.b\n"
+ "incw x12\n"
+ "whilelt p0.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z30.b\n"
+ "ext z30.b, z30.b, z30.b, #0x1\n"
+ "sdot z24.s, z17.b, z0.b\n"
+ "sdot z21.s, z17.b, z30.b\n"
+ "sdot z22.s, z16.b, z25.b\n"
+ "ext z25.b, z25.b, z25.b, #0x1\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z24.s, z16.b, z30.b\n"
+ "sdot z21.s, z16.b, z25.b\n"
+ "and z16.d, z23.d, z20.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04b37718 // sqrdmulh z24.s, z24.s, z19.s\n"
+ ".inst 0x04b376d6 // sqrdmulh z22.s, z22.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828a97 // srshl z23.s, p2/M, z23.s, z20.s\n"
+ "ld1w { z19.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "and z18.d, z24.d, z20.d\n"
+ "and z17.d, z22.d, z20.d\n"
+ "and z16.d, z21.d, z20.d\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z18.s\n"
+ "sqadd z22.s, z22.s, z17.s\n"
+ ".inst 0x44828a98 // srshl z24.s, p2/M, z24.s, z20.s\n"
+ ".inst 0x44828a96 // srshl z22.s, p2/M, z22.s, z20.s\n"
+ "sqadd z21.s, z21.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828a95 // srshl z21.s, p2/M, z21.s, z20.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z24.s, z24.s, z2.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z24.s, p2/M, z24.s, z4.s\n"
+ "add z21.s, z21.s, z2.s\n"
+ "smax z22.s, p2/M, z22.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z4.s\n"
+ "st1b { z23.s }, p0, [x11, x12]\n"
+ "ld1w { z23.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+ "smin z24.s, p2/M, z24.s, z3.s\n"
+ "smin z22.s, p2/M, z22.s, z3.s\n"
+ "smin z21.s, p2/M, z21.s, z3.s\n"
+ "st1b { z24.s }, p0, [x10, x12]\n"
+ "mov z29.d, z23.d\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
+ "st1b { z22.s }, p0, [x9, x12]\n"
+ "mov z28.d, z23.d\n"
+ "sdot z28.s, z18.b, z10.b\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "st1b { z21.s }, p0, [x28, x12]\n"
+ "mov z27.d, z23.d\n"
+ "sdot z23.s, z18.b, z14.b\n"
+ "sdot z23.s, z17.b, z10.b\n"
+ "ext z14.b, z14.b, z14.b, #0x1\n"
+ "ext z10.b, z10.b, z10.b, #0x1\n"
+ "sdot z29.s, z18.b, z14.b\n"
+ "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "sdot z27.s, z18.b, z10.b\n"
+ "sdot z28.s, z17.b, z8.b\n"
+ "incw x12\n"
+ "whilelt p1.s, x12, %x[n_channels]\n"
+ "sdot z23.s, z16.b, z8.b\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "sdot z29.s, z17.b, z10.b\n"
+ "whilelt p0.b, x13, %x[n_channels]\n"
+ "sdot z27.s, z17.b, z8.b\n"
+ "sdot z28.s, z16.b, z6.b\n"
+ "ext z6.b, z6.b, z6.b, #0x1\n"
+ "ld1b { z26.b }, p0/Z, [x26, x13]\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ "sdot z29.s, z16.b, z8.b\n"
+ "sdot z27.s, z16.b, z6.b\n"
+ "ld1b { z21.b }, p0/Z, [x25, x13]\n"
+ "and z16.d, z23.d, z22.d\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "ld1b { z14.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z25.b }, p0/Z, [x22, x13]\n"
+ ".inst 0x04b377bd // sqrdmulh z29.s, z29.s, z19.s\n"
+ ".inst 0x04b3779c // sqrdmulh z28.s, z28.s, z19.s\n"
+ "ld1b { z20.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z10.b }, p0/Z, [x20, x13]\n"
+ ".inst 0x04b3777b // sqrdmulh z27.s, z27.s, z19.s\n"
+ "sqadd z23.s, z23.s, z16.s\n"
+ ".inst 0x44828ad7 // srshl z23.s, p2/M, z23.s, z22.s\n"
+ "ld1b { z15.b }, p0/Z, [x27, x13]\n"
+ "and z19.d, z29.d, z22.d\n"
+ "and z17.d, z28.d, z22.d\n"
+ "ldp x23, x22, [%x[inptrs], #0x40]\n"
+ "ldp x21, x20, [%x[inptrs], #0x50]\n"
+ "and z16.d, z27.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "ld1b { z9.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x13]\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "ld1b { z18.b }, p0/Z, [x21, x13]\n"
+ "ld1b { z8.b }, p0/Z, [x20, x13]\n"
+ "sqadd z29.s, z29.s, z19.s\n"
+ "sqadd z28.s, z28.s, z17.s\n"
+ ".inst 0x44828add // srshl z29.s, p2/M, z29.s, z22.s\n"
+ ".inst 0x44828adc // srshl z28.s, p2/M, z28.s, z22.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ "add z23.s, z23.s, z2.s\n"
+ ".inst 0x44828adb // srshl z27.s, p2/M, z27.s, z22.s\n"
+ "smax z23.s, p2/M, z23.s, z4.s\n"
+ "add z29.s, z29.s, z2.s\n"
+ "add z28.s, z28.s, z2.s\n"
+ "ld1b { z13.b }, p0/Z, [x24, x13]\n"
+ "ldp x23, x22, [%x[inptrs], #0x60]\n"
+ "add z27.s, z27.s, z2.s\n"
+ "ldp x21, x20, [%x[inptrs], #0x70]\n"
+ "smin z23.s, p2/M, z23.s, z3.s\n"
+ "smax z29.s, p2/M, z29.s, z4.s\n"
+ "smax z28.s, p2/M, z28.s, z4.s\n"
+ "smax z27.s, p2/M, z27.s, z4.s\n"
+ "st1b { z23.s }, p1, [x11, x12]\n"
+ "ld1b { z7.b }, p0/Z, [x23, x13]\n"
+ "ld1b { z23.b }, p0/Z, [x22, x13]\n"
+ "ld1b { z22.b }, p0/Z, [x21, x13]\n"
+ "zip2 z17.b, z15.b, z21.b\n"
+ "zip1 z15.b, z15.b, z21.b\n"
+ "ld1b { z6.b }, p0/Z, [x20, x13]\n"
+ "zip1 z16.b, z26.b, z14.b\n"
+ "zip2 z14.b, z26.b, z14.b\n"
+ "smin z29.s, p2/M, z29.s, z3.s\n"
+ "smin z28.s, p2/M, z28.s, z3.s\n"
+ "smin z27.s, p2/M, z27.s, z3.s\n"
+ "st1b { z29.s }, p1, [x10, x12]\n"
+ "zip2 z12.b, z15.b, z16.b\n"
+ "st1b { z28.s }, p1, [x9, x12]\n"
+ "zip1 z15.b, z15.b, z16.b\n"
+ "zip1 z11.b, z17.b, z14.b\n"
+ "ldp x27, x26, [%x[inptrs], #0x0]\n"
+ "st1b { z27.s }, p1, [x28, x12]\n"
+ "zip2 z14.b, z17.b, z14.b\n"
+ "zip2 z21.b, z13.b, z20.b\n"
+ "ld1w { z5.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "zip1 z13.b, z13.b, z20.b\n"
+ "zip1 z20.b, z25.b, z10.b\n"
+ "incw x12\n"
+ "ldp x25, x23, [%x[inptrs], #0x10]\n"
+ "zip2 z10.b, z25.b, z10.b\n"
+ "zip2 z19.b, z9.b, z18.b\n"
+ "ldp x24, x22, [%x[inptrs], #0x20]\n"
+ "ldp x21, x20, [%x[inptrs], #0x30]\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z18.b, z24.b, z8.b\n"
+ "ld1b { z29.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "ld1b { z28.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "zip2 z8.b, z24.b, z8.b\n"
+ "zip2 z17.b, z7.b, z22.b\n"
+ "ld1b { z26.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "addvl %x[params], %x[params], #8\n"
+ "zip1 z7.b, z7.b, z22.b\n"
+ "zip1 z16.b, z23.b, z6.b\n"
+ "zip2 z6.b, z23.b, z6.b\n"
+ "zip2 z1.b, z13.b, z20.b\n"
+ "zip1 z13.b, z13.b, z20.b\n"
+ "zip1 z0.b, z21.b, z10.b\n"
+ "zip2 z10.b, z21.b, z10.b\n"
+ "zip2 z31.b, z9.b, z18.b\n"
+ "zip1 z9.b, z9.b, z18.b\n"
+ "zip1 z30.b, z19.b, z8.b\n"
+ "zip2 z8.b, z19.b, z8.b\n"
+ "zip2 z27.b, z7.b, z16.b\n"
+ "zip1 z7.b, z7.b, z16.b\n"
+ "zip1 z25.b, z17.b, z6.b\n"
+ "zip2 z6.b, z17.b, z6.b\n"
+ "mov z24.d, z5.d\n"
+ "mov z22.d, z5.d\n"
+ "mov z21.d, z5.d\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..6b006e8d51
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *);
+
+/** SVE depthwise strategy: 3x3 kernel, stride 1, 2x2 output tile, NHWC
+ * layout, quantized uint8 in/out (u8q), implemented with dot-product (UDOT)
+ * arithmetic rather than plain multiply-accumulate.
+ *
+ * The dot-product kernel consumes weights and biases in an interleaved
+ * layout, so this strategy overrides get_storage_size() and
+ * pack_parameters() to delegate to interleave_sve_u8q_3x3_dot.
+ */
+class sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  // Shorthand for the base strategy (input, weight, output, accumulator types).
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Constructor arguments appear to be (output rows, output cols, kernel
+  // rows, kernel cols, stride rows, stride cols), matching the constants
+  // above and the "output2x2" in the class name -- confirm against the
+  // Parent constructor before modifying.
+  sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+  // Assembly kernel entry point (implemented in the accompanying generic.cpp).
+  Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  /** Bytes required for the interleaved weight/bias buffer for @p args. */
+  size_t get_storage_size(const DepthwiseArgs &args) const override
+  {
+    return interleave_sve_u8q_3x3_dot::get_packed_size(args);
+  }
+
+  /** Interleave weights and biases into @p buffer for the dot-product kernel.
+   *
+   * The channel count forwarded to the interleaver is
+   * input_channels * channel_multiplier; @p ld_weight_col and
+   * @p ld_weight_row are the strides of the source weight tensor.
+   */
+  void pack_parameters(
+    const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+    const void *weights, size_t ld_weight_col, size_t ld_weight_row
+  ) const override
+  {
+    interleave_sve_u8q_3x3_dot::pack_parameters(
+      args.input_channels * args.channel_multiplier, buffer, reinterpret_cast<const int32_t *>(biases),
+      reinterpret_cast<const uint8_t *>(weights), qp, ld_weight_col, ld_weight_row
+    );
+  }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f0860c98b9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,497 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+#include "arm_gemm.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+/** Assembly implementation of the 3x3 / stride-1 / 2x2-output u8q depthwise
+ * dot-product kernel (the strategy class lives in the accompanying .hpp).
+ *
+ * @param n_channels Number of channels to process; drives the WHILELT
+ *                   predicates and the loop's "b.any 1b" termination test.
+ * @param inptrs     Array of input pointers, read in pairs via LDP.
+ * @param params     Packed weights / biases / requantize data as produced by
+ *                   interleave_sve_u8q_3x3_dot; advanced in place by ADDVL
+ *                   (note the "+&r" read-write constraint below).
+ * @param qp         Requantize32 block: the b_offset, c_offset, minval and
+ *                   maxval fields are loaded via the offsetof_* operands.
+ * @param outptrs    Four output pointers (held in x12/x11/x10/x9).
+ *
+ * The two unnamed `const int32_t *` arguments (named requant_muls /
+ * requant_shifts and bias in the sibling MLA kernel) are unused here --
+ * presumably folded into the packed params stream; TODO confirm against
+ * the interleaver.
+ *
+ * NOTE(review): the asm body is generated/hand-scheduled; do not edit the
+ * instruction stream by hand -- regenerate it instead.
+ */
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const unsigned int n_channels, const uint8_t *const *const inptrs, const uint8_t *params, const int32_t *, const arm_gemm::Requantize32& qp, const int32_t *, const int32_t *, uint8_t *const *const outptrs)
+{
+  __asm__ __volatile__(
+    "mov x14, #0x0\n"
+    "whilelt p0.b, x14, %x[n_channels]\n"
+    "ldp x27, x26, [%x[inptrs], #0x0]\n"
+    "ldp x25, x24, [%x[inptrs], #0x10]\n"
+    "ldp x23, x22, [%x[inptrs], #0x20]\n"
+    "ldp x13, x21, [%x[inptrs], #0x30]\n"
+    "mov x20, #0x1\n"
+    "ptrue p2.b\n"
+    "ldp x12, x11, [%x[outptrs], #0x0]\n"
+    "ldp x10, x9, [%x[outptrs], #0x10]\n"
+    "orr x20, x20, #0x100\n"
+    "orr x20, x20, #0x10000\n"
+    "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+    "ld1b { z21.b }, p0/Z, [x26, x14]\n"
+    "dup z25.s, w20\n"
+    "mov x28, #0x0\n"
+    "ldp x27, x26, [%x[inptrs], #0x40]\n"
+    "ld1b { z31.b }, p0/Z, [x25, x14]\n"
+    "zip2 z16.b, z15.b, z31.b\n"
+    "zip1 z15.b, z15.b, z31.b\n"
+    "ld1b { z29.b }, p0/Z, [x24, x14]\n"
+    "ldp x25, x24, [%x[inptrs], #0x50]\n"
+    "zip1 z30.b, z21.b, z29.b\n"
+    "zip2 z29.b, z21.b, z29.b\n"
+    "ld1b { z9.b }, p0/Z, [x23, x14]\n"
+    "ld1b { z20.b }, p0/Z, [x22, x14]\n"
+    "zip2 z13.b, z15.b, z30.b\n"
+    "zip1 z15.b, z15.b, z30.b\n"
+    "ldp x23, x22, [%x[inptrs], #0x60]\n"
+    "ld1b { z5.b }, p0/Z, [x13, x14]\n"
+    "zip1 z14.b, z16.b, z29.b\n"
+    "zip2 z29.b, z16.b, z29.b\n"
+    "ld1b { z17.b }, p0/Z, [x21, x14]\n"
+    "ldp x21, x20, [%x[inptrs], #0x70]\n"
+    "zip2 z31.b, z9.b, z5.b\n"
+    "zip1 z9.b, z9.b, z5.b\n"
+    "ld1b { z18.b }, p0/Z, [x27, x14]\n"
+    "ld1b { z28.b }, p0/Z, [x26, x14]\n"
+    "zip1 z21.b, z20.b, z17.b\n"
+    "zip2 z17.b, z20.b, z17.b\n"
+    "ld1b { z6.b }, p0/Z, [x25, x14]\n"
+    "ld1b { z4.b }, p0/Z, [x24, x14]\n"
+    "zip2 z23.b, z18.b, z6.b\n"
+    "zip1 z18.b, z18.b, z6.b\n"
+    "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+    "ld1b { z19.b }, p0/Z, [x22, x14]\n"
+    "zip1 z24.b, z28.b, z4.b\n"
+    "zip2 z4.b, z28.b, z4.b\n"
+    "ld1b { z16.b }, p0/Z, [x21, x14]\n"
+    "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+    "zip2 z22.b, z2.b, z16.b\n"
+    "zip1 z2.b, z2.b, z16.b\n"
+    "zip1 z0.b, z19.b, z5.b\n"
+    "zip2 z5.b, z19.b, z5.b\n"
+    "ld1w { z10.s }, p2/Z, [%x[params]]\n"
+    "ld1rw { z7.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+    "ld1rw { z6.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+    "ld1rw { z8.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+    "zip2 z19.b, z9.b, z21.b\n"
+    "zip1 z9.b, z9.b, z21.b\n"
+    "ld1rw { z16.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+    "ldp x27, x26, [%x[inptrs], #0x0]\n"
+    "zip1 z11.b, z31.b, z17.b\n"
+    "zip2 z17.b, z31.b, z17.b\n"
+    "ldp x25, x23, [%x[inptrs], #0x10]\n"
+    "ldp x24, x22, [%x[inptrs], #0x20]\n"
+    "zip2 z12.b, z18.b, z24.b\n"
+    "zip1 z18.b, z18.b, z24.b\n"
+    "ldp x21, x20, [%x[inptrs], #0x30]\n"
+    "zip1 z20.b, z23.b, z4.b\n"
+    "zip2 z4.b, z23.b, z4.b\n"
+    "ld1b { z26.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "zip2 z24.b, z2.b, z0.b\n"
+    "zip1 z2.b, z2.b, z0.b\n"
+    "ld1b { z3.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "zip1 z0.b, z22.b, z5.b\n"
+    "zip2 z5.b, z22.b, z5.b\n"
+    "addvl %x[params], %x[params], #4\n"
+    "mov z22.d, z10.d\n"
+    "mov z31.d, z10.d\n"
+    "mov z21.d, z10.d\n"
+    "1:"  // Loop
+    "mov z30.s, #0x0\n"
+    "udot z30.s, z25.b, z9.b\n"
+    "udot z10.s, z26.b, z15.b\n"
+    "whilelt p0.s, x28, %x[n_channels]\n"
+    "udot z30.s, z25.b, z18.b\n"
+    "udot z31.s, z26.b, z9.b\n"
+    "mov z27.s, #0x0\n"
+    "incw x14, ALL, MUL #4\n"
+    "udot z10.s, z3.b, z9.b\n"
+    "ext z9.b, z9.b, z9.b, #0x1\n"
+    "movprfx z28, z30\n udot z28.s, z25.b, z2.b\n"
+    "udot z30.s, z25.b, z15.b\n"
+    "ext z15.b, z15.b, z15.b, #0x1\n"
+    "udot z27.s, z25.b, z9.b\n"
+    "udot z31.s, z3.b, z18.b\n"
+    "udot z10.s, z1.b, z18.b\n"
+    "ext z18.b, z18.b, z18.b, #0x1\n"
+    "udot z22.s, z26.b, z15.b\n"
+    "udot z21.s, z26.b, z9.b\n"
+    "udot z27.s, z25.b, z18.b\n"
+    "udot z31.s, z1.b, z2.b\n"
+    "ext z2.b, z2.b, z2.b, #0x1\n"
+    "udot z22.s, z3.b, z9.b\n"
+    "udot z21.s, z3.b, z18.b\n"
+    "ld1w { z3.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "mls z10.s, p2/M, z30.s, z8.s\n"
+    "movprfx z26, z27\n udot z26.s, z25.b, z2.b\n"
+    "mov z9.s, #0x0\n"
+    "udot z27.s, z25.b, z15.b\n"
+    "ld1w { z23.s }, p2/Z, [%x[params]]\n"
+    "udot z22.s, z1.b, z18.b\n"
+    ".inst 0x04b7754a  // sqrdmulh z10.s, z10.s, z23.s\n"
+    "udot z21.s, z1.b, z2.b\n"
+    "mls z22.s, p2/M, z27.s, z8.s\n"
+    "and z18.d, z10.d, z3.d\n"
+    "mls z31.s, p2/M, z28.s, z8.s\n"
+    "mls z21.s, p2/M, z26.s, z8.s\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    ".inst 0x04b776d6  // sqrdmulh z22.s, z22.s, z23.s\n"
+    ".inst 0x04b777ff  // sqrdmulh z31.s, z31.s, z23.s\n"
+    "udot z9.s, z25.b, z19.b\n"
+    ".inst 0x04b776b5  // sqrdmulh z21.s, z21.s, z23.s\n"
+    "sqadd z10.s, z10.s, z18.s\n"
+    ".inst 0x4482886a  // srshl z10.s, p2/M, z10.s, z3.s\n"
+    "udot z9.s, z25.b, z12.b\n"
+    "and z28.d, z22.d, z3.d\n"
+    "and z23.d, z31.d, z3.d\n"
+    "movprfx z27, z9\n udot z27.s, z25.b, z24.b\n"
+    "ld1w { z30.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "and z18.d, z21.d, z3.d\n"
+    "asr z28.s, z28.s, #0x1f\n"
+    "udot z9.s, z25.b, z13.b\n"
+    "asr z23.s, z23.s, #0x1f\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sqadd z22.s, z22.s, z28.s\n"
+    "sqadd z31.s, z31.s, z23.s\n"
+    ".inst 0x44828876  // srshl z22.s, p2/M, z22.s, z3.s\n"
+    ".inst 0x4482887f  // srshl z31.s, p2/M, z31.s, z3.s\n"
+    "sqadd z21.s, z21.s, z18.s\n"
+    "add z10.s, z10.s, z16.s\n"
+    ".inst 0x44828875  // srshl z21.s, p2/M, z21.s, z3.s\n"
+    "smax z10.s, p2/M, z10.s, z7.s\n"
+    "add z22.s, z22.s, z16.s\n"
+    "add z31.s, z31.s, z16.s\n"
+    "smin z10.s, p2/M, z10.s, z6.s\n"
+    "smax z22.s, p2/M, z22.s, z7.s\n"
+    "add z21.s, z21.s, z16.s\n"
+    "smax z31.s, p2/M, z31.s, z7.s\n"
+    "smax z21.s, p2/M, z21.s, z7.s\n"
+    "st1b { z10.s }, p0, [x12, x28]\n"
+    "ld1w { z28.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "ld1b { z1.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "smin z22.s, p2/M, z22.s, z6.s\n"
+    "smin z31.s, p2/M, z31.s, z6.s\n"
+    "smin z21.s, p2/M, z21.s, z6.s\n"
+    "st1b { z22.s }, p0, [x11, x28]\n"
+    "mov z26.d, z28.d\n"
+    "ld1b { z15.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "st1b { z31.s }, p0, [x10, x28]\n"
+    "mov z31.d, z28.d\n"
+    "udot z31.s, z1.b, z19.b\n"
+    "ld1b { z23.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "st1b { z21.s }, p0, [x9, x28]\n"
+    "mov z22.d, z28.d\n"
+    "udot z28.s, z1.b, z13.b\n"
+    "udot z28.s, z15.b, z19.b\n"
+    "ext z13.b, z13.b, z13.b, #0x1\n"
+    "ext z19.b, z19.b, z19.b, #0x1\n"
+    "udot z26.s, z1.b, z13.b\n"
+    "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "mov z18.s, #0x0\n"
+    "udot z22.s, z1.b, z19.b\n"
+    "udot z18.s, z25.b, z19.b\n"
+    "incw x28\n"
+    "udot z31.s, z15.b, z12.b\n"
+    "udot z28.s, z23.b, z12.b\n"
+    "ext z12.b, z12.b, z12.b, #0x1\n"
+    "whilelt p0.s, x28, %x[n_channels]\n"
+    "udot z26.s, z15.b, z19.b\n"
+    "udot z22.s, z15.b, z12.b\n"
+    "addvl %x[params], %x[params], #16\n"
+    "udot z18.s, z25.b, z12.b\n"
+    "udot z31.s, z23.b, z24.b\n"
+    "ext z24.b, z24.b, z24.b, #0x1\n"
+    "mls z28.s, p2/M, z9.s, z8.s\n"
+    "udot z26.s, z23.b, z12.b\n"
+    ".inst 0x04be779c  // sqrdmulh z28.s, z28.s, z30.s\n"
+    "udot z22.s, z23.b, z24.b\n"
+    "movprfx z12, z18\n udot z12.s, z25.b, z24.b\n"
+    "and z2.d, z28.d, z21.d\n"
+    "udot z18.s, z25.b, z13.b\n"
+    "mls z26.s, p2/M, z18.s, z8.s\n"
+    "asr z2.s, z2.s, #0x1f\n"
+    "mls z31.s, p2/M, z27.s, z8.s\n"
+    "mls z22.s, p2/M, z12.s, z8.s\n"
+    ".inst 0x04be775a  // sqrdmulh z26.s, z26.s, z30.s\n"
+    ".inst 0x04be77ff  // sqrdmulh z31.s, z31.s, z30.s\n"
+    ".inst 0x04be76d6  // sqrdmulh z22.s, z22.s, z30.s\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #-4, MUL VL]\n"
+    "sqadd z28.s, z28.s, z2.s\n"
+    "and z24.d, z26.d, z21.d\n"
+    ".inst 0x44828abc  // srshl z28.s, p2/M, z28.s, z21.s\n"
+    "and z23.d, z31.d, z21.d\n"
+    "and z18.d, z22.d, z21.d\n"
+    "asr z24.s, z24.s, #0x1f\n"
+    "asr z23.s, z23.s, #0x1f\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sqadd z26.s, z26.s, z24.s\n"
+    ".inst 0x44828aba  // srshl z26.s, p2/M, z26.s, z21.s\n"
+    "ld1b { z30.b }, p2/Z, [%x[params], #-6, MUL VL]\n"
+    "sqadd z31.s, z31.s, z23.s\n"
+    "sqadd z22.s, z22.s, z18.s\n"
+    ".inst 0x44828abf  // srshl z31.s, p2/M, z31.s, z21.s\n"
+    ".inst 0x44828ab6  // srshl z22.s, p2/M, z22.s, z21.s\n"
+    "add z28.s, z28.s, z16.s\n"
+    "smax z28.s, p2/M, z28.s, z7.s\n"
+    "add z26.s, z26.s, z16.s\n"
+    "smin z28.s, p2/M, z28.s, z6.s\n"
+    "add z31.s, z31.s, z16.s\n"
+    "add z22.s, z22.s, z16.s\n"
+    "smax z26.s, p2/M, z26.s, z7.s\n"
+    "smax z31.s, p2/M, z31.s, z7.s\n"
+    "mov z24.s, #0x0\n"
+    "udot z24.s, z25.b, z11.b\n"
+    "smax z22.s, p2/M, z22.s, z7.s\n"
+    "st1b { z28.s }, p0, [x12, x28]\n"
+    "ld1w { z23.s }, p2/Z, [%x[params], #-8, MUL VL]\n"
+    "ld1b { z19.b }, p2/Z, [%x[params], #-7, MUL VL]\n"
+    "smin z26.s, p2/M, z26.s, z6.s\n"
+    "smin z31.s, p2/M, z31.s, z6.s\n"
+    "smin z22.s, p2/M, z22.s, z6.s\n"
+    "st1b { z26.s }, p0, [x11, x28]\n"
+    "mov z28.d, z23.d\n"
+    "udot z24.s, z25.b, z20.b\n"
+    "st1b { z31.s }, p0, [x10, x28]\n"
+    "mov z27.d, z23.d\n"
+    "udot z27.s, z19.b, z11.b\n"
+    "movprfx z13, z24\n udot z13.s, z25.b, z0.b\n"
+    "st1b { z22.s }, p0, [x9, x28]\n"
+    "mov z26.d, z23.d\n"
+    "udot z23.s, z19.b, z14.b\n"
+    "udot z23.s, z30.b, z11.b\n"
+    "udot z24.s, z25.b, z14.b\n"
+    "ext z14.b, z14.b, z14.b, #0x1\n"
+    "ld1b { z21.b }, p2/Z, [%x[params], #-5, MUL VL]\n"
+    "udot z28.s, z19.b, z14.b\n"
+    "ext z11.b, z11.b, z11.b, #0x1\n"
+    "mov z12.s, #0x0\n"
+    "udot z26.s, z19.b, z11.b\n"
+    "ld1w { z22.s }, p2/Z, [%x[params], #-3, MUL VL]\n"
+    "udot z12.s, z25.b, z11.b\n"
+    "udot z27.s, z30.b, z20.b\n"
+    "incw x28\n"
+    "whilelt p0.s, x28, %x[n_channels]\n"
+    "udot z23.s, z21.b, z20.b\n"
+    "ext z20.b, z20.b, z20.b, #0x1\n"
+    "udot z28.s, z30.b, z11.b\n"
+    "udot z26.s, z30.b, z20.b\n"
+    "udot z12.s, z25.b, z20.b\n"
+    "udot z27.s, z21.b, z0.b\n"
+    "ext z0.b, z0.b, z0.b, #0x1\n"
+    "mls z23.s, p2/M, z24.s, z8.s\n"
+    "udot z28.s, z21.b, z20.b\n"
+    "udot z26.s, z21.b, z0.b\n"
+    ".inst 0x04a176f7  // sqrdmulh z23.s, z23.s, z1.s\n"
+    "movprfx z19, z12\n udot z19.s, z25.b, z0.b\n"
+    "udot z12.s, z25.b, z14.b\n"
+    "and z18.d, z23.d, z22.d\n"
+    "mls z28.s, p2/M, z12.s, z8.s\n"
+    "mls z27.s, p2/M, z13.s, z8.s\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "mls z26.s, p2/M, z19.s, z8.s\n"
+    ".inst 0x04a1779c  // sqrdmulh z28.s, z28.s, z1.s\n"
+    ".inst 0x04a1777b  // sqrdmulh z27.s, z27.s, z1.s\n"
+    ".inst 0x04a1775a  // sqrdmulh z26.s, z26.s, z1.s\n"
+    "ld1w { z2.s }, p2/Z, [%x[params], #2, MUL VL]\n"
+    "sqadd z23.s, z23.s, z18.s\n"
+    "and z20.d, z28.d, z22.d\n"
+    ".inst 0x44828ad7  // srshl z23.s, p2/M, z23.s, z22.s\n"
+    "and z19.d, z27.d, z22.d\n"
+    "and z18.d, z26.d, z22.d\n"
+    "asr z20.s, z20.s, #0x1f\n"
+    "asr z19.s, z19.s, #0x1f\n"
+    "asr z18.s, z18.s, #0x1f\n"
+    "sqadd z28.s, z28.s, z20.s\n"
+    ".inst 0x44828adc  // srshl z28.s, p2/M, z28.s, z22.s\n"
+    "ld1b { z13.b }, p2/Z, [%x[params]]\n"
+    "sqadd z27.s, z27.s, z19.s\n"
+    "sqadd z26.s, z26.s, z18.s\n"
+    ".inst 0x44828adb  // srshl z27.s, p2/M, z27.s, z22.s\n"
+    ".inst 0x44828ada  // srshl z26.s, p2/M, z26.s, z22.s\n"
+    "add z23.s, z23.s, z16.s\n"
+    "smax z23.s, p2/M, z23.s, z7.s\n"
+    "add z28.s, z28.s, z16.s\n"
+    "smin z23.s, p2/M, z23.s, z6.s\n"
+    "add z27.s, z27.s, z16.s\n"
+    "add z26.s, z26.s, z16.s\n"
+    "smax z28.s, p2/M, z28.s, z7.s\n"
+    "smax z27.s, p2/M, z27.s, z7.s\n"
+    "mov z24.s, #0x0\n"
+    "udot z24.s, z25.b, z17.b\n"
+    "smax z26.s, p2/M, z26.s, z7.s\n"
+    "st1b { z23.s }, p0, [x12, x28]\n"
+    "ld1w { z1.s }, p2/Z, [%x[params], #-2, MUL VL]\n"
+    "ld1b { z21.b }, p2/Z, [%x[params], #-1, MUL VL]\n"
+    "smin z28.s, p2/M, z28.s, z6.s\n"
+    "smin z27.s, p2/M, z27.s, z6.s\n"
+    "smin z26.s, p2/M, z26.s, z6.s\n"
+    "st1b { z28.s }, p0, [x11, x28]\n"
+    "mov z0.d, z1.d\n"
+    "udot z24.s, z25.b, z4.b\n"
+    "st1b { z27.s }, p0, [x10, x28]\n"
+    "mov z31.d, z1.d\n"
+    "udot z31.s, z21.b, z17.b\n"
+    "movprfx z23, z24\n udot z23.s, z25.b, z5.b\n"
+    "st1b { z26.s }, p0, [x9, x28]\n"
+    "mov z30.d, z1.d\n"
+    "udot z1.s, z21.b, z29.b\n"
+    "udot z1.s, z13.b, z17.b\n"
+    "udot z24.s, z25.b, z29.b\n"
+    "ext z29.b, z29.b, z29.b, #0x1\n"
+    "ld1b { z20.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+    "udot z0.s, z21.b, z29.b\n"
+    "ext z17.b, z17.b, z17.b, #0x1\n"
+    "mov z19.s, #0x0\n"
+    "udot z30.s, z21.b, z17.b\n"
+    "ld1w { z22.s }, p2/Z, [%x[params], #3, MUL VL]\n"
+    "udot z19.s, z25.b, z17.b\n"
+    "udot z31.s, z13.b, z4.b\n"
+    "incw x28\n"
+    "whilelt p1.s, x28, %x[n_channels]\n"
+    "udot z1.s, z20.b, z4.b\n"
+    "ext z4.b, z4.b, z4.b, #0x1\n"
+    "udot z0.s, z13.b, z17.b\n"
+    "whilelt p0.b, x14, %x[n_channels]\n"
+    "udot z30.s, z13.b, z4.b\n"
+    "udot z19.s, z25.b, z4.b\n"
+    "ld1b { z13.b }, p0/Z, [x26, x14]\n"
+    "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+    "udot z31.s, z20.b, z5.b\n"
+    "ext z5.b, z5.b, z5.b, #0x1\n"
+    "mls z1.s, p2/M, z24.s, z8.s\n"
+    "ld1b { z27.b }, p0/Z, [x22, x14]\n"
+    "udot z0.s, z20.b, z4.b\n"
+    "udot z30.s, z20.b, z5.b\n"
+    ".inst 0x04a27421  // sqrdmulh z1.s, z1.s, z2.s\n"
+    "ld1b { z26.b }, p0/Z, [x21, x14]\n"
+    "movprfx z18, z19\n udot z18.s, z25.b, z5.b\n"
+    "udot z19.s, z25.b, z29.b\n"
+    "and z11.d, z1.d, z22.d\n"
+    "ld1b { z29.b }, p0/Z, [x23, x14]\n"
+    "mls z0.s, p2/M, z19.s, z8.s\n"
+    "mls z31.s, p2/M, z23.s, z8.s\n"
+    "asr z11.s, z11.s, #0x1f\n"
+    "ld1b { z17.b }, p0/Z, [x20, x14]\n"
+    "mls z30.s, p2/M, z18.s, z8.s\n"
+    ".inst 0x04a27400  // sqrdmulh z0.s, z0.s, z2.s\n"
+    ".inst 0x04a277ff  // sqrdmulh z31.s, z31.s, z2.s\n"
+    ".inst 0x04a277de  // sqrdmulh z30.s, z30.s, z2.s\n"
+    "ld1b { z15.b }, p0/Z, [x27, x14]\n"
+    "ldp x23, x22, [%x[inptrs], #0x40]\n"
+    "sqadd z1.s, z1.s, z11.s\n"
+    "and z21.d, z0.d, z22.d\n"
+    ".inst 0x44828ac1  // srshl z1.s, p2/M, z1.s, z22.s\n"
+    "ldp x21, x20, [%x[inptrs], #0x50]\n"
+    "and z20.d, z31.d, z22.d\n"
+    "and z19.d, z30.d, z22.d\n"
+    "ld1b { z18.b }, p0/Z, [x23, x14]\n"
+    "ld1b { z11.b }, p0/Z, [x22, x14]\n"
+    "asr z21.s, z21.s, #0x1f\n"
+    "asr z20.s, z20.s, #0x1f\n"
+    "ld1b { z24.b }, p0/Z, [x21, x14]\n"
+    "ld1b { z4.b }, p0/Z, [x20, x14]\n"
+    "asr z19.s, z19.s, #0x1f\n"
+    "sqadd z0.s, z0.s, z21.s\n"
+    ".inst 0x44828ac0  // srshl z0.s, p2/M, z0.s, z22.s\n"
+    "ld1b { z3.b }, p2/Z, [%x[params], #6, MUL VL]\n"
+    "sqadd z31.s, z31.s, z20.s\n"
+    "sqadd z30.s, z30.s, z19.s\n"
+    ".inst 0x44828adf  // srshl z31.s, p2/M, z31.s, z22.s\n"
+    ".inst 0x44828ade  // srshl z30.s, p2/M, z30.s, z22.s\n"
+    "add z1.s, z1.s, z16.s\n"
+    "smax z1.s, p2/M, z1.s, z7.s\n"
+    "add z0.s, z0.s, z16.s\n"
+    "ld1b { z9.b }, p0/Z, [x24, x14]\n"
+    "add z31.s, z31.s, z16.s\n"
+    "add z30.s, z30.s, z16.s\n"
+    "ldp x23, x22, [%x[inptrs], #0x60]\n"
+    "ldp x21, x20, [%x[inptrs], #0x70]\n"
+    "smin z1.s, p2/M, z1.s, z6.s\n"
+    "smax z0.s, p2/M, z0.s, z7.s\n"
+    "st1b { z1.s }, p1, [x12, x28]\n"
+    "ld1b { z2.b }, p0/Z, [x23, x14]\n"
+    "smax z31.s, p2/M, z31.s, z7.s\n"
+    "smax z30.s, p2/M, z30.s, z7.s\n"
+    "ld1b { z23.b }, p0/Z, [x22, x14]\n"
+    "ld1b { z22.b }, p0/Z, [x21, x14]\n"
+    "ld1b { z5.b }, p0/Z, [x20, x14]\n"
+    "zip2 z20.b, z15.b, z28.b\n"
+    "zip1 z15.b, z15.b, z28.b\n"
+    "smin z0.s, p2/M, z0.s, z6.s\n"
+    "zip1 z19.b, z13.b, z29.b\n"
+    "zip2 z29.b, z13.b, z29.b\n"
+    "smin z31.s, p2/M, z31.s, z6.s\n"
+    "smin z30.s, p2/M, z30.s, z6.s\n"
+    "st1b { z0.s }, p1, [x11, x28]\n"
+    "zip2 z13.b, z15.b, z19.b\n"
+    "zip1 z15.b, z15.b, z19.b\n"
+    "ldp x27, x26, [%x[inptrs], #0x0]\n"
+    "st1b { z31.s }, p1, [x10, x28]\n"
+    "zip1 z14.b, z20.b, z29.b\n"
+    "zip2 z29.b, z20.b, z29.b\n"
+    "ld1w { z10.s }, p2/Z, [%x[params], #4, MUL VL]\n"
+    "st1b { z30.s }, p1, [x9, x28]\n"
+    "zip2 z21.b, z9.b, z26.b\n"
+    "zip1 z9.b, z9.b, z26.b\n"
+    "incw x28\n"
+    "zip1 z20.b, z27.b, z17.b\n"
+    "zip2 z17.b, z27.b, z17.b\n"
+    "ldp x25, x23, [%x[inptrs], #0x10]\n"
+    "ldp x24, x22, [%x[inptrs], #0x20]\n"
+    "zip2 z31.b, z18.b, z24.b\n"
+    "zip1 z18.b, z18.b, z24.b\n"
+    "ldp x21, x20, [%x[inptrs], #0x30]\n"
+    "ld1b { z26.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+    "zip1 z27.b, z11.b, z4.b\n"
+    "zip2 z4.b, z11.b, z4.b\n"
+    "ld1b { z1.b }, p2/Z, [%x[params], #7, MUL VL]\n"
+    "addvl %x[params], %x[params], #8\n"
+    "zip2 z30.b, z2.b, z22.b\n"
+    "zip1 z2.b, z2.b, z22.b\n"
+    "zip1 z28.b, z23.b, z5.b\n"
+    "zip2 z5.b, z23.b, z5.b\n"
+    "zip2 z19.b, z9.b, z20.b\n"
+    "zip1 z9.b, z9.b, z20.b\n"
+    "zip1 z11.b, z21.b, z17.b\n"
+    "zip2 z17.b, z21.b, z17.b\n"
+    "zip2 z12.b, z18.b, z27.b\n"
+    "zip1 z18.b, z18.b, z27.b\n"
+    "zip1 z20.b, z31.b, z4.b\n"
+    "zip2 z4.b, z31.b, z4.b\n"
+    "zip2 z24.b, z2.b, z28.b\n"
+    "zip1 z2.b, z2.b, z28.b\n"
+    "zip1 z0.b, z30.b, z5.b\n"
+    "zip2 z5.b, z30.b, z5.b\n"
+    "mov z22.d, z10.d\n"
+    "mov z31.d, z10.d\n"
+    "mov z21.d, z10.d\n"
+    "b.any 1b\n"
+    : [params] "+&r" (params)
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+    : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..0300b71d7c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+/** SVE depthwise strategy: 3x3 kernel, stride 1, 2x2 output tile, NHWC
+ * layout, quantized uint8 in/out (u8q), using plain multiply-accumulate
+ * (MLA) arithmetic.
+ *
+ * Unlike the dot-product variant, this class overrides neither
+ * get_storage_size() nor pack_parameters(), so the base-class parameter
+ * handling is used as-is.
+ */
+class sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  // Shorthand for the base strategy (input, weight, output, accumulator types).
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 3;
+  constexpr static unsigned int kernel_cols = 3;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  // Constructor arguments appear to be (output rows, output cols, kernel
+  // rows, kernel cols, stride rows, stride cols), matching the constants
+  // above -- confirm against the Parent constructor before modifying.
+  sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+  // Assembly kernel entry point (implemented in the accompanying generic.cpp).
+  Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  // Advertise an accumulator depth of two vector lengths to the framework.
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..5c26010c0d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x16, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x16\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z14.h }, p4/Z, [x14]\n"
+ "ld1b { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1b { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e19ce // usublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e1ab5 // usublb z21.h, z21.b, z30.b\n"
+ "ld1b { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1821 // usublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e18c6 // usublb z6.h, z6.b, z30.b\n"
+ "ld1b { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1b { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1a52 // usublb z18.h, z18.b, z30.b\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e18e7 // usublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e194a // usublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "1:" // Loop
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1b { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1b { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1b { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ld1b { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ "ld1b { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ "ld1b { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
+ "inch x16\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
+ "incw x20\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n"
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z7.d, z25.d, z1.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
+ ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n"
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n"
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1b { z14.h }, p4/Z, [x14]\n"
+ "ld1b { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1b { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e19ce // usublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e1ab5 // usublb z21.h, z21.b, z30.b\n"
+ "ld1b { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1b { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1821 // usublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e18c6 // usublb z6.h, z6.b, z30.b\n"
+ "ld1b { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1b { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1b { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1a52 // usublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e18e7 // usublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e194a // usublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..bcd0d60d3c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+// Strategy descriptor for the asymmetric-quantized uint8 ("u8q") NHWC depthwise
+// kernel: 3x3 window, stride 2, producing a 2x2 output tile per invocation,
+// implemented with SVE MLA-style instructions (see the matching generic.cpp).
+// Template parameters <input, weight, output, accumulator> = <u8, u8, u8, s32>.
+class sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ // Static kernel geometry, mirrored in the Parent(...) ctor arguments below.
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ // Parent args: output rows/cols (2, 2), kernel rows/cols (3, 3), strides (2, 2).
+ // The CPUInfo pointer is accepted for interface uniformity but unused here.
+ sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+ // This strategy requires (vector-length-agnostic) SVE.
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ // The actual compute routine lives in the companion generic.cpp.
+ Parent::KernelType kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ // The kernel accumulates two vectors' worth of channels per pass
+ // (the asm maintains paired even/odd 32-bit accumulator registers).
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..1ea2fcbfbd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// Compute routine for the u8q NHWC 3x3 stride-2 depthwise kernel (2x2 output
+// tile per call, per the kernel name). NOTE: the inline-assembly body is
+// machine generated — do not hand edit; regenerate instead.
+//
+// Parameters:
+//   n_channels      — number of channels to process (channel-innermost NHWC).
+//   inptrs          — 25 input-point pointers (one per tap/output position).
+//   weights         — packed 3x3-per-channel weights consumed in-order by the asm.
+//   bias            — per-channel int32 bias values.
+//   qp              — requantization parameters (a/b/c offsets, min/max clamp).
+//   requant_muls    — per-channel requantize multipliers (sqrdmulh operands).
+//   requant_shifts  — per-channel requantize shifts (srshl operands).
+//   outptrs         — 4 output pointers, one per output point of the 2x2 tile.
+void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ // Argument block marshalled into a single struct so the asm can address
+ // everything relative to one base register via the offsetof_* constants.
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ // Permute the caller's input pointers into the fixed order the generated
+ // asm consumes (presumably its tap-visitation order — determined by the
+ // generator; keep in sync with it).
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ // Generated SVE assembly. Outline (from the instruction stream below):
+ //  - prologue: load requant offsets/clamps (ld1rb/ld1rh), widen weights to
+ //    16-bit with the b_offset subtracted (usublb), load bias into paired
+ //    even/odd s32 accumulators (uzp1/uzp2), load first input vectors;
+ //  - loop "1:": widening multiply-accumulate (smlalb/smlalt) of all taps,
+ //    then requantize — sqrdmulh by the per-channel multiplier, rounding
+ //    shift (srshl) with the correction term, narrow (sqxtnb/sqxtnt), add
+ //    c_offset, clamp with smax/smin — and store one byte-vector per output
+ //    point; predicate registers (whilelt/b.any) drive the channel loop so
+ //    the tail needs no scalar fallback.
+ // The ".inst 0x…" words are pre-encoded instructions (mnemonic in the
+ // trailing comment) for assemblers lacking the corresponding SVE mnemonics.
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x7\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
+ "ldp x16, x15, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x7, x8\n"
+ "ldp x14, x13, [x24, #0x10]\n"
+ "whilelt p2.s, x7, x8\n"
+ "whilelt p1.s, x23, x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z25.h }, p4/Z, [x17]\n"
+ "ld1b { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1b { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1b39 // usublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d1bde // usublb z30.h, z30.b, z13.b\n"
+ "ld1b { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d19ce // usublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ "ld1b { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d194a // usublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1b { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "mov z18.d, z8.d\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d1af7 // usublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d18e7 // usublb z7.h, z7.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "1:" // Loop
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z29.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1b { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1b { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1b { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ld1b { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1b { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1b { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "ld1b { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a1bde // usublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
+ "inch x7\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
+ "mov x20, x7\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
+ "incw x20\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "whilelt p2.s, x7, x8\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
+ "whilelt p1.s, x20, x8\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
+ "whilelt p3.h, x7, x8\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1b { z25.h }, p4/Z, [x17]\n"
+ "ld1b { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1b { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1b { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1b39 // usublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d1bde // usublb z30.h, z30.b, z13.b\n"
+ "ld1b { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1b { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d19ce // usublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1884 // usublb z4.h, z4.b, z13.b\n"
+ "ld1b { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1b { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d194a // usublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1b { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d1863 // usublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d1af7 // usublb z23.h, z23.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d18e7 // usublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1842 // usublb z2.h, z2.b, z13.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..dfaa059e9f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+// Strategy descriptor that binds the generated SVE kernel declared above to
+// the depthwise depth-first framework: 5x5 kernel, stride 1, one 2x2 output
+// tile per call, uint8 quantized input/weights/output with int32 accumulators.
+class sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ // Parent ctor arguments (2, 2, 5, 5, 1, 1) mirror the constants above --
+ // presumably (output rows, output cols, kernel rows, kernel cols, stride
+ // rows, stride cols); confirm against DepthwiseDepthfirstStrategy.
+ sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ // Returns 2: per the method name, the kernel works on two vector-lengths of
+ // channels per accumulator pass (the asm keeps paired p2/p1 predicated halves).
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..b8adbb8262
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+// SVE kernel: uint8 quantized depthwise 5x5 convolution, stride 1, producing
+// a 2x2 output tile over n_channels (NHWC layout per the filename). Inputs
+// are the 36 (6x6) input-point pointers, packed weights, per-channel bias and
+// requantize multipliers/shifts, plus the Requantize32 parameters (zero
+// points, c_offset, min/max clamp). The assembly block below is generated --
+// do not hand-edit; regenerate instead.
+void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ // Argument block handed to the assembly through a single pointer; the asm
+ // reads each field via the offsetof_Params_* immediate operands declared in
+ // the input-operand list at the bottom of this function.
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ // The first 14 input pointers are permuted (indices 0..13 remapped as
+ // below) into the order the generated asm consumes them; entries 14..35
+ // pass through unchanged. The permutation matches the kernel's load
+ // schedule -- do not reorder independently of the asm.
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ // Generated SVE assembly. Channel loop "1:" runs whilelt-predicated over
+ // n_channels (x3); uint8 data/weights are widened with usublb (subtracting
+ // the a/b zero points held in z30/z10), accumulated with smlalb/smlalt,
+ // requantized with sqrdmulh+srshl, offset by c_offset (z15) and clamped to
+ // [minval, maxval] (z12/z13) before the narrowing stores. NOTE(review):
+ // many opcodes are emitted as raw ".inst" words with the mnemonic in a
+ // trailing comment -- the encodings, not the comments, are authoritative.
+ __asm__ __volatile__(
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "incw x24\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1b { z26.h }, p4/Z, [x4]\n"
+ "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
+ "mov z6.d, z14.d\n"
+ "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "1:" // Loop
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1b { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1b { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1b { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1b { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1b { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1b { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1b { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1b { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1b { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1b { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1b { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1b { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1b { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1b { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1b { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1b { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1b { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1b { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1b { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1b { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1b { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1b { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1b { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1b { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1b { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1b { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a196b // usublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1b { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1b { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1b { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1b { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
+ "incw x20\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1b { z26.h }, p4/Z, [x4]\n"
+ "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z6.d, z14.d\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1a10 // usublb z16.h, z16.b, z30.b\n"
+ ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
new file mode 100644
index 0000000000..d5382533a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..a9cd8a7fa9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp
@@ -0,0 +1,339 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x9\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ldr x23, [%x[inptrs], #0x8]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ldr x22, [%x[inptrs], #0x20]\n"
+ "ldr x21, [%x[inptrs], #0x0]\n"
+ "mov z13.b, #0x1\n"
+ "lsr z13.s, z13.s, #0x8\n"
+ "ld1b { z1.b }, p0/Z, [x23]\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "mov z8.d, z1.d\n"
+ "mov z27.d, z1.d\n"
+ "ldr x20, [%x[inptrs], #0x18]\n"
+ "ld1b { z4.b }, p0/Z, [x22]\n"
+ "mov z31.d, z1.d\n"
+ "mov z28.d, z2.d\n"
+ "ld1b { z0.b }, p0/Z, [x21]\n"
+ "mov z30.d, z2.d\n"
+ "mov z26.d, z2.d\n"
+ "ld1b { z3.b }, p0/Z, [x20]\n"
+ "mov z22.d, z4.d\n"
+ "mov z10.d, z4.d\n"
+ "ptrue p2.b\n"
+ "ld1rw { z11.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "mov z18.d, z4.d\n"
+ "ext z8.b, z8.b, z8.b, #0x2\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z11.s, p2/M, z11.s\n"
+ "ext z27.b, z27.b, z27.b, #0x4\n"
+ "ext z31.b, z31.b, z31.b, #0x6\n"
+ "mov x9, #0x0\n"
+ "whilelt p0.b, x9, x10\n"
+ "ext z28.b, z28.b, z28.b, #0x2\n"
+ "ext z30.b, z30.b, z30.b, #0x4\n"
+ "ld1w { z14.s }, p0/Z, [%x[params]]\n"
+ "mov x28, #0x0\n"
+ "ext z26.b, z26.b, z26.b, #0x6\n"
+ "ext z22.b, z22.b, z22.b, #0x2\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ext z10.b, z10.b, z10.b, #0x4\n"
+ "ext z18.b, z18.b, z18.b, #0x6\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "mov z21.d, z0.d\n"
+ "mov z20.d, z0.d\n"
+ "ld1rw { z9.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "mov z19.d, z0.d\n"
+ "mov z24.d, z3.d\n"
+ "ld1rw { z12.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #1, MUL VL]\n"
+ "mov z17.d, z3.d\n"
+ "mov z16.d, z3.d\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "ext z21.b, z21.b, z21.b, #0x2\n"
+ "ext z20.b, z20.b, z20.b, #0x4\n"
+ "addvl %x[params], %x[params], #4\n"
+ "ext z19.b, z19.b, z19.b, #0x6\n"
+ "zip1 z1.s, z1.s, z27.s\n"
+ "zip1 z8.s, z8.s, z31.s\n"
+ "zip1 z2.s, z2.s, z30.s\n"
+ "zip1 z28.s, z28.s, z26.s\n"
+ "ext z24.b, z24.b, z24.b, #0x2\n"
+ "ext z17.b, z17.b, z17.b, #0x4\n"
+ "ext z16.b, z16.b, z16.b, #0x6\n"
+ "zip1 z4.s, z4.s, z10.s\n"
+ "zip1 z22.s, z22.s, z18.s\n"
+ "zip1 z0.s, z0.s, z20.s\n"
+ "zip1 z21.s, z21.s, z19.s\n"
+ "zip1 z1.s, z1.s, z8.s\n"
+ "zip1 z2.s, z2.s, z28.s\n"
+ "zip1 z3.s, z3.s, z17.s\n"
+ "zip1 z24.s, z24.s, z16.s\n"
+ "zip1 z4.s, z4.s, z22.s\n"
+ "zip1 z0.s, z0.s, z21.s\n"
+ "mov z1.q, z1.q[0]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z3.s, z3.s, z24.s\n"
+ "mov z4.q, z4.q[0]\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "udot z24.s, z13.b, z1.b[0]\n"
+ "mov z23.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "udot z25.s, z13.b, z1.b[1]\n"
+ "mov z21.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "udot z23.s, z13.b, z1.b[2]\n"
+ "mov z10.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "udot z22.s, z13.b, z1.b[3]\n"
+ "mov z20.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "udot z21.s, z13.b, z2.b[0]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "udot z19.s, z13.b, z2.b[1]\n"
+ "udot z10.s, z13.b, z2.b[2]\n"
+ "udot z8.s, z13.b, z2.b[3]\n"
+ "mov z0.q, z0.q[0]\n"
+ "udot z20.s, z13.b, z4.b[0]\n"
+ "udot z18.s, z13.b, z4.b[1]\n"
+ "mov z3.q, z3.q[0]\n"
+ "udot z17.s, z13.b, z4.b[2]\n"
+ "udot z16.s, z13.b, z4.b[3]\n"
+ "mov z31.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "udot z31.s, z13.b, z0.b[0]\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "udot z30.s, z13.b, z0.b[1]\n"
+ "mov z29.s, #0x0\n"
+ "udot z26.s, z13.b, z0.b[2]\n"
+ "udot z27.s, z13.b, z0.b[3]\n"
+ "udot z28.s, z13.b, z3.b[0]\n"
+ "udot z29.s, z13.b, z3.b[1]\n"
+ "add z24.s, z24.s, z21.s\n"
+ "add z25.s, z25.s, z19.s\n"
+ "add z23.s, z23.s, z10.s\n"
+ "add z22.s, z22.s, z8.s\n"
+ "add z21.s, z20.s, z21.s\n"
+ "mov z20.s, #0x0\n"
+ "udot z20.s, z13.b, z3.b[2]\n"
+ "add z19.s, z18.s, z19.s\n"
+ "mov z18.s, #0x0\n"
+ "udot z18.s, z13.b, z3.b[3]\n"
+ "add z17.s, z17.s, z10.s\n"
+ "add z16.s, z16.s, z8.s\n"
+ "add z24.s, z24.s, z31.s\n"
+ "add z25.s, z25.s, z30.s\n"
+ "mul z24.s, p2/M, z24.s, z11.s\n"
+ "mul z25.s, p2/M, z25.s, z11.s\n"
+ "add z26.s, z23.s, z26.s\n"
+ "add z27.s, z22.s, z27.s\n"
+ "mul z26.s, p2/M, z26.s, z11.s\n"
+ "mul z27.s, p2/M, z27.s, z11.s\n"
+ "add z28.s, z21.s, z28.s\n"
+ "add z29.s, z19.s, z29.s\n"
+ "mul z28.s, p2/M, z28.s, z11.s\n"
+ "mul z29.s, p2/M, z29.s, z11.s\n"
+ "add z30.s, z17.s, z20.s\n"
+ "add z31.s, z16.s, z18.s\n"
+ "mul z30.s, p2/M, z30.s, z11.s\n"
+ "mul z31.s, p2/M, z31.s, z11.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z14.s\n"
+ "add z25.s, z25.s, z14.s\n"
+ "add z26.s, z26.s, z14.s\n"
+ "add z27.s, z27.s, z14.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
+ "1:" // Loop
+ "udot z24.s, z5.b, z0.b[0]\n"
+ "udot z25.s, z5.b, z0.b[1]\n"
+ "ld1w { z8.s }, p2/Z, [%x[params]]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "udot z26.s, z5.b, z0.b[2]\n"
+ "udot z27.s, z5.b, z0.b[3]\n"
+ "incb x9\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "udot z24.s, z6.b, z1.b[0]\n"
+ "udot z25.s, z6.b, z1.b[1]\n"
+ "whilelt p0.b, x9, x10\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "udot z26.s, z6.b, z1.b[2]\n"
+ "udot z27.s, z6.b, z1.b[3]\n"
+ "udot z28.s, z5.b, z2.b[0]\n"
+ "udot z29.s, z5.b, z2.b[1]\n"
+ "udot z30.s, z5.b, z2.b[2]\n"
+ "udot z31.s, z5.b, z2.b[3]\n"
+ "ld1b { z5.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[0]\n"
+ "udot z25.s, z7.b, z2.b[1]\n"
+ ".inst 0x04a87718 // sqrdmulh z24.s, z24.s, z8.s\n"
+ "udot z26.s, z7.b, z2.b[2]\n"
+ "udot z27.s, z7.b, z2.b[3]\n"
+ ".inst 0x04a87739 // sqrdmulh z25.s, z25.s, z8.s\n"
+ "udot z28.s, z6.b, z3.b[0]\n"
+ "udot z29.s, z6.b, z3.b[1]\n"
+ ".inst 0x04a8775a // sqrdmulh z26.s, z26.s, z8.s\n"
+ "udot z30.s, z6.b, z3.b[2]\n"
+ "udot z31.s, z6.b, z3.b[3]\n"
+ ".inst 0x04a8777b // sqrdmulh z27.s, z27.s, z8.s\n"
+ "ld1b { z6.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "udot z28.s, z7.b, z4.b[0]\n"
+ "udot z29.s, z7.b, z4.b[1]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "udot z30.s, z7.b, z4.b[2]\n"
+ "udot z31.s, z7.b, z4.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z7.b }, p0/Z, [%x[params], #5, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
+ "addvl %x[params], %x[params], #6\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04a8779c // sqrdmulh z28.s, z28.s, z8.s\n"
+ ".inst 0x04a877bd // sqrdmulh z29.s, z29.s, z8.s\n"
+ ".inst 0x04a877de // sqrdmulh z30.s, z30.s, z8.s\n"
+ ".inst 0x04a877ff // sqrdmulh z31.s, z31.s, z8.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z9.s\n"
+ "add z25.s, z25.s, z9.s\n"
+ "smin z24.s, p2/M, z24.s, z12.s\n"
+ "smin z25.s, p2/M, z25.s, z12.s\n"
+ "add z26.s, z26.s, z9.s\n"
+ "add z27.s, z27.s, z9.s\n"
+ "smin z26.s, p2/M, z26.s, z12.s\n"
+ "smin z27.s, p2/M, z27.s, z12.s\n"
+ "add z28.s, z28.s, z9.s\n"
+ "add z29.s, z29.s, z9.s\n"
+ "smin z28.s, p2/M, z28.s, z12.s\n"
+ "smin z29.s, p2/M, z29.s, z12.s\n"
+ "add z30.s, z30.s, z9.s\n"
+ "add z31.s, z31.s, z9.s\n"
+ "smin z30.s, p2/M, z30.s, z12.s\n"
+ "smin z31.s, p2/M, z31.s, z12.s\n"
+ "smax z24.s, p2/M, z24.s, z15.s\n"
+ "smax z25.s, p2/M, z25.s, z15.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z15.s\n"
+ "smax z27.s, p2/M, z27.s, z15.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z15.s\n"
+ "smax z29.s, p2/M, z29.s, z15.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z15.s\n"
+ "smax z31.s, p2/M, z31.s, z15.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
new file mode 100644
index 0000000000..55b6edea2c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
+
+struct sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
new file mode 100644
index 0000000000..4b65a67309
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "arm_gemm.hpp"
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const void *params,
+ unsigned int n_output_channels,
+ const arm_gemm::Requantize32& qp
+)
+{
+ __asm__ __volatile__(
+ "mov x20, #0x6\n"
+ "whilelt p0.b, XZR, x20\n"
+ "ldr x22, [%x[inptrs], #0x18]\n"
+ "ldr x21, [%x[inptrs], #0x20]\n"
+ "ldr x20, [%x[inptrs], #0x10]\n"
+ "ld1b { z3.b }, p0/Z, [x22]\n"
+ "mov z23.d, z3.d\n"
+ "ext z23.b, z23.b, z23.b, #0x1\n"
+ "ld1b { z4.b }, p0/Z, [x21]\n"
+ "ldr x24, [%x[inptrs], #0x8]\n"
+ "mov z18.d, z4.d\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ld1b { z2.b }, p0/Z, [x20]\n"
+ "ldr x23, [%x[inptrs], #0x28]\n"
+ "mov z15.d, z2.d\n"
+ "ext z15.b, z15.b, z15.b, #0x1\n"
+ "ldr x22, [%x[inptrs], #0x30]\n"
+ "ldr x21, [%x[inptrs], #0x38]\n"
+ "zip1 z3.d, z3.d, z23.d\n"
+ "zip1 z4.d, z4.d, z18.d\n"
+ "ldr x20, [%x[inptrs], #0x0]\n"
+ "ld1b { z1.b }, p0/Z, [x24]\n"
+ "mov z19.d, z1.d\n"
+ "ext z19.b, z19.b, z19.b, #0x1\n"
+ "ld1b { z5.b }, p0/Z, [x23]\n"
+ "ld1b { z6.b }, p0/Z, [x22]\n"
+ "mov z18.d, z5.d\n"
+ "mov z22.d, z6.d\n"
+ "ld1b { z7.b }, p0/Z, [x21]\n"
+ "ld1b { z0.b }, p0/Z, [x20]\n"
+ "mov z8.d, z7.d\n"
+ "zip1 z2.d, z2.d, z15.d\n"
+ "mov z3.q, z3.q[0]\n"
+ "mov z4.q, z4.q[0]\n"
+ "ptrue p2.b\n"
+ "ld1rw { z23.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_b_offset]]\n"
+ "ext z18.b, z18.b, z18.b, #0x1\n"
+ "ext z22.b, z22.b, z22.b, #0x1\n"
+ "lsl x10, %x[n_channels], #0x2\n"
+ "neg z23.s, p2/M, z23.s\n"
+ "ext z8.b, z8.b, z8.b, #0x1\n"
+ "mov z28.b, #0x1\n"
+ "mov x9, #0x0\n"
+ "whilelt p0.b, x9, x10\n"
+ "mov z25.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "udot z25.s, z28.b, z3.b[0]\n"
+ "ld1w { z12.s }, p0/Z, [%x[params]]\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "udot z24.s, z28.b, z3.b[2]\n"
+ "mov x28, #0x0\n"
+ "mov z27.d, z0.d\n"
+ "udot z17.s, z28.b, z4.b[0]\n"
+ "udot z16.s, z28.b, z4.b[2]\n"
+ "ldp x27, x26, [%x[outptrs], #0x0]\n"
+ "ext z27.b, z27.b, z27.b, #0x1\n"
+ "zip1 z1.d, z1.d, z19.d\n"
+ "ldp x25, x24, [%x[outptrs], #0x10]\n"
+ "ldp x23, x22, [%x[outptrs], #0x20]\n"
+ "mov z2.q, z2.q[0]\n"
+ "zip1 z5.d, z5.d, z18.d\n"
+ "ldp x21, x20, [%x[outptrs], #0x30]\n"
+ "ld1rw { z13.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_c_offset]]\n"
+ "zip1 z6.d, z6.d, z22.d\n"
+ "zip1 z7.d, z7.d, z8.d\n"
+ "ld1rw { z14.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_minval]]\n"
+ "ld1rw { z15.s }, p2/Z, [%x[qp], %[offsetof_Requantize32_maxval]]\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "udot z30.s, z28.b, z2.b[0]\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #1, MUL VL]\n"
+ "mov z29.s, #0x1\n"
+ "udot z31.s, z28.b, z2.b[2]\n"
+ "udot z25.s, z29.b, z3.b[1]\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #2, MUL VL]\n"
+ "zip1 z0.d, z0.d, z27.d\n"
+ "mov z1.q, z1.q[0]\n"
+ "udot z24.s, z29.b, z3.b[3]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #3, MUL VL]\n"
+ "mov z5.q, z5.q[0]\n"
+ "mov z6.q, z6.q[0]\n"
+ "udot z17.s, z29.b, z4.b[1]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #4, MUL VL]\n"
+ "mov z7.q, z7.q[0]\n"
+ "mov z22.s, #0x0\n"
+ "udot z16.s, z29.b, z4.b[3]\n"
+ "addvl %x[params], %x[params], #5\n"
+ "mov z21.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "udot z22.s, z28.b, z1.b[0]\n"
+ "mov z27.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "udot z21.s, z28.b, z1.b[2]\n"
+ "mov z19.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "udot z26.s, z28.b, z5.b[0]\n"
+ "udot z27.s, z28.b, z5.b[2]\n"
+ "udot z20.s, z28.b, z6.b[0]\n"
+ "mov z0.q, z0.q[0]\n"
+ "udot z19.s, z28.b, z6.b[2]\n"
+ "udot z18.s, z28.b, z7.b[0]\n"
+ "add z17.s, z25.s, z17.s\n"
+ "mov z25.s, #0x0\n"
+ "udot z25.s, z28.b, z7.b[2]\n"
+ "udot z30.s, z29.b, z2.b[1]\n"
+ "udot z31.s, z29.b, z2.b[3]\n"
+ "add z16.s, z24.s, z16.s\n"
+ "udot z22.s, z29.b, z1.b[1]\n"
+ "mov z24.s, #0x0\n"
+ "udot z24.s, z28.b, z0.b[0]\n"
+ "udot z21.s, z29.b, z1.b[3]\n"
+ "udot z26.s, z29.b, z5.b[1]\n"
+ "udot z27.s, z29.b, z5.b[3]\n"
+ "add z30.s, z30.s, z17.s\n"
+ "udot z20.s, z29.b, z6.b[1]\n"
+ "udot z19.s, z29.b, z6.b[3]\n"
+ "add z31.s, z31.s, z16.s\n"
+ "udot z18.s, z29.b, z7.b[1]\n"
+ "udot z25.s, z29.b, z7.b[3]\n"
+ "add z22.s, z22.s, z30.s\n"
+ "udot z24.s, z29.b, z0.b[1]\n"
+ "add z21.s, z21.s, z31.s\n"
+ "add z20.s, z26.s, z20.s\n"
+ "add z19.s, z27.s, z19.s\n"
+ "add z18.s, z18.s, z17.s\n"
+ "mov z17.s, #0x0\n"
+ "udot z17.s, z28.b, z0.b[2]\n"
+ "udot z17.s, z29.b, z0.b[3]\n"
+ "add z16.s, z25.s, z16.s\n"
+ "add z24.s, z22.s, z24.s\n"
+ "add z25.s, z21.s, z17.s\n"
+ "mul z24.s, p2/M, z24.s, z23.s\n"
+ "mul z25.s, p2/M, z25.s, z23.s\n"
+ "add z26.s, z26.s, z22.s\n"
+ "add z27.s, z27.s, z21.s\n"
+ "mul z26.s, p2/M, z26.s, z23.s\n"
+ "mul z27.s, p2/M, z27.s, z23.s\n"
+ "add z28.s, z20.s, z30.s\n"
+ "add z29.s, z19.s, z31.s\n"
+ "mul z28.s, p2/M, z28.s, z23.s\n"
+ "mul z29.s, p2/M, z29.s, z23.s\n"
+ "add z30.s, z20.s, z18.s\n"
+ "add z31.s, z19.s, z16.s\n"
+ "mul z30.s, p2/M, z30.s, z23.s\n"
+ "mul z31.s, p2/M, z31.s, z23.s\n"
+ "zip1 z19.s, z24.s, z26.s\n"
+ "zip1 z18.s, z25.s, z27.s\n"
+ "zip1 z17.s, z28.s, z30.s\n"
+ "zip1 z16.s, z29.s, z31.s\n"
+ "zip1 z22.s, z19.s, z18.s\n"
+ "zip1 z23.s, z17.s, z16.s\n"
+ "add z24.s, z24.s, z12.s\n"
+ "add z25.s, z25.s, z12.s\n"
+ "add z26.s, z26.s, z12.s\n"
+ "add z27.s, z27.s, z12.s\n"
+ "add z28.s, z28.s, z12.s\n"
+ "add z29.s, z29.s, z12.s\n"
+ "add z30.s, z30.s, z12.s\n"
+ "add z31.s, z31.s, z12.s\n"
+ "1:" // Loop
+ "udot z24.s, z8.b, z0.b[0]\n"
+ "udot z25.s, z8.b, z0.b[2]\n"
+ "ld1w { z12.s }, p2/Z, [%x[params], #6, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[params], #7, MUL VL]\n"
+ "udot z26.s, z8.b, z1.b[0]\n"
+ "udot z27.s, z8.b, z1.b[2]\n"
+ "incb x9\n"
+ "whilelt p1.s, x28, %x[n_channels]\n"
+ "udot z24.s, z9.b, z0.b[1]\n"
+ "udot z25.s, z9.b, z0.b[3]\n"
+ "whilelt p0.b, x9, x10\n"
+ "udot z26.s, z9.b, z1.b[1]\n"
+ "udot z27.s, z9.b, z1.b[3]\n"
+ "udot z28.s, z8.b, z2.b[0]\n"
+ "udot z29.s, z8.b, z2.b[2]\n"
+ "udot z30.s, z8.b, z3.b[0]\n"
+ "udot z31.s, z8.b, z3.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params]]\n"
+ "udot z24.s, z10.b, z1.b[0]\n"
+ "udot z25.s, z10.b, z1.b[2]\n"
+ "udot z26.s, z10.b, z2.b[0]\n"
+ "udot z27.s, z10.b, z2.b[2]\n"
+ "udot z28.s, z9.b, z2.b[1]\n"
+ "udot z29.s, z9.b, z2.b[3]\n"
+ "udot z30.s, z9.b, z3.b[1]\n"
+ "udot z31.s, z9.b, z3.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #1, MUL VL]\n"
+ "udot z24.s, z11.b, z1.b[1]\n"
+ "udot z25.s, z11.b, z1.b[3]\n"
+ "udot z26.s, z11.b, z2.b[1]\n"
+ "udot z27.s, z11.b, z2.b[3]\n"
+ "udot z28.s, z10.b, z3.b[0]\n"
+ "udot z29.s, z10.b, z3.b[2]\n"
+ "udot z30.s, z10.b, z4.b[0]\n"
+ "udot z31.s, z10.b, z4.b[2]\n"
+ "ld1b { z19.b }, p2/Z, [%x[params], #2, MUL VL]\n"
+ "udot z24.s, z17.b, z2.b[0]\n"
+ "udot z25.s, z17.b, z2.b[2]\n"
+ "udot z26.s, z17.b, z3.b[0]\n"
+ "udot z27.s, z17.b, z3.b[2]\n"
+ "udot z28.s, z11.b, z3.b[1]\n"
+ "udot z29.s, z11.b, z3.b[3]\n"
+ "udot z30.s, z11.b, z4.b[1]\n"
+ "udot z31.s, z11.b, z4.b[3]\n"
+ "ld1b { z18.b }, p2/Z, [%x[params], #3, MUL VL]\n"
+ "udot z24.s, z16.b, z2.b[1]\n"
+ "udot z25.s, z16.b, z2.b[3]\n"
+ "udot z26.s, z16.b, z3.b[1]\n"
+ "udot z27.s, z16.b, z3.b[3]\n"
+ "udot z28.s, z17.b, z4.b[0]\n"
+ "udot z29.s, z17.b, z4.b[2]\n"
+ "udot z30.s, z17.b, z5.b[0]\n"
+ "udot z31.s, z17.b, z5.b[2]\n"
+ "ld1b { z17.b }, p2/Z, [%x[params], #4, MUL VL]\n"
+ "udot z24.s, z19.b, z3.b[0]\n"
+ "udot z25.s, z19.b, z3.b[2]\n"
+ "udot z26.s, z19.b, z4.b[0]\n"
+ "udot z27.s, z19.b, z4.b[2]\n"
+ "udot z28.s, z16.b, z4.b[1]\n"
+ "udot z29.s, z16.b, z4.b[3]\n"
+ "udot z30.s, z16.b, z5.b[1]\n"
+ "udot z31.s, z16.b, z5.b[3]\n"
+ "ld1b { z16.b }, p2/Z, [%x[params], #5, MUL VL]\n"
+ "addvl %x[params], %x[params], #16\n"
+ "udot z24.s, z18.b, z3.b[1]\n"
+ "udot z25.s, z18.b, z3.b[3]\n"
+ "ld1w { z20.s }, p0/Z, [%x[params], #-8, MUL VL]\n"
+ "udot z26.s, z18.b, z4.b[1]\n"
+ "udot z27.s, z18.b, z4.b[3]\n"
+ "udot z28.s, z19.b, z5.b[0]\n"
+ "udot z29.s, z19.b, z5.b[2]\n"
+ "udot z30.s, z19.b, z6.b[0]\n"
+ "udot z31.s, z19.b, z6.b[2]\n"
+ "ld1b { z10.b }, p0/Z, [%x[params], #-5, MUL VL]\n"
+ "udot z24.s, z17.b, z4.b[0]\n"
+ "udot z25.s, z17.b, z4.b[2]\n"
+ "udot z26.s, z17.b, z5.b[0]\n"
+ "udot z27.s, z17.b, z5.b[2]\n"
+ "udot z28.s, z18.b, z5.b[1]\n"
+ "udot z29.s, z18.b, z5.b[3]\n"
+ "udot z30.s, z18.b, z6.b[1]\n"
+ "udot z31.s, z18.b, z6.b[3]\n"
+ "ld1b { z11.b }, p0/Z, [%x[params], #-4, MUL VL]\n"
+ "udot z24.s, z16.b, z4.b[1]\n"
+ "udot z25.s, z16.b, z4.b[3]\n"
+ ".inst 0x04ac7718 // sqrdmulh z24.s, z24.s, z12.s\n"
+ "udot z26.s, z16.b, z5.b[1]\n"
+ "udot z27.s, z16.b, z5.b[3]\n"
+ ".inst 0x04ac7739 // sqrdmulh z25.s, z25.s, z12.s\n"
+ "udot z28.s, z17.b, z6.b[0]\n"
+ "udot z29.s, z17.b, z6.b[2]\n"
+ ".inst 0x04ac775a // sqrdmulh z26.s, z26.s, z12.s\n"
+ "udot z30.s, z17.b, z7.b[0]\n"
+ "udot z31.s, z17.b, z7.b[2]\n"
+ ".inst 0x04ac777b // sqrdmulh z27.s, z27.s, z12.s\n"
+ "ld1b { z8.b }, p0/Z, [%x[params], #-7, MUL VL]\n"
+ "udot z28.s, z16.b, z6.b[1]\n"
+ "udot z29.s, z16.b, z6.b[3]\n"
+ "and z19.d, z24.d, z21.d\n"
+ "udot z30.s, z16.b, z7.b[1]\n"
+ "udot z31.s, z16.b, z7.b[3]\n"
+ "and z18.d, z25.d, z21.d\n"
+ "ld1b { z9.b }, p0/Z, [%x[params], #-6, MUL VL]\n"
+ "and z17.d, z26.d, z21.d\n"
+ "and z16.d, z27.d, z21.d\n"
+ "addvl %x[params], %x[params], #-3\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x04ac779c // sqrdmulh z28.s, z28.s, z12.s\n"
+ ".inst 0x04ac77bd // sqrdmulh z29.s, z29.s, z12.s\n"
+ ".inst 0x04ac77de // sqrdmulh z30.s, z30.s, z12.s\n"
+ ".inst 0x04ac77ff // sqrdmulh z31.s, z31.s, z12.s\n"
+ "sqadd z24.s, z24.s, z19.s\n"
+ "sqadd z25.s, z25.s, z18.s\n"
+ ".inst 0x44828ab8 // srshl z24.s, p2/M, z24.s, z21.s\n"
+ ".inst 0x44828ab9 // srshl z25.s, p2/M, z25.s, z21.s\n"
+ "sqadd z26.s, z26.s, z17.s\n"
+ "sqadd z27.s, z27.s, z16.s\n"
+ ".inst 0x44828aba // srshl z26.s, p2/M, z26.s, z21.s\n"
+ ".inst 0x44828abb // srshl z27.s, p2/M, z27.s, z21.s\n"
+ "and z19.d, z28.d, z21.d\n"
+ "and z18.d, z29.d, z21.d\n"
+ "and z17.d, z30.d, z21.d\n"
+ "and z16.d, z31.d, z21.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z19.s\n"
+ "sqadd z29.s, z29.s, z18.s\n"
+ ".inst 0x44828abc // srshl z28.s, p2/M, z28.s, z21.s\n"
+ ".inst 0x44828abd // srshl z29.s, p2/M, z29.s, z21.s\n"
+ "sqadd z30.s, z30.s, z17.s\n"
+ "sqadd z31.s, z31.s, z16.s\n"
+ ".inst 0x44828abe // srshl z30.s, p2/M, z30.s, z21.s\n"
+ ".inst 0x44828abf // srshl z31.s, p2/M, z31.s, z21.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "smin z24.s, p2/M, z24.s, z15.s\n"
+ "smin z25.s, p2/M, z25.s, z15.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "smin z26.s, p2/M, z26.s, z15.s\n"
+ "smin z27.s, p2/M, z27.s, z15.s\n"
+ "add z28.s, z28.s, z13.s\n"
+ "add z29.s, z29.s, z13.s\n"
+ "smin z28.s, p2/M, z28.s, z15.s\n"
+ "smin z29.s, p2/M, z29.s, z15.s\n"
+ "add z30.s, z30.s, z13.s\n"
+ "add z31.s, z31.s, z13.s\n"
+ "smin z30.s, p2/M, z30.s, z15.s\n"
+ "smin z31.s, p2/M, z31.s, z15.s\n"
+ "smax z24.s, p2/M, z24.s, z14.s\n"
+ "smax z25.s, p2/M, z25.s, z14.s\n"
+ "st1b { z24.s }, p1, [x27, x28]\n"
+ "mov z24.s, z22.s[0]\n"
+ "smax z26.s, p2/M, z26.s, z14.s\n"
+ "smax z27.s, p2/M, z27.s, z14.s\n"
+ "st1b { z25.s }, p1, [x26, x28]\n"
+ "mov z25.s, z22.s[1]\n"
+ "smax z28.s, p2/M, z28.s, z14.s\n"
+ "smax z29.s, p2/M, z29.s, z14.s\n"
+ "st1b { z26.s }, p1, [x25, x28]\n"
+ "mov z26.s, z22.s[2]\n"
+ "smax z30.s, p2/M, z30.s, z14.s\n"
+ "smax z31.s, p2/M, z31.s, z14.s\n"
+ "st1b { z27.s }, p1, [x24, x28]\n"
+ "mov z27.s, z22.s[3]\n"
+ "st1b { z28.s }, p1, [x23, x28]\n"
+ "mov z28.s, z23.s[0]\n"
+ "add z24.s, z24.s, z20.s\n"
+ "st1b { z29.s }, p1, [x22, x28]\n"
+ "mov z29.s, z23.s[1]\n"
+ "add z25.s, z25.s, z20.s\n"
+ "st1b { z30.s }, p1, [x21, x28]\n"
+ "mov z30.s, z23.s[2]\n"
+ "add z26.s, z26.s, z20.s\n"
+ "st1b { z31.s }, p1, [x20, x28]\n"
+ "mov z31.s, z23.s[3]\n"
+ "incw x28\n"
+ "add z27.s, z27.s, z20.s\n"
+ "add z28.s, z28.s, z20.s\n"
+ "add z29.s, z29.s, z20.s\n"
+ "add z30.s, z30.s, z20.s\n"
+ "add z31.s, z31.s, z20.s\n"
+ "b.any 1b\n"
+ : [params] "+&r" (params)
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_output_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..0f1030c0d7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t> // strategy: u8 input, s8 weights, u8 output, s32 accumulator
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3; // 3x3 filter window
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1; // unit stride in both dimensions
+ constexpr static unsigned int stride_cols = 1;
+
+ sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} // 2x2 output tile, 3x3 kernel, stride 1 - mirrors the constants above
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } // backing implementation is SVE assembly
+
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; // hand-written assembly entry point declared above
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; } // reports two vector lengths of accumulator depth - presumably matches the paired vector loads in the impl; confirm against generic.cpp
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..887eccf1e9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,410 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( // depthfirst 3x3 stride-1 depthwise conv: u8 activations x s8 weights -> requantized u8 output
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs, // one pointer per input element position (re-ordered into Params below)
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp, // zero-point offsets, c_offset and clamp bounds read by the asm
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs // one pointer per element of the 2x2 output tile
+)
+{
+ struct Params // aggregates every value the assembly reads via [%x[params]] + offsetof()
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16]; // input pointers, permuted into the kernel's access order
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5]; // permutation is fixed by the (generated) assembly's load schedule
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__( // generated, vector-length-agnostic SVE; all state passed via the Params block
+ "mov x16, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x16\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x15, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z12.b }, p4/Z, [x21]\n"
+ "ld1rb { z30.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z24.h }, p4/Z, [x22]\n"
+ "ld1rh { z11.h }, p4/Z, [x21]\n"
+ "ld1rh { z26.h }, p4/Z, [x20]\n"
+ "ldp x13, x12, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x16, x15\n"
+ "ldp x11, x10, [x24, #0x10]\n"
+ "whilelt p2.s, x16, x15\n"
+ "whilelt p1.s, x23, x15\n"
+ "ldr x9, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n"
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "add x28, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x27, #0x0\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ "ld1w { z16.s }, p1/Z, [x9, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x9, x9, #2\n"
+ "mov z17.d, z5.d\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z25.d, z9.d\n"
+ "mov z16.d, z5.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z23.d, z9.d\n"
+ "mov z22.d, z5.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z27.d, z9.d\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ "ldr x26, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x9, [%x[params], %[offsetof_Params_bias]]\n" // NOTE(review): writes the advanced bias pointer back through the const Params object; it is re-read at offsetof_Params_bias inside the loop - confirm this in-place update is intended
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "1:" // Loop - iterates while whilelt predicates over n_channels (x15) remain active
+ ".inst 0x44824005 // smlalb z5.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824409 // smlalt z9.s, p4/M, z0.h, z2.h\n"
+ "ldr x20, [x28, #0x28]\n"
+ "ldr x21, [x28, #0x38]\n"
+ ".inst 0x448e43a5 // smlalb z5.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x44864011 // smlalb z17.s, p4/M, z0.h, z6.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x30]\n"
+ ".inst 0x44954010 // smlalb z16.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x448e4016 // smlalb z22.s, p4/M, z0.h, z14.h\n"
+ "ld1b { z31.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ ".inst 0x448e47a9 // smlalt z9.s, p4/M, z29.h, z14.h\n"
+ ".inst 0x449241a5 // smlalb z5.s, p4/M, z13.h, z18.h\n"
+ "ldr x21, [x28, #0x40]\n"
+ "ld1b { z15.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44864419 // smlalt z25.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ "ldr x20, [x28, #0x48]\n"
+ ".inst 0x448e441b // smlalt z27.s, p4/M, z0.h, z14.h\n"
+ ".inst 0x44814091 // smlalb z17.s, p4/M, z4.h, z1.h\n"
+ "ld1b { z19.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x454c19ef // usublb z15.h, z15.b, z12.b\n"
+ ".inst 0x448141b0 // smlalb z16.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x449541b6 // smlalb z22.s, p4/M, z13.h, z21.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1a73 // usublb z19.h, z19.b, z12.b\n"
+ ".inst 0x449245a9 // smlalt z9.s, p4/M, z13.h, z18.h\n"
+ ".inst 0x448a4285 // smlalb z5.s, p4/M, z20.h, z10.h\n"
+ "ldr x21, [x28, #0x50]\n"
+ "ldr x20, [x28, #0x58]\n"
+ ".inst 0x44814499 // smlalt z25.s, p4/M, z4.h, z1.h\n"
+ ".inst 0x448145b7 // smlalt z23.s, p4/M, z13.h, z1.h\n"
+ ".inst 0x454c1b9c // usublb z28.h, z28.b, z12.b\n"
+ "ld1b { z4.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x449545bb // smlalt z27.s, p4/M, z13.h, z21.h\n"
+ ".inst 0x448241b1 // smlalb z17.s, p4/M, z13.h, z2.h\n"
+ "ld1b { z29.h }, p3/Z, [x20, x16]\n"
+ "ldr x21, [x28, #0x60]\n"
+ ".inst 0x44874070 // smlalb z16.s, p4/M, z3.h, z7.h\n"
+ ".inst 0x44864296 // smlalb z22.s, p4/M, z20.h, z6.h\n"
+ "ldr x20, [x28, #0x68]\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x448a4689 // smlalt z9.s, p4/M, z20.h, z10.h\n"
+ ".inst 0x449543e5 // smlalb z5.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ "ld1b { z0.h }, p3/Z, [x21, x16]\n"
+ ".inst 0x448245b9 // smlalt z25.s, p4/M, z13.h, z2.h\n"
+ ".inst 0x44874477 // smlalt z23.s, p4/M, z3.h, z7.h\n"
+ "ld1b { z3.h }, p3/Z, [x20, x16]\n"
+ "ldr x20, [x28, #0x70]\n"
+ ".inst 0x4486469b // smlalt z27.s, p4/M, z20.h, z6.h\n"
+ ".inst 0x44874291 // smlalb z17.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ "ld1b { z13.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x44824290 // smlalb z16.s, p4/M, z20.h, z2.h\n"
+ ".inst 0x448841f6 // smlalb z22.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x454c1863 // usublb z3.h, z3.b, z12.b\n"
+ "ldr x20, [x28, #0x78]\n"
+ ".inst 0x449547e9 // smlalt z9.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x44814265 // smlalb z5.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ "whilelt p0.h, x27, x15\n"
+ ".inst 0x44874699 // smlalt z25.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x44824697 // smlalt z23.s, p4/M, z20.h, z2.h\n"
+ "ld1w { z20.s }, p2/Z, [x26]\n"
+ "inch x14\n"
+ ".inst 0x448845fb // smlalt z27.s, p4/M, z15.h, z8.h\n"
+ ".inst 0x448e43f1 // smlalb z17.s, p4/M, z31.h, z14.h\n"
+ "ld1w { z15.s }, p1/Z, [x26, #1, MUL VL]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44924390 // smlalb z16.s, p4/M, z28.h, z18.h\n"
+ ".inst 0x44824396 // smlalb z22.s, p4/M, z28.h, z2.h\n"
+ "addvl x26, x26, #2\n"
+ ".inst 0x44814669 // smlalt z9.s, p4/M, z19.h, z1.h\n"
+ ".inst 0x44884385 // smlalb z5.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x448e47f9 // smlalt z25.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x44924797 // smlalt z23.s, p4/M, z28.h, z18.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x454c1bff // usublb z31.h, z31.b, z12.b\n"
+ ".inst 0x4482479b // smlalt z27.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x44954271 // smlalb z17.s, p4/M, z19.h, z21.h\n"
+ "uzp1 z2.s, z20.s, z15.s\n"
+ "inch x16\n"
+ ".inst 0x448e4090 // smlalb z16.s, p4/M, z4.h, z14.h\n"
+ ".inst 0x448143b6 // smlalb z22.s, p4/M, z29.h, z1.h\n"
+ "uzp2 z15.s, z20.s, z15.s\n"
+ "ld1w { z20.s }, p2/Z, [x25]\n"
+ ".inst 0x44884789 // smlalt z9.s, p4/M, z28.h, z8.h\n"
+ ".inst 0x44864085 // smlalb z5.s, p4/M, z4.h, z6.h\n"
+ "mov x20, x16\n"
+ "incw x20\n"
+ ".inst 0x44954679 // smlalt z25.s, p4/M, z19.h, z21.h\n"
+ ".inst 0x448e4497 // smlalt z23.s, p4/M, z4.h, z14.h\n"
+ "ld1w { z19.s }, p1/Z, [x25, #1, MUL VL]\n"
+ "uzp1 z21.s, z20.s, z19.s\n"
+ ".inst 0x448147bb // smlalt z27.s, p4/M, z29.h, z1.h\n"
+ ".inst 0x448a4391 // smlalb z17.s, p4/M, z28.h, z10.h\n"
+ "uzp2 z1.s, z20.s, z19.s\n"
+ "whilelt p2.s, x16, x15\n"
+ ".inst 0x44864010 // smlalb z16.s, p4/M, z0.h, z6.h\n"
+ ".inst 0x44924076 // smlalb z22.s, p4/M, z3.h, z18.h\n"
+ "whilelt p1.s, x20, x15\n"
+ "whilelt p3.h, x16, x15\n"
+ ".inst 0x44864489 // smlalt z9.s, p4/M, z4.h, z6.h\n"
+ ".inst 0x44874005 // smlalb z5.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x04a274a5 // sqrdmulh z5.s, z5.s, z2.s\n" // requantize: fixed-point multiply, then rounding-shift (srshl) below
+ "addvl x25, x25, #2\n"
+ ".inst 0x448a4799 // smlalt z25.s, p4/M, z28.h, z10.h\n"
+ ".inst 0x44864417 // smlalt z23.s, p4/M, z0.h, z6.h\n"
+ "and z19.d, z5.d, z21.d\n"
+ ".inst 0x4492447b // smlalt z27.s, p4/M, z3.h, z18.h\n"
+ ".inst 0x449243b1 // smlalb z17.s, p4/M, z29.h, z18.h\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ ".inst 0x448a41b0 // smlalb z16.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448741b6 // smlalb z22.s, p4/M, z13.h, z7.h\n"
+ "sqadd z5.s, z5.s, z19.s\n"
+ ".inst 0x448292a5 // srshl z5.s, p4/M, z5.s, z21.s\n"
+ ".inst 0x44874409 // smlalt z9.s, p4/M, z0.h, z7.h\n"
+ ".inst 0x449247b9 // smlalt z25.s, p4/M, z29.h, z18.h\n"
+ ".inst 0x04af7529 // sqrdmulh z9.s, z9.s, z15.s\n"
+ ".inst 0x448a45b7 // smlalt z23.s, p4/M, z13.h, z10.h\n"
+ ".inst 0x448745bb // smlalt z27.s, p4/M, z13.h, z7.h\n"
+ "and z29.d, z9.d, z1.d\n"
+ ".inst 0x44884071 // smlalb z17.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x448843f0 // smlalb z16.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x448a43f6 // smlalb z22.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x44884479 // smlalt z25.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x04a27610 // sqrdmulh z16.s, z16.s, z2.s\n"
+ ".inst 0x448847f7 // smlalt z23.s, p4/M, z31.h, z8.h\n"
+ ".inst 0x448a47fb // smlalt z27.s, p4/M, z31.h, z10.h\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ "asr z29.s, z29.s, #0x1f\n"
+ "and z18.d, z17.d, z21.d\n"
+ ".inst 0x04af7739 // sqrdmulh z25.s, z25.s, z15.s\n"
+ "and z20.d, z16.d, z21.d\n"
+ ".inst 0x04af76f7 // sqrdmulh z23.s, z23.s, z15.s\n"
+ "and z19.d, z22.d, z21.d\n"
+ ".inst 0x04af777b // sqrdmulh z27.s, z27.s, z15.s\n"
+ "sqadd z9.s, z9.s, z29.s\n"
+ ".inst 0x44829029 // srshl z9.s, p4/M, z9.s, z1.s\n"
+ "asr z18.s, z18.s, #0x1f\n"
+ "and z7.d, z25.d, z1.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z6.d, z23.d, z1.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z2.d, z27.d, z1.d\n"
+ "sqadd z17.s, z17.s, z18.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ ".inst 0x448292b1 // srshl z17.s, p4/M, z17.s, z21.s\n"
+ "sqadd z16.s, z16.s, z20.s\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ ".inst 0x448292b0 // srshl z16.s, p4/M, z16.s, z21.s\n"
+ "sqadd z22.s, z22.s, z19.s\n"
+ "asr z2.s, z2.s, #0x1f\n"
+ ".inst 0x448292b6 // srshl z22.s, p4/M, z22.s, z21.s\n"
+ "sqadd z25.s, z25.s, z7.s\n"
+ "sqadd z23.s, z23.s, z6.s\n"
+ ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
+ ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
+ "sqadd z27.s, z27.s, z2.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n" // saturating-narrow the 32-bit results to 16-bit halves
+ ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
+ ".inst 0x45304231 // sqxtnb z17.h, z17.s\n"
+ ".inst 0x45304210 // sqxtnb z16.h, z16.s\n"
+ ".inst 0x453042d6 // sqxtnb z22.h, z22.s\n"
+ ".inst 0x45304525 // sqxtnt z5.h, z9.s\n"
+ ".inst 0x45304731 // sqxtnt z17.h, z25.s\n"
+ ".inst 0x453046f0 // sqxtnt z16.h, z23.s\n"
+ ".inst 0x45304776 // sqxtnt z22.h, z27.s\n"
+ "sqadd z5.h, z5.h, z24.h\n" // add c_offset (z24), then clamp to [minval (z11), maxval (z26)]
+ "smax z5.h, p4/M, z5.h, z11.h\n"
+ "smin z5.h, p4/M, z5.h, z26.h\n"
+ "sqadd z17.h, z17.h, z24.h\n"
+ "sqadd z16.h, z16.h, z24.h\n"
+ "smax z17.h, p4/M, z17.h, z11.h\n"
+ "smax z16.h, p4/M, z16.h, z11.h\n"
+ "sqadd z22.h, z22.h, z24.h\n"
+ "smax z22.h, p4/M, z22.h, z11.h\n"
+ "smin z17.h, p4/M, z17.h, z26.h\n"
+ "st1b { z5.h }, p0, [x13, x27]\n" // store the four outputs of the 2x2 tile as bytes
+ "smin z16.h, p4/M, z16.h, z26.h\n"
+ "smin z22.h, p4/M, z22.h, z26.h\n"
+ "st1b { z17.h }, p0, [x12, x27]\n"
+ "st1b { z16.h }, p0, [x11, x27]\n"
+ "st1b { z22.h }, p0, [x10, x27]\n"
+ "ld1sb { z14.h }, p4/Z, [x14]\n" // re-load weights, bias and inputs for the next channel block
+ "ld1sb { z21.h }, p4/Z, [x14, #1, MUL VL]\n"
+ "inch x27\n"
+ "ld1sb { z1.h }, p4/Z, [x14, #2, MUL VL]\n"
+ "ld1sb { z6.h }, p4/Z, [x14, #3, MUL VL]\n"
+ ".inst 0x455e11ce // ssublb z14.h, z14.b, z30.b\n"
+ ".inst 0x455e12b5 // ssublb z21.h, z21.b, z30.b\n"
+ "ld1sb { z2.h }, p4/Z, [x14, #4, MUL VL]\n"
+ "ld1sb { z18.h }, p4/Z, [x14, #5, MUL VL]\n"
+ ".inst 0x455e1021 // ssublb z1.h, z1.b, z30.b\n"
+ ".inst 0x455e10c6 // ssublb z6.h, z6.b, z30.b\n"
+ "ld1sb { z7.h }, p4/Z, [x14, #6, MUL VL]\n"
+ "ld1sb { z10.h }, p4/Z, [x14, #7, MUL VL]\n"
+ "inch x14, ALL, MUL #8\n"
+ ".inst 0x455e1042 // ssublb z2.h, z2.b, z30.b\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z5.s, z17.s, z16.s\n"
+ "uzp2 z9.s, z17.s, z16.s\n"
+ "ld1sb { z8.h }, p4/Z, [x14]\n"
+ "ldp x24, x23, [x28, #0x0]\n"
+ "addvl x21, x21, #2\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x22, x21, [x28, #0x10]\n"
+ "ldr x20, [x28, #0x20]\n"
+ "mov z17.d, z5.d\n"
+ "mov z25.d, z9.d\n"
+ "ld1b { z0.h }, p3/Z, [x24, x16]\n"
+ "ld1b { z29.h }, p3/Z, [x23, x16]\n"
+ "mov z16.d, z5.d\n"
+ "mov z23.d, z9.d\n"
+ "ld1b { z4.h }, p3/Z, [x22, x16]\n"
+ "ld1b { z13.h }, p3/Z, [x21, x16]\n"
+ "mov z22.d, z5.d\n"
+ "mov z27.d, z9.d\n"
+ "ld1b { z20.h }, p3/Z, [x20, x16]\n"
+ ".inst 0x455e1252 // ssublb z18.h, z18.b, z30.b\n"
+ ".inst 0x455e10e7 // ssublb z7.h, z7.b, z30.b\n"
+ ".inst 0x455e114a // ssublb z10.h, z10.b, z30.b\n"
+ ".inst 0x455e1108 // ssublb z8.h, z8.b, z30.b\n"
+ ".inst 0x454c1800 // usublb z0.h, z0.b, z12.b\n"
+ ".inst 0x454c1bbd // usublb z29.h, z29.b, z12.b\n"
+ ".inst 0x454c1884 // usublb z4.h, z4.b, z12.b\n"
+ ".inst 0x454c19ad // usublb z13.h, z13.b, z12.b\n"
+ ".inst 0x454c1a94 // usublb z20.h, z20.b, z12.b\n"
+ "b.any 1b\n" // keep looping while any channel lane is still active
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..79e3fd5f54
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t> // strategy: u8 input, s8 weights, u8 output, s32 accumulator
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3; // 3x3 filter window
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2; // stride 2 in both dimensions
+ constexpr static unsigned int stride_cols = 2;
+
+ sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {} // 2x2 output tile, 3x3 kernel, stride 2 - mirrors the constants above
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } // backing implementation is SVE assembly
+
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; // hand-written assembly entry point declared above
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; } // reports two vector lengths of accumulator depth - presumably matches the paired vector loads in the impl; confirm against generic.cpp
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..754d06d443
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "ldr x25, [%x[params], %[offsetof_Params_requant]]\n"
+ "ptrue p4.b\n"
+ "ldr x24, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "mov x23, x7\n"
+ "add x21, x25, %[offsetof_Requantize32_a_offset]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x20, x25, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x25, %[offsetof_Requantize32_c_offset]\n"
+ "ld1rb { z26.b }, p4/Z, [x21]\n"
+ "ld1rb { z13.b }, p4/Z, [x20]\n"
+ "add x21, x25, %[offsetof_Requantize32_minval]\n"
+ "add x20, x25, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z19.h }, p4/Z, [x22]\n"
+ "ld1rh { z12.h }, p4/Z, [x21]\n"
+ "ld1rh { z9.h }, p4/Z, [x20]\n"
+ "ldp x16, x15, [x24, #0x0]\n"
+ "incw x23\n"
+ "whilelt p3.h, x7, x8\n"
+ "ldp x14, x13, [x24, #0x10]\n"
+ "whilelt p2.s, x7, x8\n"
+ "whilelt p1.s, x23, x8\n"
+ "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "add x11, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x10, #0x0\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x12]\n"
+ "ld1w { z16.s }, p1/Z, [x12, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x12, x12, #2\n"
+ "mov z18.d, z8.d\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z0.d, z24.d\n"
+ "mov z15.d, z8.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z1.d, z24.d\n"
+ "mov z5.d, z8.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z6.d, z24.d\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ "ldr x9, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ldr x28, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "1:" // Loop
+ ".inst 0x448242a8 // smlalb z8.s, p4/M, z21.h, z2.h\n"
+ "ldr x21, [x11, #0x58]\n"
+ "ldr x20, [x11, #0x78]\n"
+ ".inst 0x448246b8 // smlalt z24.s, p4/M, z21.h, z2.h\n"
+ ".inst 0x449942c8 // smlalb z8.s, p4/M, z22.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z29.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x449742b2 // smlalb z18.s, p4/M, z21.h, z23.h\n"
+ "ldr x21, [x11, #0x60]\n"
+ "ldr x20, [x11, #0x80]\n"
+ ".inst 0x448e42af // smlalb z15.s, p4/M, z21.h, z14.h\n"
+ ".inst 0x449942a5 // smlalb z5.s, p4/M, z21.h, z25.h\n"
+ ".inst 0x449946d8 // smlalt z24.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x449e4168 // smlalb z8.s, p4/M, z11.h, z30.h\n"
+ "ld1b { z22.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x449746a0 // smlalt z0.s, p4/M, z21.h, z23.h\n"
+ ".inst 0x448e46a1 // smlalt z1.s, p4/M, z21.h, z14.h\n"
+ "ldr x21, [x11, #0x68]\n"
+ ".inst 0x449946a6 // smlalt z6.s, p4/M, z21.h, z25.h\n"
+ "ld1b { z21.h }, p3/Z, [x20, x7]\n"
+ "ldr x20, [x11, #0x88]\n"
+ ".inst 0x449e4292 // smlalb z18.s, p4/M, z20.h, z30.h\n"
+ ".inst 0x4484422f // smlalb z15.s, p4/M, z17.h, z4.h\n"
+ ".inst 0x448a43a5 // smlalb z5.s, p4/M, z29.h, z10.h\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ "ldr x22, [x11, #0x40]\n"
+ ".inst 0x449e4578 // smlalt z24.s, p4/M, z11.h, z30.h\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x44844388 // smlalb z8.s, p4/M, z28.h, z4.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x449e4680 // smlalt z0.s, p4/M, z20.h, z30.h\n"
+ "ld1b { z20.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844621 // smlalt z1.s, p4/M, z17.h, z4.h\n"
+ "ldr x21, [x11, #0x70]\n"
+ ".inst 0x448a47a6 // smlalt z6.s, p4/M, z29.h, z10.h\n"
+ "ldr x20, [x11, #0x98]\n"
+ ".inst 0x448e4372 // smlalb z18.s, p4/M, z27.h, z14.h\n"
+ "ldr x23, [x11, #0x50]\n"
+ ".inst 0x449942cf // smlalb z15.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e42a5 // smlalb z5.s, p4/M, z21.h, z30.h\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ "ld1b { z17.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x44844798 // smlalt z24.s, p4/M, z28.h, z4.h\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x448a4208 // smlalb z8.s, p4/M, z16.h, z10.h\n"
+ "ld1b { z29.h }, p3/Z, [x21, x7]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448e4760 // smlalt z0.s, p4/M, z27.h, z14.h\n"
+ "ldr x22, [x11, #0x48]\n"
+ ".inst 0x449946c1 // smlalt z1.s, p4/M, z22.h, z25.h\n"
+ ".inst 0x449e46a6 // smlalt z6.s, p4/M, z21.h, z30.h\n"
+ "ldr x21, [x11, #0x90]\n"
+ "ldr x20, [x11, #0xa8]\n"
+ ".inst 0x449943f2 // smlalb z18.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x448a416f // smlalb z15.s, p4/M, z11.h, z10.h\n"
+ ".inst 0x44834285 // smlalb z5.s, p4/M, z20.h, z3.h\n"
+ ".inst 0x455a1a31 // usublb z17.h, z17.b, z26.b\n"
+ ".inst 0x448a4618 // smlalt z24.s, p4/M, z16.h, z10.h\n"
+ ".inst 0x455a1bbd // usublb z29.h, z29.b, z26.b\n"
+ ".inst 0x448e43e8 // smlalb z8.s, p4/M, z31.h, z14.h\n"
+ "ld1b { z16.h }, p3/Z, [x22, x7]\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x449947e0 // smlalt z0.s, p4/M, z31.h, z25.h\n"
+ "ld1b { z25.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x448a4561 // smlalt z1.s, p4/M, z11.h, z10.h\n"
+ "ld1b { z11.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x44834686 // smlalt z6.s, p4/M, z20.h, z3.h\n"
+ "ldr x21, [x11, #0xa0]\n"
+ "ldr x20, [x11, #0xb0]\n"
+ ".inst 0x448a4232 // smlalb z18.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e43af // smlalb z15.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x448e4385 // smlalb z5.s, p4/M, z28.h, z14.h\n"
+ ".inst 0x448e47f8 // smlalt z24.s, p4/M, z31.h, z14.h\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ "ld1b { z20.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x44834368 // smlalb z8.s, p4/M, z27.h, z3.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x448a4620 // smlalt z0.s, p4/M, z17.h, z10.h\n"
+ ".inst 0x449e47a1 // smlalt z1.s, p4/M, z29.h, z30.h\n"
+ ".inst 0x448e4786 // smlalt z6.s, p4/M, z28.h, z14.h\n"
+ "ldr x20, [x11, #0xb8]\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x44834212 // smlalb z18.s, p4/M, z16.h, z3.h\n"
+ ".inst 0x4497432f // smlalb z15.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "ld1b { z30.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x44844165 // smlalb z5.s, p4/M, z11.h, z4.h\n"
+ ".inst 0x44834778 // smlalt z24.s, p4/M, z27.h, z3.h\n"
+ "ldr x20, [x11, #0xc0]\n"
+ "ld1w { z17.s }, p2/Z, [x9]\n"
+ ".inst 0x449742c8 // smlalb z8.s, p4/M, z22.h, z23.h\n"
+ ".inst 0x44834600 // smlalt z0.s, p4/M, z16.h, z3.h\n"
+ "ld1w { z14.s }, p1/Z, [x9, #1, MUL VL]\n"
+ ".inst 0x455a1bde // usublb z30.h, z30.b, z26.b\n"
+ ".inst 0x44974721 // smlalt z1.s, p4/M, z25.h, z23.h\n"
+ ".inst 0x44844566 // smlalt z6.s, p4/M, z11.h, z4.h\n"
+ "ld1b { z25.h }, p3/Z, [x20, x7]\n"
+ "uzp1 z10.s, z17.s, z14.s\n"
+ ".inst 0x44844372 // smlalb z18.s, p4/M, z27.h, z4.h\n"
+ ".inst 0x4487428f // smlalb z15.s, p4/M, z20.h, z7.h\n"
+ "uzp2 z14.s, z17.s, z14.s\n"
+ "ld1w { z17.s }, p2/Z, [x28]\n"
+ ".inst 0x448743e5 // smlalb z5.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x449746d8 // smlalt z24.s, p4/M, z22.h, z23.h\n"
+ "ld1w { z16.s }, p1/Z, [x28, #1, MUL VL]\n"
+ ".inst 0x455a1b39 // usublb z25.h, z25.b, z26.b\n"
+ ".inst 0x448743a8 // smlalb z8.s, p4/M, z29.h, z7.h\n"
+ ".inst 0x44844760 // smlalt z0.s, p4/M, z27.h, z4.h\n"
+ "uzp1 z4.s, z17.s, z16.s\n"
+ "inch x7\n"
+ ".inst 0x44874681 // smlalt z1.s, p4/M, z20.h, z7.h\n"
+ ".inst 0x448747e6 // smlalt z6.s, p4/M, z31.h, z7.h\n"
+ ".inst 0x04aa7508 // sqrdmulh z8.s, z8.s, z10.s\n"
+ "whilelt p0.h, x10, x8\n"
+ ".inst 0x448742b2 // smlalb z18.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x4483416f // smlalb z15.s, p4/M, z11.h, z3.h\n"
+ "uzp2 z22.s, z17.s, z16.s\n"
+ "mov x20, x7\n"
+ ".inst 0x449743c5 // smlalb z5.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x448747b8 // smlalt z24.s, p4/M, z29.h, z7.h\n"
+ "and z17.d, z8.d, z4.d\n"
+ "inch x17\n"
+ ".inst 0x448746a0 // smlalt z0.s, p4/M, z21.h, z7.h\n"
+ ".inst 0x44834561 // smlalt z1.s, p4/M, z11.h, z3.h\n"
+ ".inst 0x04ae7718 // sqrdmulh z24.s, z24.s, z14.s\n"
+ "incw x20\n"
+ ".inst 0x449747c6 // smlalt z6.s, p4/M, z30.h, z23.h\n"
+ ".inst 0x44824392 // smlalb z18.s, p4/M, z28.h, z2.h\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ "whilelt p2.s, x7, x8\n"
+ ".inst 0x448243cf // smlalb z15.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x44824325 // smlalb z5.s, p4/M, z25.h, z2.h\n"
+ "and z16.d, z24.d, z22.d\n"
+ "whilelt p1.s, x20, x8\n"
+ ".inst 0x44824780 // smlalt z0.s, p4/M, z28.h, z2.h\n"
+ ".inst 0x448247c1 // smlalt z1.s, p4/M, z30.h, z2.h\n"
+ ".inst 0x04aa7652 // sqrdmulh z18.s, z18.s, z10.s\n"
+ "ldr x20, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44824726 // smlalt z6.s, p4/M, z25.h, z2.h\n"
+ ".inst 0x04aa75ef // sqrdmulh z15.s, z15.s, z10.s\n"
+ "whilelt p3.h, x7, x8\n"
+ "addvl x9, x9, #2\n"
+ ".inst 0x04aa74a5 // sqrdmulh z5.s, z5.s, z10.s\n"
+ "sqadd z8.s, z8.s, z17.s\n"
+ ".inst 0x44829088 // srshl z8.s, p4/M, z8.s, z4.s\n"
+ "addvl x28, x28, #2\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ "and z21.d, z18.d, z4.d\n"
+ ".inst 0x04ae7400 // sqrdmulh z0.s, z0.s, z14.s\n"
+ "and z20.d, z15.d, z4.d\n"
+ ".inst 0x04ae7421 // sqrdmulh z1.s, z1.s, z14.s\n"
+ "and z28.d, z5.d, z4.d\n"
+ ".inst 0x04ae74c6 // sqrdmulh z6.s, z6.s, z14.s\n"
+ "sqadd z24.s, z24.s, z16.s\n"
+ ".inst 0x448292d8 // srshl z24.s, p4/M, z24.s, z22.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ "and z25.d, z0.d, z22.d\n"
+ "asr z20.s, z20.s, #0x1f\n"
+ "and z17.d, z1.d, z22.d\n"
+ "asr z28.s, z28.s, #0x1f\n"
+ "and z16.d, z6.d, z22.d\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "asr z25.s, z25.s, #0x1f\n"
+ ".inst 0x44829092 // srshl z18.s, p4/M, z18.s, z4.s\n"
+ "sqadd z15.s, z15.s, z20.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x4482908f // srshl z15.s, p4/M, z15.s, z4.s\n"
+ "sqadd z5.s, z5.s, z28.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x44829085 // srshl z5.s, p4/M, z5.s, z4.s\n"
+ "sqadd z0.s, z0.s, z25.s\n"
+ "sqadd z1.s, z1.s, z17.s\n"
+ ".inst 0x448292c0 // srshl z0.s, p4/M, z0.s, z22.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ "sqadd z6.s, z6.s, z16.s\n"
+ ".inst 0x45304108 // sqxtnb z8.h, z8.s\n"
+ ".inst 0x448292c6 // srshl z6.s, p4/M, z6.s, z22.s\n"
+ ".inst 0x45304252 // sqxtnb z18.h, z18.s\n"
+ ".inst 0x453041ef // sqxtnb z15.h, z15.s\n"
+ ".inst 0x453040a5 // sqxtnb z5.h, z5.s\n"
+ ".inst 0x45304708 // sqxtnt z8.h, z24.s\n"
+ ".inst 0x45304412 // sqxtnt z18.h, z0.s\n"
+ ".inst 0x4530442f // sqxtnt z15.h, z1.s\n"
+ ".inst 0x453044c5 // sqxtnt z5.h, z6.s\n"
+ "sqadd z8.h, z8.h, z19.h\n"
+ "smax z8.h, p4/M, z8.h, z12.h\n"
+ "smin z8.h, p4/M, z8.h, z9.h\n"
+ "sqadd z18.h, z18.h, z19.h\n"
+ "sqadd z15.h, z15.h, z19.h\n"
+ "smax z18.h, p4/M, z18.h, z12.h\n"
+ "smax z15.h, p4/M, z15.h, z12.h\n"
+ "sqadd z5.h, z5.h, z19.h\n"
+ "smax z5.h, p4/M, z5.h, z12.h\n"
+ "smin z18.h, p4/M, z18.h, z9.h\n"
+ "st1b { z8.h }, p0, [x16, x10]\n"
+ "smin z15.h, p4/M, z15.h, z9.h\n"
+ "smin z5.h, p4/M, z5.h, z9.h\n"
+ "st1b { z18.h }, p0, [x15, x10]\n"
+ "st1b { z15.h }, p0, [x14, x10]\n"
+ "st1b { z5.h }, p0, [x13, x10]\n"
+ "ld1sb { z25.h }, p4/Z, [x17]\n"
+ "ld1sb { z30.h }, p4/Z, [x17, #1, MUL VL]\n"
+ "inch x10\n"
+ "ld1sb { z14.h }, p4/Z, [x17, #2, MUL VL]\n"
+ "ld1sb { z4.h }, p4/Z, [x17, #3, MUL VL]\n"
+ ".inst 0x454d1339 // ssublb z25.h, z25.b, z13.b\n"
+ ".inst 0x454d13de // ssublb z30.h, z30.b, z13.b\n"
+ "ld1sb { z10.h }, p4/Z, [x17, #4, MUL VL]\n"
+ "ld1sb { z3.h }, p4/Z, [x17, #5, MUL VL]\n"
+ ".inst 0x454d11ce // ssublb z14.h, z14.b, z13.b\n"
+ ".inst 0x454d1084 // ssublb z4.h, z4.b, z13.b\n"
+ "ld1sb { z23.h }, p4/Z, [x17, #6, MUL VL]\n"
+ "ld1sb { z7.h }, p4/Z, [x17, #7, MUL VL]\n"
+ "inch x17, ALL, MUL #8\n"
+ ".inst 0x454d114a // ssublb z10.h, z10.b, z13.b\n"
+ "ld1w { z17.s }, p2/Z, [x20]\n"
+ "ld1w { z16.s }, p1/Z, [x20, #1, MUL VL]\n"
+ "uzp1 z8.s, z17.s, z16.s\n"
+ "uzp2 z24.s, z17.s, z16.s\n"
+ "ld1sb { z2.h }, p4/Z, [x17]\n"
+ "ldp x27, x26, [x11, #0x0]\n"
+ "addvl x20, x20, #2\n"
+ "str x20, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldp x25, x24, [x11, #0x10]\n"
+ "ldp x23, x22, [x11, #0x20]\n"
+ "mov z18.d, z8.d\n"
+ "mov z0.d, z24.d\n"
+ "ldp x21, x20, [x11, #0x30]\n"
+ "ld1b { z21.h }, p3/Z, [x27, x7]\n"
+ "mov z15.d, z8.d\n"
+ "mov z1.d, z24.d\n"
+ "ld1b { z22.h }, p3/Z, [x26, x7]\n"
+ "ld1b { z11.h }, p3/Z, [x25, x7]\n"
+ "mov z5.d, z8.d\n"
+ "mov z6.d, z24.d\n"
+ "ld1b { z20.h }, p3/Z, [x24, x7]\n"
+ "ld1b { z27.h }, p3/Z, [x23, x7]\n"
+ ".inst 0x454d1063 // ssublb z3.h, z3.b, z13.b\n"
+ ".inst 0x454d12f7 // ssublb z23.h, z23.b, z13.b\n"
+ "ld1b { z28.h }, p3/Z, [x22, x7]\n"
+ "ld1b { z16.h }, p3/Z, [x21, x7]\n"
+ ".inst 0x454d10e7 // ssublb z7.h, z7.b, z13.b\n"
+ ".inst 0x454d1042 // ssublb z2.h, z2.b, z13.b\n"
+ "ld1b { z31.h }, p3/Z, [x20, x7]\n"
+ ".inst 0x455a1ab5 // usublb z21.h, z21.b, z26.b\n"
+ ".inst 0x455a1ad6 // usublb z22.h, z22.b, z26.b\n"
+ ".inst 0x455a196b // usublb z11.h, z11.b, z26.b\n"
+ ".inst 0x455a1a94 // usublb z20.h, z20.b, z26.b\n"
+ ".inst 0x455a1b7b // usublb z27.h, z27.b, z26.b\n"
+ ".inst 0x455a1b9c // usublb z28.h, z28.b, z26.b\n"
+ ".inst 0x455a1a10 // usublb z16.h, z16.b, z26.b\n"
+ ".inst 0x455a1bff // usublb z31.h, z31.b, z26.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..0ff853ec2d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
+{  // Strategy shim binding the hand-written SVE asm kernel to the depthfirst driver (u8 input, s8 weights, u8 output, s32 accumulators).
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 5;  // 5x5 convolution window
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;  // unit stride in both spatial dimensions
+  constexpr static unsigned int stride_cols = 1;
+
+  sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}  // presumably (out_rows, out_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) — TODO confirm against Parent ctor
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }  // vector length is scalable, decided at runtime
+
+  Parent::KernelType kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;  // points at the inline-asm implementation declared above
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }  // kernel consumes two vectors' worth of channels per iteration
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f24a258484
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+
+namespace arm_conv {
+namespace depthwise {
+
+void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const int8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params  // Flat argument block handed to the inline asm through a single pointer (%x[params]); field layout is ABI-coupled to the asm's offsetof() operands — do not reorder.
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;  // quantisation offsets and clamp bounds (a/b/c offsets, minval, maxval — see asm constraint list)
+ const int32_t *const requant_muls;  // per-channel requantisation multipliers
+ const int32_t *const requant_shifts;  // per-channel requantisation shifts
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];  // 36 input pointers; presumably a 6x6 patch feeding the 2x2 output of a 5x5 stride-1 kernel — TODO confirm
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];  // indices 2-13 are permuted, presumably to match the order the asm loads them — TODO confirm
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];  // indices 14-35 pass through unchanged
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "mov x2, #0x0\n"
+ "mov x24, x2\n"
+ "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "ptrue p4.b\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "incw x24\n"
+ "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
+ "ld1rb { z30.b }, p4/Z, [x21]\n"
+ "ld1rb { z10.b }, p4/Z, [x20]\n"
+ "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x23, %[offsetof_Requantize32_minval]\n"
+ "ld1rh { z15.h }, p4/Z, [x21]\n"
+ "ld1rh { z12.h }, p4/Z, [x20]\n"
+ "add x20, x23, %[offsetof_Requantize32_maxval]\n"
+ "ld1rh { z13.h }, p4/Z, [x20]\n"
+ "ldp x5, x6, [x22, #0x0]\n"
+ "whilelt p3.h, x2, x3\n"
+ "ldp x7, x8, [x22, #0x10]\n"
+ "whilelt p2.s, x2, x3\n"
+ "whilelt p1.s, x24, x3\n"
+ "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
+ "add x17, %x[params], %[offsetof_Params_inptrs]\n"
+ "ld1w { z17.s }, p2/Z, [x10]\n"
+ "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x10, x10, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "mov x16, #0x0\n"
+ "mov z6.d, z14.d\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z18.d, z23.d\n"
+ "mov z9.d, z14.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z20.d, z23.d\n"
+ "mov z7.d, z14.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z1.d, z23.d\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "str x10, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "1:" // Loop
+ ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
+ ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
+ "ldr x20, [x17, #0x58]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
+ ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
+ "ld1b { z5.h }, p3/Z, [x20, x2]\n"
+ "ldr x20, [x17, #0x60]\n"
+ ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
+ ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
+ ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
+ "ld1b { z22.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
+ ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
+ "ldr x20, [x17, #0x68]\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
+ ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z26.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
+ ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ "ldr x20, [x17, #0x70]\n"
+ ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
+ ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
+ ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
+ "inch x4, ALL, MUL #8\n"
+ "ld1b { z8.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
+ ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ "ldr x20, [x17, #0x78]\n"
+ ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
+ ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
+ "ld1b { z31.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
+ ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
+ "ldr x22, [x17, #0x80]\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ "ldr x21, [x17, #0x88]\n"
+ ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
+ ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ "ldr x20, [x17, #0x90]\n"
+ ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
+ ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
+ "ldr x23, [x17, #0x98]\n"
+ "ldr x22, [x17, #0xa0]\n"
+ ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
+ ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
+ "ld1b { z11.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
+ ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
+ ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
+ "ld1b { z17.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
+ "ldr x20, [x17, #0xa8]\n"
+ "ldr x21, [x17, #0xb0]\n"
+ ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
+ "ldr x13, [x17, #0xb8]\n"
+ "ldr x12, [x17, #0xc0]\n"
+ ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
+ ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
+ "ld1b { z3.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
+ ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
+ "ldr x11, [x17, #0xc8]\n"
+ "ldr x10, [x17, #0xd0]\n"
+ ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
+ ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
+ "ldr x9, [x17, #0xd8]\n"
+ "ldr x28, [x17, #0xe0]\n"
+ ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
+ ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
+ "ld1b { z4.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
+ ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
+ ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
+ "ldr x27, [x17, #0xe8]\n"
+ "ldr x26, [x17, #0xf0]\n"
+ ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
+ "ldr x25, [x17, #0xf8]\n"
+ "ldr x24, [x17, #0x100]\n"
+ ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
+ ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
+ "ld1b { z27.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
+ ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
+ "ld1sb { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a13bd // ssublb z29.h, z29.b, z10.b\n"
+ ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
+ "ldr x23, [x17, #0x108]\n"
+ "ldr x22, [x17, #0x110]\n"
+ ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
+ ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
+ "ldr x20, [x17, #0x118]\n"
+ "whilelt p0.h, x16, x3\n"
+ ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
+ ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
+ "ld1b { z5.h }, p3/Z, [x21, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
+ ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
+ ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
+ ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
+ "ld1b { z28.h }, p3/Z, [x13, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
+ ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4]\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
+ ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
+ "ld1b { z16.h }, p3/Z, [x12, x2]\n"
+ ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
+ ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
+ ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
+ ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
+ "ld1b { z26.h }, p3/Z, [x11, x2]\n"
+ ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
+ ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
+ ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
+ ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
+ ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
+ ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
+ "ld1b { z8.h }, p3/Z, [x10, x2]\n"
+ ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
+ ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
+ ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
+ "ld1sb { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
+ ".inst 0x454a12d6 // ssublb z22.h, z22.b, z10.b\n"
+ ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
+ ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
+ ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
+ ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
+ "ld1b { z31.h }, p3/Z, [x9, x2]\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
+ ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
+ "ld1sb { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
+ ".inst 0x454a1339 // ssublb z25.h, z25.b, z10.b\n"
+ ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
+ ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
+ ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
+ ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
+ "ld1b { z0.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
+ ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
+ "ld1sb { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
+ ".inst 0x454a1042 // ssublb z2.h, z2.b, z10.b\n"
+ ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
+ ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
+ ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
+ "ld1b { z17.h }, p3/Z, [x27, x2]\n"
+ ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
+ ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
+ ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
+ "ld1sb { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
+ ".inst 0x454a116b // ssublb z11.h, z11.b, z10.b\n"
+ ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
+ ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
+ "ld1b { z29.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
+ ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
+ ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
+ ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
+ "ld1b { z3.h }, p3/Z, [x25, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
+ ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
+ "ld1sb { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
+ ".inst 0x454a1318 // ssublb z24.h, z24.b, z10.b\n"
+ ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
+ ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
+ ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
+ ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
+ ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
+ "ld1sb { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
+ "inch x4, ALL, MUL #8\n"
+ ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
+ ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x454a1273 // ssublb z19.h, z19.b, z10.b\n"
+ ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
+ ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
+ ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
+ "ld1b { z27.h }, p3/Z, [x23, x2]\n"
+ ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
+ ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
+ ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
+ "ld1sb { z21.h }, p4/Z, [x4]\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
+ ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
+ "inch x4\n"
+ ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
+ ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
+ ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
+ "ld1b { z5.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
+ ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
+ ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
+ "ld1w { z22.s }, p2/Z, [x15]\n"
+ ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
+ ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
+ ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
+ "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
+ "addvl x15, x15, #2\n"
+ ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
+ ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
+ ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
+ "uzp1 z25.s, z22.s, z16.s\n"
+ "inch x2\n"
+ ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
+ ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
+ "uzp2 z16.s, z22.s, z16.s\n"
+ "ld1w { z22.s }, p2/Z, [x14]\n"
+ ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
+ ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
+ "mov x20, x2\n"
+ "incw x20\n"
+ ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
+ ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
+ "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
+ "uzp1 z29.s, z22.s, z26.s\n"
+ ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
+ ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
+ "uzp2 z22.s, z22.s, z26.s\n"
+ "whilelt p2.s, x2, x3\n"
+ ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
+ "whilelt p1.s, x20, x3\n"
+ "whilelt p3.h, x2, x3\n"
+ ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
+ ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
+ "addvl x14, x14, #2\n"
+ ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
+ ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
+ ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
+ ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
+ ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
+ ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
+ ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
+ ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
+ "and z3.d, z14.d, z29.d\n"
+ ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
+ ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
+ "sqadd z14.s, z14.s, z3.s\n"
+ ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
+ ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
+ ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
+ ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
+ ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
+ ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
+ "and z31.d, z23.d, z22.d\n"
+ ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
+ ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
+ ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
+ ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
+ ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
+ ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
+ "asr z31.s, z31.s, #0x1f\n"
+ "and z3.d, z6.d, z29.d\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "and z0.d, z9.d, z29.d\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ "and z19.d, z7.d, z29.d\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "sqadd z23.s, z23.s, z31.s\n"
+ ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
+ "asr z3.s, z3.s, #0x1f\n"
+ "and z21.d, z18.d, z22.d\n"
+ "asr z0.s, z0.s, #0x1f\n"
+ "and z17.d, z20.d, z22.d\n"
+ "asr z19.s, z19.s, #0x1f\n"
+ "and z16.d, z1.d, z22.d\n"
+ "sqadd z6.s, z6.s, z3.s\n"
+ "asr z21.s, z21.s, #0x1f\n"
+ ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
+ "sqadd z9.s, z9.s, z0.s\n"
+ "asr z17.s, z17.s, #0x1f\n"
+ ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
+ "sqadd z7.s, z7.s, z19.s\n"
+ "asr z16.s, z16.s, #0x1f\n"
+ ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
+ "sqadd z18.s, z18.s, z21.s\n"
+ "sqadd z20.s, z20.s, z17.s\n"
+ ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
+ ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
+ "sqadd z1.s, z1.s, z16.s\n"
+ ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
+ ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
+ ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
+ ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
+ ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
+ ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
+ ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
+ ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
+ ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
+ "sqadd z14.h, z14.h, z15.h\n"
+ "smax z14.h, p4/M, z14.h, z12.h\n"
+ "smin z14.h, p4/M, z14.h, z13.h\n"
+ "sqadd z6.h, z6.h, z15.h\n"
+ "sqadd z9.h, z9.h, z15.h\n"
+ "smax z6.h, p4/M, z6.h, z12.h\n"
+ "smax z9.h, p4/M, z9.h, z12.h\n"
+ "sqadd z7.h, z7.h, z15.h\n"
+ "smax z7.h, p4/M, z7.h, z12.h\n"
+ "smin z6.h, p4/M, z6.h, z13.h\n"
+ "st1b { z14.h }, p0, [x5, x16]\n"
+ "smin z9.h, p4/M, z9.h, z13.h\n"
+ "smin z7.h, p4/M, z7.h, z13.h\n"
+ "st1b { z6.h }, p0, [x6, x16]\n"
+ "st1b { z9.h }, p0, [x7, x16]\n"
+ "st1b { z7.h }, p0, [x8, x16]\n"
+ "ld1w { z17.s }, p2/Z, [x21]\n"
+ "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
+ "uzp1 z14.s, z17.s, z16.s\n"
+ "ld1sb { z26.h }, p4/Z, [x4]\n"
+ "ld1sb { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
+ "uzp2 z23.s, z17.s, z16.s\n"
+ "addvl x21, x21, #2\n"
+ "ld1sb { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
+ "ld1sb { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
+ "inch x16\n"
+ "str x21, [%x[params], %[offsetof_Params_bias]]\n"
+ "ld1sb { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
+ "ldp x9, x28, [x17, #0x0]\n"
+ "mov z6.d, z14.d\n"
+ "mov z18.d, z23.d\n"
+ "ldp x27, x26, [x17, #0x10]\n"
+ "ldp x25, x24, [x17, #0x20]\n"
+ "mov z9.d, z14.d\n"
+ "mov z20.d, z23.d\n"
+ "ldp x23, x22, [x17, #0x30]\n"
+ "ldp x21, x20, [x17, #0x40]\n"
+ "mov z7.d, z14.d\n"
+ "mov z1.d, z23.d\n"
+ "ld1b { z22.h }, p3/Z, [x9, x2]\n"
+ "ld1b { z2.h }, p3/Z, [x28, x2]\n"
+ ".inst 0x454a135a // ssublb z26.h, z26.b, z10.b\n"
+ ".inst 0x454a1108 // ssublb z8.h, z8.b, z10.b\n"
+ "ld1b { z11.h }, p3/Z, [x27, x2]\n"
+ "ld1b { z3.h }, p3/Z, [x26, x2]\n"
+ ".inst 0x454a1210 // ssublb z16.h, z16.b, z10.b\n"
+ ".inst 0x454a12b5 // ssublb z21.h, z21.b, z10.b\n"
+ "ld1b { z29.h }, p3/Z, [x25, x2]\n"
+ "ld1b { z4.h }, p3/Z, [x24, x2]\n"
+ ".inst 0x454a1231 // ssublb z17.h, z17.b, z10.b\n"
+ ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
+ "ld1b { z31.h }, p3/Z, [x23, x2]\n"
+ "ld1b { z0.h }, p3/Z, [x22, x2]\n"
+ ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
+ ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
+ "ld1b { z19.h }, p3/Z, [x21, x2]\n"
+ "ld1b { z28.h }, p3/Z, [x20, x2]\n"
+ ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
+ ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
+ ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
+ ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
+ ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
+ ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
+ ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
+ "b.any 1b\n"
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp b/src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp
new file mode 100644
index 0000000000..8a49c775d3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/premultiply.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <premultiply.hpp>
+
+#define CHANNEL_MULTIPLIER 6
+#define BLOCK_SIZE 4
+
void do_premultiply_float_6(const float *in_ptr,
                            const unsigned int ld_row,
                            const unsigned int ld_col,
                            float *out_ptr,
                            const unsigned int out_ld_row,
                            const unsigned int out_ld_col,
                            const unsigned int tile_rows,
                            const unsigned int tile_cols,
                            const unsigned input_channels)
{
  // Replicate each input channel six times (channel multiplier 6), writing
  // the expanded channels contiguously for every point of the tile.
  constexpr unsigned int multiplier = 6;  // == CHANNEL_MULTIPLIER
  constexpr unsigned int block = 4;       // == BLOCK_SIZE

  for (unsigned int tile_i = 0; tile_i < tile_rows; tile_i++)
  {
    for (unsigned int tile_j = 0; tile_j < tile_cols; tile_j++)
    {
      // Address the current tile point in both tensors.
      const float *src = in_ptr + tile_i * ld_row + tile_j * ld_col;
      float *dst = out_ptr + tile_i * out_ld_row + tile_j * out_ld_col;

      // Bulk of the channels, handled in blocks of four: stage a small group
      // of values, then broadcast each one across the multiplier lanes.
      const unsigned int full_blocks = input_channels / block;
      for (unsigned int b = 0; b < full_blocks; b++)
      {
        float staged[block];
        for (unsigned int v = 0; v < block; v++)
        {
          staged[v] = src[v];
        }
        src += block;

        for (unsigned int v = 0; v < block; v++)
        {
          for (unsigned int r = 0; r < multiplier; r++)
          {
            dst[r] = staged[v];
          }
          dst += multiplier;
        }
      }

      // Remaining channels (fewer than one block), broadcast one at a time.
      const unsigned int remainder = input_channels - full_blocks * block;
      for (unsigned int c = 0; c < remainder; c++)
      {
        const float value = src[c];
        for (unsigned int r = 0; r < multiplier; r++)
        {
          dst[r] = value;
        }
        dst += multiplier;
      }
    }
  }
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
new file mode 100644
index 0000000000..9805fd354f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Depthwise kernel drivers commonly require a per-thread blob of working space
+ * in which to store parameters required by the depthwise implementations. The
+ * composition of this working space varies with the driver, kernel, and data
+ * types -- but the tasks of requesting sufficient space, allocating buffer
+ * space, and performing initialisation of the working space are common.
+ *
+ * The classes in this file consist of a number of working space "Elements"
+ * (which are logical units of functionality) and a Workspace type which allows
+ * for compile time composition of elements into a single working space type.
+ *
+ * Creating a workspace
+ * ====================
+ *
+ * A new workspace type can be created by combining Elements as an argument to
+ * the Workspace class. For instance:
+ *
+ * Workspace<
+ * depthwise_depthfirst::InputArrayElement<float>,
+ * InputBufferElement<float>,
+ * OutputArrayElement<float>
+ * >
+ *
+ * Creates a new Workspace consisting of the given elements. The workspace type
+ * contained within this class (`Workspace<...>::WorkspaceType`) is equivalent to:
+ *
+ * struct WorkspaceType
+ * {
+ * const float **inptr_array; // From InputArrayElement<float>
+ * float *input_buffer; // From InputBufferElement<float>
+ * float **outptr_array; // From OutputArrayElement<float>
+ * float *output_buffer; // From OutputArrayElement<float>
+ * };
+ *
+ * Calling `Workspace<...>::get_sizeof_workspace(...)` will return the amount
+ * of space required to store the above struct and the elements contained
+ * within it. Once this space has been allocated, the workspace can be
+ * initialised by calling `Workspace<...>::initialise` with a pointer to the
+ * buffer and the same arguments. This will place a struct of type
+ * `Workspace<...>::WorkspaceType` at the start of the buffer, and share the
+ * remaining space between the specified elements. As this is all done at
+ * compile time, later code can access elements from the `WorkspaceType` by
+ * name.
+ *
+ * Writing a new element
+ * =====================
+ *
+ * Each Element must provide:
+ * - A struct called "Workspace" containing the variables contained within
+ * this portion of the workspace.
+ * - A static method called `get_element_size` which returns the amount of
+ * buffer space required by this element of the workspace (NOT including the
+ * size of the Workspace struct). For example, an element which stores a
+ * vector of pointers will return the amount of space required to store the
+ * vector.
+ * - A static method called `initialise` which accepts a pointer to a struct
+ * which will be composed of the Element's `Workspace` struct (along with
+ * other elements), a pointer to the start of the buffer allocated for this
+ * portion of the workspace, and arguments to be used to initialise the
+ * workspace. The Element should consume as much of the buffer as it
+ * requires, initialise the Workspace, and then return the pointer to the
+ * next free byte of the buffer.
+ *
+ * See the below elements for an example of how this should work.
+ */
+
+#pragma once
+
+#include "depthwise.hpp"
+#include "depthfirst_driver.hpp"
+#include "utils.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+namespace { // anonymous because we expect this to appear in several compilation units
+
+/* Arguments to use to size and initialise a workspace.
+ */
+template <class StratType, class OutputStage=Nothing>
+struct WorkspaceArgs
+{
+ const StratType *strategy;
+ const DepthwiseArgs &depthwise_args;
+ const OutputStage &output_stage;
+
+ WorkspaceArgs(const StratType *strat, const DepthwiseArgs &dwargs, const OutputStage &os = {})
+ : strategy(strat), depthwise_args(dwargs), output_stage(os)
+ {
+ }
+};
+
+
+/* Sometimes we use templated structs to fill in workspace types, the Empty
+ * element can be useful for when a blank element is required for some sets of
+ * parameters.
+ */
+struct EmptyElement
+{
+ struct Workspace {};
+
+ template <class StratType, class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &) { return 0; }
+
+ template <class WorkspaceType, class StratType, class OutputStage>
+ static void *initialise(WorkspaceType *, void *buffer, const WorkspaceArgs<StratType, OutputStage> &)
+ {
+ return buffer;
+ }
+};
+
+
+/* Store fused activations for a kernel.
+ *
+ * Activations are set based on the DepthwiseArgs.
+ */
template <typename T, class OutputStage=Nothing>
class ActivationsElement
{
  public:
  struct Workspace
  {
    // Clamp bounds the kernel applies to fuse the activation function.
    T activation_min, activation_max;
  };

  // The clamps live inside the composed workspace struct itself, so no
  // additional buffer space is required.
  template <typename StratType>
  static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &)
  {
    return 0;
  }

  // Derive the clamp bounds from the requested activation: the default is an
  // unbounded range; ReLU raises the lower bound to zero, and BoundedReLU
  // additionally lowers the upper bound to the activation parameter (note the
  // deliberate case fall-through below).
  template <class WorkspaceType, class StratType>
  static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
  {
    ws->activation_min = static_cast<T>(-std::numeric_limits<float>::infinity());
    ws->activation_max = static_cast<T>(std::numeric_limits<float>::infinity());

    switch (args.depthwise_args.activation.type)
    {
      case arm_gemm::Activation::Type::BoundedReLU:
        ws->activation_max = static_cast<T>(args.depthwise_args.activation.param1);
        // Fall through
      case arm_gemm::Activation::Type::ReLU:
        ws->activation_min = static_cast<T>(0);
        break;
      default:
        break;
    }

    // This element consumes no buffer space.
    return buffer;
  }
};
+
+/* Activation clamps are contained within `arm_gemm::Requantize32`, so if the
+ * output stage is one of these we substitute in an empty workspace element.
+ */
template <typename T>
class ActivationsElement<T, arm_gemm::Requantize32> : public EmptyElement
{
  // Inherits the no-op Workspace/initialise from EmptyElement: quantized
  // kernels read their activation clamps from the Requantize32 output stage
  // (minval/maxval) rather than from the workspace.
};
+
+
+/* Get the value with which to fill an input buffer. This defaults to `0`
+ * (which we return as a `char` since it gets used by `memset`).
+ */
// Default padding fill value for non-quantized kernels: plain zero, returned
// as a `char` because it feeds directly into `memset`.
template <typename OutputStage>
char get_input_buffer_fill_value(const OutputStage &)
{
  return static_cast<char>(0);
}
+
+/* In the case of kernels operating on quantized data, we need to fill the
+ * input buffer with the zero offset of the input tensor.
+ */
// Specialisation for quantized kernels: pad with the input zero-point so that
// padded lanes contribute nothing once the offset is subtracted.
// NOTE(review): a_offset is an int32 implicitly narrowed to char here; this
// assumes the zero-point fits in a byte (true for u8/s8 inputs) -- confirm if
// wider input types are ever added.
template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp) __attribute__ ((unused));
template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp)
{
  return qp.a_offset;
}
+
+
+/* Container for a vector of padding values which can be safely consumed by the
+ * depthwise kernel. The padding values are initialised to either `0` or the
+ * zero offset of the input tensor (if quantized).
+ */
+template <typename T>
+class InputBufferElement
+{
+ public:
+ struct Workspace
+ {
+ T *input_buffer;
+ };
+
+ template <typename StratType, typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ }
+
+ template <class WorkspaceType, typename StratType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ ws->input_buffer = reinterpret_cast<T*>(buffer);
+ memset(ws->input_buffer, get_input_buffer_fill_value(args.output_stage), get_element_size(args));
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+
+/* Container for an array of output pointers, and a buffer which can be used as
+ * a destination for unnecessary writes.
+ */
+template <typename T>
+class OutputArrayElement
+{
+ public:
+ struct Workspace
+ {
+ T **outptr_array;
+ T *output_buffer;
+ };
+
+ template <typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof_outptr_array(args) + sizeof_output_buffer(args);
+ }
+
+ template <class WorkspaceType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ char *buffer_bytes = reinterpret_cast<char *>(buffer);
+
+ ws->outptr_array = reinterpret_cast<T **>(buffer_bytes);
+ buffer_bytes += sizeof_outptr_array(args);
+
+ ws->output_buffer = reinterpret_cast<T *>(buffer_bytes);
+ buffer_bytes += sizeof_output_buffer(args);
+
+ return buffer_bytes;
+ }
+
+ protected:
+ template <typename OutputStage>
+ static size_t sizeof_outptr_array(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof(T **) * args.strategy->get_output_rows() * args.strategy->get_output_cols();
+ }
+
+ template <typename OutputStage>
+ static size_t sizeof_output_buffer(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ }
+};
+
+
+/* Intermediate array to store results of premultiplication.
+ * Used as input to the kernel instead of the original input array.
+ */
+template <typename T>
+class IntermediateBufferElement
+{
+public:
+ struct Workspace
+ {
+ T *intermediate_buffer;
+ };
+
+ template <typename StratType, typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ auto cols = args.depthwise_args.input_cols + args.depthwise_args.kernel_cols;
+ auto rows = args.strategy->get_input_rows() + args.depthwise_args.kernel_rows;
+ auto channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ return sizeof(T) * cols * rows * channels;
+ }
+
+ template <class WorkspaceType, typename StratType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ ws->intermediate_buffer = reinterpret_cast<T*>(buffer);
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+
+/* Container for requantization parameters.
+ *
+ * This removes the distinction between per-layer and per-channel
+ * requantization parameters by providing a vector of requantization parameters
+ * regardless of whether per-layer or per-channel is selected.
+ */
class RequantizationParametersElement
{
  public:
  struct Workspace
  {
    // Per-channel vectors, each input_channels * channel_multiplier long;
    // they point either at the output stage's own arrays or at broadcast
    // copies built inside the workspace buffer.
    const int32_t *bias, *requant_muls, *requant_shifts;
  };

  // Space is needed only for parameters NOT already supplied per-channel by
  // the output stage (each sizeof_* helper returns zero in that case).
  template <typename StratType>
  static size_t get_element_size(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
  {
    return sizeof_bias(args) + sizeof_requant_muls(args) + sizeof_requant_shifts(args);
  }

  template <typename WorkspaceType, typename StratType>
  static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
  {
    const auto n_output_channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
    char *buffer_bytes = reinterpret_cast<char *>(buffer);

    // Assume the output stage supplies every vector; any nullptr below is
    // replaced with a vector built in the buffer. The carve order (bias,
    // muls, shifts) must match the order used by get_element_size.
    ws->bias = args.output_stage.bias;
    ws->requant_muls = args.output_stage.per_channel_muls;
    ws->requant_shifts = args.output_stage.per_channel_right_shifts;

    // No bias supplied: substitute a zeroed vector.
    if (ws->bias == nullptr)
    {
      ws->bias = reinterpret_cast<const int32_t *>(buffer_bytes);
      memset(buffer_bytes, 0, sizeof_bias(args));
      buffer_bytes += sizeof_bias(args);
    }

    // Per-layer multiplier: broadcast it across a per-channel vector.
    if (ws->requant_muls == nullptr)
    {
      ws->requant_muls = reinterpret_cast<const int32_t *>(buffer_bytes);
      auto muls = reinterpret_cast<int32_t *>(buffer_bytes);
      buffer_bytes += sizeof_requant_muls(args);

      for (auto n = 0u; n < n_output_channels; n++)
      {
        muls[n] = args.output_stage.per_layer_mul;
      }
    }

    // Per-layer right shift: broadcast likewise.
    if (ws->requant_shifts == nullptr)
    {
      ws->requant_shifts = reinterpret_cast<int32_t *>(buffer_bytes);
      auto shifts = reinterpret_cast<int32_t *>(buffer_bytes);
      buffer_bytes += sizeof_requant_shifts(args);

      for (auto n = 0u; n < n_output_channels; n++)
      {
        shifts[n] = args.output_stage.per_layer_right_shift;
      }
    }

    return buffer_bytes;
  }

  protected:
  // Zero if a bias vector was supplied; otherwise one int32 per output channel.
  template <typename StratType>
  static size_t sizeof_bias(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
  {
    return args.output_stage.bias != nullptr ?
      0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
  }

  // Zero if per-channel multipliers were supplied; otherwise one per channel.
  template <typename StratType>
  static size_t sizeof_requant_muls(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
  {
    return args.output_stage.per_channel_muls != nullptr ?
      0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
  }

  // Zero if per-channel shifts were supplied; otherwise one per channel.
  template <typename StratType>
  static size_t sizeof_requant_shifts(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
  {
    return args.output_stage.per_channel_right_shifts != nullptr ?
      0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
  }
};
+
+
// Compile-time composition of workspace Elements into a single workspace
// type; see the file-header comment for usage.
template <typename ...Elements>
class Workspace;

template <typename Element, typename ...Elements>
class Workspace<Element, Elements...>
{
  public:
  // The composed struct multiply-inherits each element's Workspace struct,
  // so later code can access every element's fields by name.
  struct WorkspaceType : Element::Workspace, Workspace<Elements...>::WorkspaceType
  {
  };

  template <class S, class T>
  static void initialise(void *buffer, const WorkspaceArgs<S, T> &args)
  {
    // Allocate sufficient space for the struct, then initialise each of the
    // elements in turn. (`ws + 1` is the first byte after the struct itself,
    // i.e. the start of the shared element buffer space.)
    auto ws = reinterpret_cast<WorkspaceType *>(buffer);
    initialise_elements(ws, ws + 1, args);
  }

  // Total bytes to allocate: the composed struct plus every element's own
  // buffer requirement.
  template <class S, class T=Nothing>
  static size_t get_sizeof_workspace(const WorkspaceArgs<S, T> &args)
  {
    return sizeof(WorkspaceType) + get_element_sizes(args);
  }

  // Recursively sum the buffer requirements of all elements.
  template <class S, class T>
  static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &args)
  {
    return Element::get_element_size(args) + Workspace<Elements...>::get_element_sizes(args);
  }

  // Initialise each element in turn; every element consumes its share of the
  // buffer and returns a pointer to the remaining free space.
  template <class WorkspaceType, class S, class T>
  static void initialise_elements(WorkspaceType *ws, void *buffer, const WorkspaceArgs<S, T> &args)
  {
    buffer = Element::initialise(ws, buffer, args); // Get the next buffer
    Workspace<Elements...>::initialise_elements(ws, buffer, args);
  }
};
+
// Base case of the recursion: an empty element pack contributes an empty
// struct, zero bytes of buffer, and no initialisation work.
template <>
class Workspace<>
{
  public:
  struct WorkspaceType
  {
  };

  template <class S, class T>
  static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &)
  {
    return 0;
  }

  template <class WorkspaceType, class S, class T>
  static void initialise_elements(WorkspaceType *, void *, const WorkspaceArgs<S, T> &)
  {
  }
};
+
+} // namespace {anonymous}
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
new file mode 100644
index 0000000000..d0e8639229
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/depthfirst_driver.hpp
@@ -0,0 +1,299 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "pooling.hpp"
+#include "utils.hpp"
+
+namespace arm_conv {
+namespace pooling {
+
+class IDepthfirstStrategy
+{
+ public:
+ virtual ~IDepthfirstStrategy() = default;
+
+ virtual unsigned int get_input_rows() const = 0;
+ virtual unsigned int get_input_cols() const = 0;
+
+ virtual unsigned int get_output_rows() const = 0;
+ virtual unsigned int get_output_cols() const = 0;
+};
+
+
+template <typename T>
+struct TensorSpec
+{
+ T base;
+ size_t ld_row, ld_col;
+
+ TensorSpec(T ptr, size_t ld_row, size_t ld_col)
+ : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
+};
+
+
+template <typename TInput, typename TOutput>
+class DepthfirstDriver : public PoolingCommon<TInput, TOutput>
+{
+ protected:
+ using Parent = PoolingCommon<TInput, TOutput>;
+
+ // The strategy which we're applying to solve the pooling problem.
+ std::unique_ptr<const IDepthfirstStrategy> m_strat;
+
+ /* Compute the amount of working space required for a single thread. */
+ virtual size_t get_working_size_per_thread() const = 0;
+
+ /* Initialise the working space for a thread. */
+ virtual void initialise_working_space(void *) const = 0;
+
+ /* Compute a portion of the output tensor with padding. */
+ virtual void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
+ ) const = 0;
+
+ /* Compute a portion of the work with only top/bottom padding.
+ *
+ * The default implementation of this repeatedly calls into the padded tile
+ * variant.
+ */
+ virtual void compute_row_padded_tile_row(
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int output_channel_start, const unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
+ ) const
+ {
+ for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
+ {
+ this->compute_tile_padded(
+ output_i, output_j, output_channel_start, output_channel_end,
+ input, output, working_space
+ );
+ }
+ }
+
+ /* Compute a portion of the output tensor with no padding.
+ *
+ * The default implementation of this repeatedly calls into the padded
+ * variant.
+ */
+ virtual void compute_tiles_unpadded(
+ unsigned int start_output_i, unsigned int start_output_j,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
+ ) const
+ {
+ for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
+ {
+ this->compute_row_padded_tile_row(
+ start_output_i, start_output_j, n_tile_cols,
+ output_channel_start, output_channel_end,
+ input, output, working_space
+ );
+ start_output_i += m_strat->get_output_rows();
+ }
+ }
+
+ void execute_internal(
+ unsigned int n_batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int n_channels,
+ const PaddingValues &padding,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads
+ ) const override
+ {
+ // Get and initialise the working space for this thread.
+ void *thread_working_space =
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+ this->initialise_working_space(thread_working_space);
+
+ // Construct convenient representations of the input/output tensors.
+ TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
+ TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);
+
+ // If the output is a 1x1 tensor, which commonly occurs at the end of a
+ // network, then we change the threading strategy to parallelise over
+ // channels rather than rows of the tensor.
+ if (n_threads > 1 && output_height == 1 && output_width == 1)
+ {
+ // Determine how many channels should be assigned to each thread, we
+ // round up first to ensure we get a reasonable spread across the
+ // threads.
+ const auto channels_per_thread = arm_gemm::roundup(arm_gemm::roundup(n_channels, 16u), n_threads) / n_threads;
+ const auto start_channel = thread_id * channels_per_thread;
+ const auto end_channel = std::min(start_channel + channels_per_thread, n_channels);
+
+ if (start_channel >= end_channel)
+ {
+ // This thread should move on if we have insufficient work to do.
+ return;
+ }
+
+ for (; n_batches; n_batches--)
+ {
+ // We know we don't need to iterate over rows or columns here; so just
+ // execute the tile.
+ this->compute_tile_padded(
+ 0, 0, // Compute the only output point
+ start_channel, end_channel,
+ input_tensor, output_tensor, thread_working_space
+ );
+
+ // Progress the pointers for the next batch.
+ input_tensor.base += ld_input_batch;
+ output_tensor.base += ld_output_batch;
+ }
+
+ // Exit here, since we've done all the work using the different strategy.
+ return;
+ }
+
+ for (unsigned int batch = 0; batch < n_batches; batch++)
+ {
+ // Iterate over rows of the output tensor; we stripe over the tiles.
+ for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
+ start_output_i < output_height;
+ start_output_i += n_threads * m_strat->get_output_rows())
+ {
+ // Determine what (if any padding) is required on the top/bottom of
+ // this row of the convolution.
+ const auto end_output_i = start_output_i + m_strat->get_output_rows();
+ const bool pad_output_bottom = output_height < end_output_i;
+
+ const int start_input_i = start_output_i * this->m_args.pool_stride.rows - padding.top;
+ const bool pad_input_top = start_input_i < 0;
+ const int end_input_i = start_input_i + m_strat->get_input_rows();
+ const bool pad_input_bottom = static_cast<int>(input_height) < end_input_i;
+ const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom;
+
+ // Iterate over the columns of the output tensor; we attempt to grab as
+ // much as possible of the unpadded regions, so the loop structure is a
+ // bit odd.
+ unsigned int start_output_j = 0;
+ while (start_output_j < output_width)
+ {
+ const int start_in_j = start_output_j * this->m_args.pool_stride.cols - padding.left;
+ const bool pad_input_left = start_in_j < 0;
+
+ // Determine if we can process a number of unpadded tiles in one go.
+ int n_unpadded_tiles = 0;
+ if (!pad_input_left)
+ {
+ // Determine the maximum number of tiles we could handle.
+ n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols();
+
+ // Handle padding on the right hand edge
+ const int tile_stride = m_strat->get_output_cols() * this->m_args.pool_stride.cols;
+ int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
+ int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;
+
+ while (n_unpadded_tiles > 0 &&
+ (static_cast<int>(output_width) < end_output_j ||
+ static_cast<int>(input_width) < end_input_j))
+ {
+ n_unpadded_tiles--;
+ end_output_j -= m_strat->get_output_cols();
+ end_input_j -= tile_stride;
+ }
+ }
+
+ // Process unpadded tiles, if possible, otherwise process a padded tile.
+ if (n_unpadded_tiles)
+ {
+ if (!pad_row)
+ {
+ // Completely unpadded execution
+ this->compute_tiles_unpadded(
+ start_output_i, start_output_j,
+ 1, n_unpadded_tiles, // Compute a row of unpadded tiles
+ 0, n_channels, // Compute all channels
+ input_tensor, output_tensor, thread_working_space
+ );
+ }
+ else
+ {
+ // Top/bottom padding only
+ this->compute_row_padded_tile_row(
+ start_output_i, start_output_j, n_unpadded_tiles,
+ 0, n_channels, // Compute all channels
+ input_tensor, output_tensor, thread_working_space
+ );
+ }
+ start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
+ }
+ else
+ {
+ this->compute_tile_padded(
+ start_output_i, start_output_j,
+ 0, n_channels, // Compute all channels
+ input_tensor, output_tensor, thread_working_space
+ );
+ start_output_j += m_strat->get_output_cols();
+ }
+ }
+ }
+
+ // Progress the pointers for the next batch.
+ input_tensor.base += ld_input_batch;
+ output_tensor.base += ld_output_batch;
+ }
+ }
+
+ public:
+ DepthfirstDriver(const IDepthfirstStrategy *strategy, const PoolingArgs &args)
+ : Parent(args), m_strat(strategy)
+ {
+ }
+
+ size_t get_working_size(unsigned int n_threads) const override final
+ {
+ return n_threads * this->get_working_size_per_thread();
+ }
+};
+
+} // namespace pooling
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 178db4a0b0..6b3ebe6664 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
void a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst
+struct a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
- typedef void (*kern_type)(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
+ a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 3; }
- constexpr static unsigned int pool_cols(void) { return 3; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl;
-
- a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 89dbf5ce02..5df848d1dd 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
@@ -82,174 +82,173 @@ void a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x4, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr d7, [%x[args], %[offsetof_rescale]]\n"
+ "ldr x3, [%x[args], %[offsetof_n_channels]]\n"
+ "cmp x3, #0x8\n"
+ "mov x4, #0x0\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x5, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x6, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x4, #0x8\n"
- "ldp x7, x8, [x20, #0x0]\n"
- "ldp x17, x16, [x20, #0x10]\n"
- "ldp x15, x14, [x19, #0x0]\n"
- "ldp x13, x12, [x19, #0x10]\n"
- "ldp x11, x10, [x19, #0x20]\n"
- "ldp x9, x28, [x19, #0x30]\n"
- "ldp x27, x26, [x19, #0x40]\n"
- "ldp x25, x24, [x19, #0x50]\n"
- "ldp x23, x22, [x19, #0x60]\n"
- "ldp x21, x20, [x19, #0x70]\n"
- "ldr d8, [%x[args], %[offsetof_rescale]]\n"
+ "ldp x6, x7, [x21, #0x0]\n"
+ "ldp x8, x17, [x21, #0x10]\n"
+ "ldp x16, x15, [x20, #0x0]\n"
+ "ldp x14, x13, [x20, #0x10]\n"
+ "ldp x12, x11, [x20, #0x20]\n"
+ "ldp x10, x9, [x20, #0x30]\n"
+ "ldp x28, x27, [x20, #0x40]\n"
+ "ldp x26, x25, [x20, #0x50]\n"
+ "ldp x24, x23, [x20, #0x60]\n"
+ "ldp x22, x21, [x20, #0x70]\n"
"blt 3f\n"
- "ldr q7, [x10, x5]\n"
- "lsr x19, x4, #0x3\n"
- "ldr q6, [x9, x5]\n"
- "sub x4, x4, x19, LSL #3\n"
- "ldr q5, [x26, x5]\n"
- "subs x19, x19, #0x1\n"
- "ldr q4, [x25, x5]\n"
- "ldr q3, [x14, x5]\n"
- "ldr q2, [x13, x5]\n"
- "ldr q1, [x11, x5]\n"
- "ldr q0, [x27, x5]\n"
- "ldr q31, [x28, x5]\n"
- "ldr q30, [x24, x5]\n"
- "ldr q29, [x22, x5]\n"
- "ldr q28, [x21, x5]\n"
- "ldr q27, [x15, x5]\n"
- "ldr q26, [x12, x5]\n"
- "ldr q25, [x23, x5]\n"
- "ldr q24, [x20, x5]\n"
- "add x5, x5, #0x10\n"
+ "ldr q6, [x11, x4]\n"
+ "ldr q5, [x10, x4]\n"
+ "lsr x20, x3, #0x3\n"
+ "sub x3, x3, x20, LSL #3\n"
+ "ldr q4, [x27, x4]\n"
+ "ldr q3, [x26, x4]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q2, [x15, x4]\n"
+ "ldr q1, [x14, x4]\n"
+ "ldr q0, [x12, x4]\n"
+ "ldr q31, [x28, x4]\n"
+ "ldr q30, [x9, x4]\n"
+ "ldr q29, [x25, x4]\n"
+ "ldr q28, [x23, x4]\n"
+ "ldr q27, [x22, x4]\n"
+ "ldr q26, [x16, x4]\n"
+ "ldr q25, [x13, x4]\n"
+ "ldr q24, [x24, x4]\n"
+ "ldr q23, [x21, x4]\n"
+ "add x4, x4, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
- "fadd v17.8h, v7.8h, v6.8h\n"
- "ldr q7, [x10, x5]\n"
- "subs x19, x19, #0x1\n"
- "fadd v16.8h, v5.8h, v4.8h\n"
- "ldr q6, [x9, x5]\n"
- "fadd v18.8h, v3.8h, v2.8h\n"
- "ldr q5, [x26, x5]\n"
- "fadd v23.8h, v1.8h, v0.8h\n"
- "ldr q4, [x25, x5]\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "ldr q3, [x14, x5]\n"
- "fadd v17.8h, v17.8h, v16.8h\n"
- "ldr q2, [x13, x5]\n"
- "fadd v16.8h, v29.8h, v28.8h\n"
- "ldr q1, [x11, x5]\n"
- "fadd v19.8h, v27.8h, v23.8h\n"
- "ldr q0, [x27, x5]\n"
- "fadd v21.8h, v18.8h, v17.8h\n"
- "ldr q31, [x28, x5]\n"
- "fadd v20.8h, v16.8h, v17.8h\n"
- "ldr q30, [x24, x5]\n"
- "fadd v18.8h, v26.8h, v22.8h\n"
- "ldr q29, [x22, x5]\n"
- "fadd v17.8h, v25.8h, v23.8h\n"
- "ldr q28, [x21, x5]\n"
- "fadd v16.8h, v24.8h, v22.8h\n"
- "ldr q27, [x15, x5]\n"
+ "fadd v17.8h, v6.8h, v5.8h\n"
+ "ldr q6, [x11, x4]\n"
+ "ldr q5, [x10, x4]\n"
+ "fadd v16.8h, v4.8h, v3.8h\n"
+ "ldr q4, [x27, x4]\n"
+ "ldr q3, [x26, x4]\n"
+ "fadd v19.8h, v17.8h, v16.8h\n"
+ "fadd v18.8h, v2.8h, v1.8h\n"
+ "ldr q2, [x15, x4]\n"
+ "ldr q1, [x14, x4]\n"
+ "fadd v17.8h, v0.8h, v31.8h\n"
+ "fadd v22.8h, v30.8h, v29.8h\n"
+ "ldr q0, [x12, x4]\n"
+ "ldr q31, [x28, x4]\n"
+ "fadd v16.8h, v28.8h, v27.8h\n"
+ "fadd v21.8h, v18.8h, v19.8h\n"
+ "ldr q30, [x9, x4]\n"
+ "ldr q29, [x25, x4]\n"
+ "fadd v20.8h, v16.8h, v19.8h\n"
+ "fadd v19.8h, v26.8h, v17.8h\n"
+ "ldr q28, [x23, x4]\n"
+ "ldr q27, [x22, x4]\n"
+ "fadd v18.8h, v25.8h, v22.8h\n"
+ "fadd v17.8h, v24.8h, v17.8h\n"
+ "ldr q26, [x16, x4]\n"
+ "ldr q25, [x13, x4]\n"
+ "fadd v16.8h, v23.8h, v22.8h\n"
"fadd v19.8h, v21.8h, v19.8h\n"
- "ldr q26, [x12, x5]\n"
+ "ldr q24, [x24, x4]\n"
+ "ldr q23, [x21, x4]\n"
"fadd v18.8h, v21.8h, v18.8h\n"
- "ldr q25, [x23, x5]\n"
"fadd v17.8h, v17.8h, v20.8h\n"
- "ldr q24, [x20, x5]\n"
+ "fadd v16.8h, v16.8h, v20.8h\n"
+ "subs x20, x20, #0x1\n"
+ "fmul v19.8h, v19.8h, v7.h[0]\n"
+ "add x4, x4, #0x10\n"
+ "fmul v18.8h, v18.8h, v7.h[1]\n"
+ "fmul v17.8h, v17.8h, v7.h[2]\n"
+ "str q19, [x6, x5]\n"
+ "fmul v16.8h, v16.8h, v7.h[3]\n"
+ "str q18, [x7, x5]\n"
+ "str q17, [x8, x5]\n"
+ "str q16, [x17, x5]\n"
"add x5, x5, #0x10\n"
- "fadd v16.8h, v20.8h, v16.8h\n"
- "fmul v19.8h, v19.8h, v8.h[0]\n"
- "str q19, [x7, x6]\n"
- "fmul v18.8h, v18.8h, v8.h[1]\n"
- "fmul v17.8h, v17.8h, v8.h[2]\n"
- "str q18, [x8, x6]\n"
- "fmul v16.8h, v16.8h, v8.h[3]\n"
- "str q17, [x17, x6]\n"
- "str q16, [x16, x6]\n"
- "add x6, x6, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
- "fadd v17.8h, v7.8h, v6.8h\n"
- "fadd v16.8h, v5.8h, v4.8h\n"
- "fadd v18.8h, v3.8h, v2.8h\n"
- "fadd v23.8h, v1.8h, v0.8h\n"
- "fadd v17.8h, v17.8h, v16.8h\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "fadd v16.8h, v29.8h, v28.8h\n"
- "fadd v21.8h, v18.8h, v17.8h\n"
- "fadd v19.8h, v27.8h, v23.8h\n"
- "fadd v20.8h, v16.8h, v17.8h\n"
- "fadd v18.8h, v26.8h, v22.8h\n"
- "fadd v17.8h, v25.8h, v23.8h\n"
- "fadd v16.8h, v24.8h, v22.8h\n"
+ "fadd v17.8h, v6.8h, v5.8h\n"
+ "fadd v16.8h, v4.8h, v3.8h\n"
+ "fadd v19.8h, v17.8h, v16.8h\n"
+ "fadd v18.8h, v2.8h, v1.8h\n"
+ "fadd v17.8h, v0.8h, v31.8h\n"
+ "fadd v22.8h, v30.8h, v29.8h\n"
+ "fadd v16.8h, v28.8h, v27.8h\n"
+ "fadd v21.8h, v18.8h, v19.8h\n"
+ "fadd v20.8h, v16.8h, v19.8h\n"
+ "fadd v19.8h, v26.8h, v17.8h\n"
+ "fadd v18.8h, v25.8h, v22.8h\n"
+ "fadd v17.8h, v24.8h, v17.8h\n"
+ "fadd v16.8h, v23.8h, v22.8h\n"
"fadd v19.8h, v21.8h, v19.8h\n"
"fadd v18.8h, v21.8h, v18.8h\n"
"fadd v17.8h, v17.8h, v20.8h\n"
- "fadd v16.8h, v20.8h, v16.8h\n"
- "fmul v19.8h, v19.8h, v8.h[0]\n"
- "str q19, [x7, x6]\n"
- "fmul v18.8h, v18.8h, v8.h[1]\n"
- "fmul v17.8h, v17.8h, v8.h[2]\n"
- "str q18, [x8, x6]\n"
- "fmul v16.8h, v16.8h, v8.h[3]\n"
- "str q17, [x17, x6]\n"
- "str q16, [x16, x6]\n"
- "add x6, x6, #0x10\n"
- "cbz x4, 4f\n"
+ "fadd v16.8h, v16.8h, v20.8h\n"
+ "fmul v19.8h, v19.8h, v7.h[0]\n"
+ "str q19, [x6, x5]\n"
+ "fmul v18.8h, v18.8h, v7.h[1]\n"
+ "fmul v17.8h, v17.8h, v7.h[2]\n"
+ "str q18, [x7, x5]\n"
+ "fmul v16.8h, v16.8h, v7.h[3]\n"
+ "str q17, [x8, x5]\n"
+ "str q16, [x17, x5]\n"
+ "add x5, x5, #0x10\n"
+ "cbz x3, 4f\n"
"3:" // Oddments
- "ldr h7, [x10, x5]\n"
- "subs x4, x4, #0x1\n"
- "ldr h6, [x9, x5]\n"
- "fadd v17.8h, v7.8h, v6.8h\n"
- "ldr h5, [x26, x5]\n"
- "ldr h4, [x25, x5]\n"
- "fadd v16.8h, v5.8h, v4.8h\n"
- "ldr h3, [x14, x5]\n"
- "ldr h2, [x13, x5]\n"
- "fadd v17.8h, v17.8h, v16.8h\n"
- "ldr h1, [x11, x5]\n"
- "ldr h0, [x27, x5]\n"
- "fadd v18.8h, v3.8h, v2.8h\n"
- "ldr h31, [x28, x5]\n"
- "fadd v23.8h, v1.8h, v0.8h\n"
- "ldr h30, [x24, x5]\n"
- "fadd v21.8h, v18.8h, v17.8h\n"
- "ldr h29, [x22, x5]\n"
- "ldr h28, [x21, x5]\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "ldr h27, [x15, x5]\n"
- "ldr h26, [x12, x5]\n"
- "fadd v16.8h, v29.8h, v28.8h\n"
- "ldr h25, [x23, x5]\n"
- "fadd v20.8h, v16.8h, v17.8h\n"
- "ldr h24, [x20, x5]\n"
- "add x5, x5, #0x2\n"
- "fadd v19.8h, v27.8h, v23.8h\n"
- "fadd v18.8h, v26.8h, v22.8h\n"
- "fadd v17.8h, v25.8h, v23.8h\n"
- "fadd v16.8h, v24.8h, v22.8h\n"
- "fadd v19.8h, v21.8h, v19.8h\n"
- "fadd v18.8h, v21.8h, v18.8h\n"
+ "ldr h17, [x11, x4]\n"
+ "ldr h16, [x10, x4]\n"
+ "fadd v18.8h, v17.8h, v16.8h\n"
+ "subs x3, x3, #0x1\n"
+ "ldr h17, [x27, x4]\n"
+ "ldr h16, [x26, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v18.8h, v18.8h, v16.8h\n"
+ "ldr h17, [x15, x4]\n"
+ "ldr h16, [x14, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v23.8h, v16.8h, v18.8h\n"
+ "ldr h17, [x12, x4]\n"
+ "ldr h16, [x28, x4]\n"
+ "fadd v22.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x9, x4]\n"
+ "ldr h16, [x25, x4]\n"
+ "fadd v21.8h, v17.8h, v16.8h\n"
+ "ldr h17, [x23, x4]\n"
+ "ldr h16, [x22, x4]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v20.8h, v16.8h, v18.8h\n"
+ "ldr h17, [x16, x4]\n"
+ "ldr h16, [x13, x4]\n"
+ "fadd v19.8h, v17.8h, v22.8h\n"
+ "fadd v18.8h, v16.8h, v21.8h\n"
+ "ldr h17, [x24, x4]\n"
+ "ldr h16, [x21, x4]\n"
+ "fadd v17.8h, v17.8h, v22.8h\n"
+ "fadd v16.8h, v16.8h, v21.8h\n"
+ "fadd v19.8h, v23.8h, v19.8h\n"
+ "fadd v18.8h, v23.8h, v18.8h\n"
+ "add x4, x4, #0x2\n"
"fadd v17.8h, v17.8h, v20.8h\n"
- "fadd v16.8h, v20.8h, v16.8h\n"
- "fmul v19.8h, v19.8h, v8.h[0]\n"
- "str h19, [x7, x6]\n"
- "fmul v18.8h, v18.8h, v8.h[1]\n"
- "fmul v17.8h, v17.8h, v8.h[2]\n"
- "str h18, [x8, x6]\n"
- "fmul v16.8h, v16.8h, v8.h[3]\n"
- "str h17, [x17, x6]\n"
- "str h16, [x16, x6]\n"
- "add x6, x6, #0x2\n"
+ "fadd v16.8h, v16.8h, v20.8h\n"
+ "fmul v19.8h, v19.8h, v7.h[0]\n"
+ "fmul v18.8h, v18.8h, v7.h[1]\n"
+ "str h19, [x6, x5]\n"
+ "fmul v17.8h, v17.8h, v7.h[2]\n"
+ "fmul v16.8h, v16.8h, v7.h[3]\n"
+ "str h18, [x7, x5]\n"
+ "str h17, [x8, x5]\n"
+ "str h16, [x17, x5]\n"
+ "add x5, x5, #0x2\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp
index 9dc153a764..25e7af1cee 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_fp16_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-struct a64_fp16_nhwc_avg_generic_depthfirst
+struct a64_fp16_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_fp16_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
a64_fp16_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_fp16_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index 5bef7f2bf4..f7be92e53f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -41,308 +42,306 @@ void a64_fp16_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<__fp16>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
- "ld1r { v8.8h }, [%x[rescale_ptr]]\n"
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
+ "ld1r { v9.8h }, [%x[rescale_ptr]]\n"
"cmp %x[n_channels], #0x20\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v5.16b, #0x0\n"
- "movi v4.16b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd v23.8h, v3.8h, v2.8h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "ldr q3, [x23, x28]\n"
- "fadd v18.8h, v29.8h, v28.8h\n"
- "fadd v21.8h, v27.8h, v21.8h\n"
- "ldr q2, [x22, x28]\n"
- "fadd v17.8h, v26.8h, v17.8h\n"
- "ldr q1, [x21, x28]\n"
- "fadd v20.8h, v25.8h, v20.8h\n"
- "ldr q0, [x20, x28]\n"
- "fadd v16.8h, v24.8h, v16.8h\n"
- "ldr q31, [x23, x27]\n"
+ "fadd v23.8h, v4.8h, v3.8h\n"
+ "fadd v19.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v22.8h, v2.8h, v1.8h\n"
+ "ldr q2, [x21, x26]\n"
+ "fadd v18.8h, v27.8h, v21.8h\n"
+ "ldr q1, [x20, x26]\n"
+ "fadd v21.8h, v0.8h, v31.8h\n"
+ "ldr q0, [x21, x24]\n"
+ "fadd v17.8h, v26.8h, v20.8h\n"
+ "ldr q31, [x20, x24]\n"
+ "fadd v20.8h, v30.8h, v29.8h\n"
+ "ldr q30, [x21, x23]\n"
+ "fadd v16.8h, v25.8h, v24.8h\n"
+ "ldr q29, [x20, x23]\n"
"fadd v19.8h, v23.8h, v19.8h\n"
- "ldr q30, [x22, x27]\n"
"fadd v18.8h, v22.8h, v18.8h\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fadd v17.8h, v21.8h, v17.8h\n"
- "ldr q28, [x20, x27]\n"
"fadd v16.8h, v20.8h, v16.8h\n"
- "ldr q27, [x23, x26]\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
- "ldr q21, [x22, x26]\n"
- "fadd v6.8h, v6.8h, v18.8h\n"
- "ldr q26, [x21, x26]\n"
- "fadd v5.8h, v5.8h, v17.8h\n"
- "ldr q17, [x20, x26]\n"
- "fadd v4.8h, v4.8h, v16.8h\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd v8.8h, v8.8h, v19.8h\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "fadd v7.8h, v7.8h, v18.8h\n"
+ "fadd v6.8h, v6.8h, v17.8h\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "fadd v5.8h, v5.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd v23.8h, v3.8h, v2.8h\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "fadd v22.8h, v31.8h, v30.8h\n"
- "fadd v18.8h, v29.8h, v28.8h\n"
- "fadd v21.8h, v27.8h, v21.8h\n"
- "fadd v17.8h, v26.8h, v17.8h\n"
- "fadd v20.8h, v25.8h, v20.8h\n"
- "fadd v16.8h, v24.8h, v16.8h\n"
+ "fadd v23.8h, v4.8h, v3.8h\n"
+ "fadd v19.8h, v28.8h, v22.8h\n"
+ "fadd v22.8h, v2.8h, v1.8h\n"
+ "fadd v18.8h, v27.8h, v21.8h\n"
+ "fadd v21.8h, v0.8h, v31.8h\n"
+ "fadd v17.8h, v26.8h, v20.8h\n"
+ "fadd v20.8h, v30.8h, v29.8h\n"
+ "fadd v16.8h, v25.8h, v24.8h\n"
"fadd v19.8h, v23.8h, v19.8h\n"
"fadd v18.8h, v22.8h, v18.8h\n"
"fadd v17.8h, v21.8h, v17.8h\n"
"fadd v16.8h, v20.8h, v16.8h\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
- "fadd v6.8h, v6.8h, v18.8h\n"
- "fadd v5.8h, v5.8h, v17.8h\n"
- "fadd v4.8h, v4.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v19.8h\n"
+ "fadd v7.8h, v7.8h, v18.8h\n"
+ "fadd v6.8h, v6.8h, v17.8h\n"
+ "fadd v5.8h, v5.8h, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fadd v7.8h, v7.8h, v3.8h\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "fadd v6.8h, v6.8h, v31.8h\n"
- "ldr q25, [x23, x25]\n"
- "fadd v5.8h, v5.8h, v27.8h\n"
- "fadd v4.8h, v4.8h, v25.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fadd v7.8h, v7.8h, v17.8h\n"
+ "fadd v6.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x20, x23]\n"
+ "fadd v5.8h, v5.8h, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul v7.8h, v7.8h, v8.8h\n"
- "str q7, [%x[outptr], x28]\n"
- "fmul v6.8h, v6.8h, v8.8h\n"
- "add x28, x28, #0x40\n"
- "fmul v5.8h, v5.8h, v8.8h\n"
- "str q6, [%x[outptr], x27]\n"
- "fmul v4.8h, v4.8h, v8.8h\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x20\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"cmp %x[n_channels], #0x20\n"
+ "fmul v8.8h, v8.8h, v9.8h\n"
+ "fmul v7.8h, v7.8h, v9.8h\n"
+ "fmul v6.8h, v6.8h, v9.8h\n"
+ "fmul v5.8h, v5.8h, v9.8h\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 31f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x8\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd v23.8h, v3.8h, v2.8h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd v23.8h, v3.8h, v2.8h\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fadd v7.8h, v7.8h, v3.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul v7.8h, v7.8h, v8.8h\n"
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x8\n"
"cmp %x[n_channels], #0x8\n"
+ "fmul v8.8h, v8.8h, v9.8h\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 31f\n"
"14:" // Oddments
- "movi v7.16b, #0x0\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 20f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 20f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 2: End
- "fadd v23.8h, v3.8h, v2.8h\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.8h, v1.8h, v0.8h\n"
- "fadd v19.8h, v23.8h, v19.8h\n"
- "fadd v7.8h, v7.8h, v19.8h\n"
+ "fadd v17.8h, v4.8h, v3.8h\n"
+ "fadd v16.8h, v28.8h, v22.8h\n"
+ "subs x25, x25, #0x1\n"
+ "fadd v16.8h, v17.8h, v16.8h\n"
+ "fadd v8.8h, v8.8h, v16.8h\n"
"bgt 15b\n"
"20:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 26f\n"
"21:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #2, 23f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"b 25f\n"
"22:" // Oddments: Single input loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"b 25f\n"
"23:" // Oddments: Single input loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 24f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"b 25f\n"
"24:" // Oddments: Single input loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"25:" // Oddments: Single input loop: Load: Bit 2: End
- "fadd v7.8h, v7.8h, v3.8h\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.8h, v8.8h, v4.8h\n"
"bgt 21b\n"
"26:" // Oddments: Single input loop: End
- "fmul v7.8h, v7.8h, v8.8h\n"
+ "fmul v8.8h, v8.8h, v9.8h\n"
"tbz %x[n_channels], #2, 28f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #1, 27f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
"b 30f\n"
"27:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
"b 30f\n"
"28:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 29f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
"b 30f\n"
"29:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
"30:" // Oddments: Store: Bit 2: End
-
"31:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 9950bb8cdb..b65ac7e9fa 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst
+struct a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
- typedef void (*kern_type)(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 1c461ee163..4b073b9076 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace pooling {
@@ -63,116 +63,115 @@ void a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
- "mov x14, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x15, #0x8\n"
- "ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "ldp x28, x27, [x19, #0x0]\n"
- "ldp x26, x25, [x19, #0x10]\n"
- "ldp x24, x23, [x19, #0x20]\n"
- "ldp x22, x21, [x19, #0x30]\n"
- "ldr x20, [x19, #0x40]\n"
+ "ldr x16, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "cmp x16, #0x8\n"
+ "mov x15, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [x21, #0x10]\n"
+ "ldp x9, x28, [x20, #0x0]\n"
+ "ldp x27, x26, [x20, #0x10]\n"
+ "ldp x25, x24, [x20, #0x20]\n"
+ "ldp x23, x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x40]\n"
"blt 3f\n"
- "ldr q30, [x27, x14]\n"
- "lsr x19, x15, #0x3\n"
- "ldr q29, [x24, x14]\n"
- "sub x15, x15, x19, LSL #3\n"
- "ldr q28, [x21, x14]\n"
- "subs x19, x19, #0x1\n"
- "ldr q27, [x25, x14]\n"
- "ldr q26, [x28, x14]\n"
- "ldr q25, [x23, x14]\n"
- "ldr q24, [x26, x14]\n"
- "ldr q23, [x22, x14]\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q30, [x28, x15]\n"
+ "ldr q29, [x25, x15]\n"
+ "lsr x20, x16, #0x3\n"
+ "sub x16, x16, x20, LSL #3\n"
+ "ldr q28, [x22, x15]\n"
+ "ldr q27, [x26, x15]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q26, [x9, x15]\n"
+ "ldr q25, [x27, x15]\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "ldr q22, [x21, x15]\n"
+ "add x15, x15, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
"fmax v21.8h, v30.8h, v29.8h\n"
- "ldr q30, [x27, x14]\n"
- "subs x19, x19, #0x1\n"
+ "ldr q30, [x28, x15]\n"
"fmax v20.8h, v29.8h, v28.8h\n"
- "ldr q29, [x24, x14]\n"
+ "ldr q29, [x25, x15]\n"
+ "ldr q28, [x22, x15]\n"
"fmax v19.8h, v27.8h, v26.8h\n"
- "ldr q28, [x21, x14]\n"
+ "ldr q26, [x9, x15]\n"
"fmax v18.8h, v25.8h, v24.8h\n"
- "ldr q26, [x28, x14]\n"
- "fmax v17.8h, v23.8h, v27.8h\n"
- "ldr q27, [x25, x14]\n"
- "fmax v16.8h, v25.8h, v22.8h\n"
- "ldr q25, [x23, x14]\n"
+ "ldr q25, [x27, x15]\n"
+ "fmax v17.8h, v27.8h, v23.8h\n"
+ "ldr q27, [x26, x15]\n"
+ "fmax v16.8h, v24.8h, v22.8h\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "subs x20, x20, #0x1\n"
"fmax v19.8h, v21.8h, v19.8h\n"
- "ldr q24, [x26, x14]\n"
- "fmax v18.8h, v21.8h, v18.8h\n"
- "ldr q23, [x22, x14]\n"
- "fmax v17.8h, v20.8h, v17.8h\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q22, [x21, x15]\n"
+ "fmax v18.8h, v18.8h, v21.8h\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "add x15, x15, #0x10\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "str q19, [x12, x13]\n"
- "str q18, [x11, x13]\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x14, x12]\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"fmax v21.8h, v30.8h, v29.8h\n"
"fmax v20.8h, v29.8h, v28.8h\n"
- "fmax v19.8h, v27.8h, v26.8h\n"
+ "fmax v16.8h, v27.8h, v26.8h\n"
"fmax v18.8h, v25.8h, v24.8h\n"
- "fmax v17.8h, v23.8h, v27.8h\n"
- "fmax v16.8h, v25.8h, v22.8h\n"
- "fmax v19.8h, v21.8h, v19.8h\n"
- "str q19, [x12, x13]\n"
- "fmax v18.8h, v21.8h, v18.8h\n"
- "fmax v17.8h, v20.8h, v17.8h\n"
- "str q18, [x11, x13]\n"
- "fmax v16.8h, v20.8h, v16.8h\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "cbz x15, 4f\n"
+ "fmax v17.8h, v27.8h, v23.8h\n"
+ "fmax v19.8h, v24.8h, v22.8h\n"
+ "fmax v16.8h, v21.8h, v16.8h\n"
+ "fmax v18.8h, v18.8h, v21.8h\n"
+ "str q16, [x14, x12]\n"
+ "fmax v17.8h, v17.8h, v20.8h\n"
+ "fmax v16.8h, v20.8h, v19.8h\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
+ "cbz x16, 4f\n"
"3:" // Oddments
- "ldr h30, [x27, x14]\n"
- "subs x15, x15, #0x1\n"
- "ldr h29, [x24, x14]\n"
- "fmax v21.8h, v30.8h, v29.8h\n"
- "ldr h28, [x21, x14]\n"
- "ldr h27, [x25, x14]\n"
- "fmax v20.8h, v29.8h, v28.8h\n"
- "ldr h26, [x28, x14]\n"
- "ldr h25, [x23, x14]\n"
- "fmax v19.8h, v27.8h, v26.8h\n"
- "ldr h24, [x26, x14]\n"
- "ldr h23, [x22, x14]\n"
- "fmax v19.8h, v21.8h, v19.8h\n"
- "ldr h22, [x20, x14]\n"
- "add x14, x14, #0x2\n"
- "fmax v18.8h, v25.8h, v24.8h\n"
- "str h19, [x12, x13]\n"
- "fmax v17.8h, v23.8h, v27.8h\n"
- "fmax v16.8h, v25.8h, v22.8h\n"
- "fmax v18.8h, v21.8h, v18.8h\n"
- "str h18, [x11, x13]\n"
- "fmax v17.8h, v20.8h, v17.8h\n"
- "fmax v16.8h, v20.8h, v16.8h\n"
- "str h17, [x10, x13]\n"
- "str h16, [x9, x13]\n"
- "add x13, x13, #0x2\n"
+ "ldr h16, [x28, x15]\n"
+ "ldr h17, [x25, x15]\n"
+ "fmax v23.8h, v16.8h, v17.8h\n"
+ "subs x16, x16, #0x1\n"
+ "ldr h16, [x22, x15]\n"
+ "ldr h22, [x26, x15]\n"
+ "fmax v21.8h, v17.8h, v16.8h\n"
+ "ldr h16, [x9, x15]\n"
+ "ldr h17, [x27, x15]\n"
+ "fmax v16.8h, v22.8h, v16.8h\n"
+ "fmax v20.8h, v23.8h, v16.8h\n"
+ "ldr h19, [x24, x15]\n"
+ "ldr h16, [x23, x15]\n"
+ "fmax v18.8h, v17.8h, v19.8h\n"
+ "fmax v17.8h, v22.8h, v16.8h\n"
+ "ldr h16, [x21, x15]\n"
+ "fmax v16.8h, v19.8h, v16.8h\n"
+ "add x15, x15, #0x2\n"
+ "fmax v18.8h, v18.8h, v23.8h\n"
+ "fmax v17.8h, v17.8h, v21.8h\n"
+ "fmax v16.8h, v21.8h, v16.8h\n"
+ "str h20, [x14, x12]\n"
+ "str h18, [x13, x12]\n"
+ "str h17, [x11, x12]\n"
+ "str h16, [x10, x12]\n"
+ "add x12, x12, #0x2\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp
index 8bea0bf5df..4998b37b4b 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_fp16_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-struct a64_fp16_nhwc_max_generic_depthfirst
+struct a64_fp16_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_fp16_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
a64_fp16_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_fp16_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
index e5f7ee3c72..c92e2cdebd 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -39,304 +40,302 @@ void a64_fp16_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x20\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"mov w20, #0xfc00\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.8h, w20\n"
"dup v7.8h, w20\n"
- "mov x19, %x[inptrs]\n"
"dup v6.8h, w20\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"dup v5.8h, w20\n"
- "dup v4.8h, w20\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fmax v23.8h, v3.8h, v2.8h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax v22.8h, v31.8h, v30.8h\n"
- "ldr q3, [x23, x28]\n"
- "fmax v18.8h, v29.8h, v28.8h\n"
- "fmax v21.8h, v27.8h, v21.8h\n"
- "ldr q2, [x22, x28]\n"
- "fmax v17.8h, v26.8h, v17.8h\n"
- "ldr q1, [x21, x28]\n"
- "fmax v20.8h, v25.8h, v20.8h\n"
- "ldr q0, [x20, x28]\n"
- "fmax v16.8h, v24.8h, v16.8h\n"
- "ldr q31, [x23, x27]\n"
+ "fmax v23.8h, v4.8h, v3.8h\n"
+ "fmax v19.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v22.8h, v2.8h, v1.8h\n"
+ "ldr q2, [x21, x26]\n"
+ "fmax v18.8h, v27.8h, v21.8h\n"
+ "ldr q1, [x20, x26]\n"
+ "fmax v21.8h, v0.8h, v31.8h\n"
+ "ldr q0, [x21, x24]\n"
+ "fmax v17.8h, v26.8h, v20.8h\n"
+ "ldr q31, [x20, x24]\n"
+ "fmax v20.8h, v30.8h, v29.8h\n"
+ "ldr q30, [x21, x23]\n"
+ "fmax v16.8h, v25.8h, v24.8h\n"
+ "ldr q29, [x20, x23]\n"
"fmax v19.8h, v23.8h, v19.8h\n"
- "ldr q30, [x22, x27]\n"
"fmax v18.8h, v22.8h, v18.8h\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fmax v17.8h, v21.8h, v17.8h\n"
- "ldr q28, [x20, x27]\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "ldr q27, [x23, x26]\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
- "ldr q21, [x22, x26]\n"
- "fmax v6.8h, v6.8h, v18.8h\n"
- "ldr q26, [x21, x26]\n"
- "fmax v5.8h, v5.8h, v17.8h\n"
- "ldr q17, [x20, x26]\n"
- "fmax v4.8h, v4.8h, v16.8h\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax v8.8h, v8.8h, v19.8h\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "fmax v7.8h, v7.8h, v18.8h\n"
+ "fmax v6.8h, v6.8h, v17.8h\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "fmax v5.8h, v5.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fmax v23.8h, v3.8h, v2.8h\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "fmax v22.8h, v31.8h, v30.8h\n"
- "fmax v18.8h, v29.8h, v28.8h\n"
- "fmax v21.8h, v27.8h, v21.8h\n"
- "fmax v17.8h, v26.8h, v17.8h\n"
- "fmax v20.8h, v25.8h, v20.8h\n"
- "fmax v16.8h, v24.8h, v16.8h\n"
+ "fmax v23.8h, v4.8h, v3.8h\n"
+ "fmax v19.8h, v28.8h, v22.8h\n"
+ "fmax v22.8h, v2.8h, v1.8h\n"
+ "fmax v18.8h, v27.8h, v21.8h\n"
+ "fmax v21.8h, v0.8h, v31.8h\n"
+ "fmax v17.8h, v26.8h, v20.8h\n"
+ "fmax v20.8h, v30.8h, v29.8h\n"
+ "fmax v16.8h, v25.8h, v24.8h\n"
"fmax v19.8h, v23.8h, v19.8h\n"
"fmax v18.8h, v22.8h, v18.8h\n"
"fmax v17.8h, v21.8h, v17.8h\n"
"fmax v16.8h, v20.8h, v16.8h\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
- "fmax v6.8h, v6.8h, v18.8h\n"
- "fmax v5.8h, v5.8h, v17.8h\n"
- "fmax v4.8h, v4.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v19.8h\n"
+ "fmax v7.8h, v7.8h, v18.8h\n"
+ "fmax v6.8h, v6.8h, v17.8h\n"
+ "fmax v5.8h, v5.8h, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fmax v7.8h, v7.8h, v3.8h\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "fmax v6.8h, v6.8h, v31.8h\n"
- "ldr q25, [x23, x25]\n"
- "fmax v5.8h, v5.8h, v27.8h\n"
- "fmax v4.8h, v4.8h, v25.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fmax v7.8h, v7.8h, v17.8h\n"
+ "fmax v6.8h, v6.8h, v16.8h\n"
+ "ldr q16, [x20, x23]\n"
+ "fmax v5.8h, v5.8h, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x20\n"
"cmp %x[n_channels], #0x20\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x27, x27, #0x40\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 31f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x8\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "mov w19, #0xfc00\n"
- "dup v7.8h, w19\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "mov w20, #0xfc00\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.8h, w20\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fmax v23.8h, v3.8h, v2.8h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fmax v23.8h, v3.8h, v2.8h\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fmax v7.8h, v7.8h, v3.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x8\n"
"cmp %x[n_channels], #0x8\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 31f\n"
"14:" // Oddments
- "add %x[outptr], %x[outptr], x28\n"
- "mov w19, #0xfc00\n"
- "dup v7.8h, w19\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 20f\n"
+ "mov w20, #0xfc00\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.8h, w20\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 20f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #2, 17f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"b 19f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"b 19f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #0, 19f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"b 19f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 19f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 2: End
- "fmax v23.8h, v3.8h, v2.8h\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.8h, v1.8h, v0.8h\n"
- "fmax v19.8h, v23.8h, v19.8h\n"
- "fmax v7.8h, v7.8h, v19.8h\n"
+ "fmax v17.8h, v4.8h, v3.8h\n"
+ "fmax v16.8h, v28.8h, v22.8h\n"
+ "subs x25, x25, #0x1\n"
+ "fmax v16.8h, v17.8h, v16.8h\n"
+ "fmax v8.8h, v8.8h, v16.8h\n"
"bgt 15b\n"
"20:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 26f\n"
"21:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #2, 23f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #1, 22f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"b 25f\n"
"22:" // Oddments: Single input loop: Load: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"b 25f\n"
"23:" // Oddments: Single input loop: Load: Bit 2: Unset
"tbz %x[n_channels], #1, 24f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #0, 25f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"b 25f\n"
"24:" // Oddments: Single input loop: Load: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 25f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"25:" // Oddments: Single input loop: Load: Bit 2: End
- "fmax v7.8h, v7.8h, v3.8h\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.8h, v8.8h, v4.8h\n"
"bgt 21b\n"
"26:" // Oddments: Single input loop: End
"tbz %x[n_channels], #2, 28f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #1, 27f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
"b 30f\n"
"27:" // Oddments: Store: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
"b 30f\n"
"28:" // Oddments: Store: Bit 2: Unset
"tbz %x[n_channels], #1, 29f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
"b 30f\n"
"29:" // Oddments: Store: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 30f\n"
- "st1 { v7.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
"30:" // Oddments: Store: Bit 2: End
-
"31:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 9a16b99a71..7add5feb1d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,28 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst
+struct a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
+ using Parent = DepthfirstStrategy<float, float>;
- constexpr static unsigned int pool_rows(void) { return 3; }
- constexpr static unsigned int pool_cols(void) { return 3; }
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
+ a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl;
-
- a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index ff8d7d8ba1..cf0047638e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -80,172 +82,173 @@ void a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x4, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr q7, [%x[args], %[offsetof_rescale]]\n"
+ "ldr x3, [%x[args], %[offsetof_n_channels]]\n"
+ "cmp x3, #0x4\n"
+ "mov x4, #0x0\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
"mov x5, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x6, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x4, #0x4\n"
- "ldp x7, x8, [x20, #0x0]\n"
- "ldp x17, x16, [x20, #0x10]\n"
- "ldp x15, x14, [x19, #0x0]\n"
- "ldp x13, x12, [x19, #0x10]\n"
- "ldp x11, x10, [x19, #0x20]\n"
- "ldp x9, x28, [x19, #0x30]\n"
- "ldp x27, x26, [x19, #0x40]\n"
- "ldp x25, x24, [x19, #0x50]\n"
- "ldp x23, x22, [x19, #0x60]\n"
- "ldp x21, x20, [x19, #0x70]\n"
- "ldr q8, [%x[args], %[offsetof_rescale]]\n"
+ "ldp x6, x7, [x21, #0x0]\n"
+ "ldp x8, x17, [x21, #0x10]\n"
+ "ldp x16, x15, [x20, #0x0]\n"
+ "ldp x14, x13, [x20, #0x10]\n"
+ "ldp x12, x11, [x20, #0x20]\n"
+ "ldp x10, x9, [x20, #0x30]\n"
+ "ldp x28, x27, [x20, #0x40]\n"
+ "ldp x26, x25, [x20, #0x50]\n"
+ "ldp x24, x23, [x20, #0x60]\n"
+ "ldp x22, x21, [x20, #0x70]\n"
"blt 3f\n"
- "ldr q7, [x10, x5]\n"
- "lsr x19, x4, #0x2\n"
- "ldr q6, [x9, x5]\n"
- "sub x4, x4, x19, LSL #2\n"
- "ldr q5, [x26, x5]\n"
- "subs x19, x19, #0x1\n"
- "ldr q4, [x25, x5]\n"
- "ldr q3, [x14, x5]\n"
- "ldr q2, [x13, x5]\n"
- "ldr q1, [x11, x5]\n"
- "ldr q0, [x27, x5]\n"
- "ldr q31, [x28, x5]\n"
- "ldr q30, [x24, x5]\n"
- "ldr q29, [x22, x5]\n"
- "ldr q28, [x21, x5]\n"
- "ldr q27, [x15, x5]\n"
- "ldr q26, [x12, x5]\n"
- "ldr q25, [x23, x5]\n"
- "ldr q24, [x20, x5]\n"
- "add x5, x5, #0x10\n"
+ "ldr q6, [x11, x4]\n"
+ "ldr q5, [x10, x4]\n"
+ "lsr x20, x3, #0x2\n"
+ "sub x3, x3, x20, LSL #2\n"
+ "ldr q4, [x27, x4]\n"
+ "ldr q3, [x26, x4]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q2, [x15, x4]\n"
+ "ldr q1, [x14, x4]\n"
+ "ldr q0, [x12, x4]\n"
+ "ldr q31, [x28, x4]\n"
+ "ldr q30, [x9, x4]\n"
+ "ldr q29, [x25, x4]\n"
+ "ldr q28, [x23, x4]\n"
+ "ldr q27, [x22, x4]\n"
+ "ldr q26, [x16, x4]\n"
+ "ldr q25, [x13, x4]\n"
+ "ldr q24, [x24, x4]\n"
+ "ldr q23, [x21, x4]\n"
+ "add x4, x4, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
- "fadd v17.4s, v7.4s, v6.4s\n"
- "ldr q7, [x10, x5]\n"
- "subs x19, x19, #0x1\n"
- "fadd v16.4s, v5.4s, v4.4s\n"
- "ldr q6, [x9, x5]\n"
- "fadd v18.4s, v3.4s, v2.4s\n"
- "ldr q5, [x26, x5]\n"
- "fadd v23.4s, v1.4s, v0.4s\n"
- "ldr q4, [x25, x5]\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "ldr q3, [x14, x5]\n"
- "fadd v17.4s, v17.4s, v16.4s\n"
- "ldr q2, [x13, x5]\n"
- "fadd v16.4s, v29.4s, v28.4s\n"
- "ldr q1, [x11, x5]\n"
- "fadd v19.4s, v27.4s, v23.4s\n"
- "ldr q0, [x27, x5]\n"
- "fadd v21.4s, v18.4s, v17.4s\n"
- "ldr q31, [x28, x5]\n"
- "fadd v20.4s, v16.4s, v17.4s\n"
- "ldr q30, [x24, x5]\n"
- "fadd v18.4s, v26.4s, v22.4s\n"
- "ldr q29, [x22, x5]\n"
- "fadd v17.4s, v25.4s, v23.4s\n"
- "ldr q28, [x21, x5]\n"
- "fadd v16.4s, v24.4s, v22.4s\n"
- "ldr q27, [x15, x5]\n"
+ "fadd v17.4s, v6.4s, v5.4s\n"
+ "ldr q6, [x11, x4]\n"
+ "ldr q5, [x10, x4]\n"
+ "fadd v16.4s, v4.4s, v3.4s\n"
+ "ldr q4, [x27, x4]\n"
+ "ldr q3, [x26, x4]\n"
+ "fadd v19.4s, v17.4s, v16.4s\n"
+ "fadd v18.4s, v2.4s, v1.4s\n"
+ "ldr q2, [x15, x4]\n"
+ "ldr q1, [x14, x4]\n"
+ "fadd v17.4s, v0.4s, v31.4s\n"
+ "fadd v22.4s, v30.4s, v29.4s\n"
+ "ldr q0, [x12, x4]\n"
+ "ldr q31, [x28, x4]\n"
+ "fadd v16.4s, v28.4s, v27.4s\n"
+ "fadd v21.4s, v18.4s, v19.4s\n"
+ "ldr q30, [x9, x4]\n"
+ "ldr q29, [x25, x4]\n"
+ "fadd v20.4s, v16.4s, v19.4s\n"
+ "fadd v19.4s, v26.4s, v17.4s\n"
+ "ldr q28, [x23, x4]\n"
+ "ldr q27, [x22, x4]\n"
+ "fadd v18.4s, v25.4s, v22.4s\n"
+ "fadd v17.4s, v24.4s, v17.4s\n"
+ "ldr q26, [x16, x4]\n"
+ "ldr q25, [x13, x4]\n"
+ "fadd v16.4s, v23.4s, v22.4s\n"
"fadd v19.4s, v21.4s, v19.4s\n"
- "ldr q26, [x12, x5]\n"
+ "ldr q24, [x24, x4]\n"
+ "ldr q23, [x21, x4]\n"
"fadd v18.4s, v21.4s, v18.4s\n"
- "ldr q25, [x23, x5]\n"
"fadd v17.4s, v17.4s, v20.4s\n"
- "ldr q24, [x20, x5]\n"
+ "fadd v16.4s, v16.4s, v20.4s\n"
+ "subs x20, x20, #0x1\n"
+ "fmul v19.4s, v19.4s, v7.s[0]\n"
+ "add x4, x4, #0x10\n"
+ "fmul v18.4s, v18.4s, v7.s[1]\n"
+ "fmul v17.4s, v17.4s, v7.s[2]\n"
+ "str q19, [x6, x5]\n"
+ "fmul v16.4s, v16.4s, v7.s[3]\n"
+ "str q18, [x7, x5]\n"
+ "str q17, [x8, x5]\n"
+ "str q16, [x17, x5]\n"
"add x5, x5, #0x10\n"
- "fadd v16.4s, v20.4s, v16.4s\n"
- "fmul v19.4s, v19.4s, v8.s[0]\n"
- "str q19, [x7, x6]\n"
- "fmul v18.4s, v18.4s, v8.s[1]\n"
- "fmul v17.4s, v17.4s, v8.s[2]\n"
- "str q18, [x8, x6]\n"
- "fmul v16.4s, v16.4s, v8.s[3]\n"
- "str q17, [x17, x6]\n"
- "str q16, [x16, x6]\n"
- "add x6, x6, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
- "fadd v17.4s, v7.4s, v6.4s\n"
- "fadd v16.4s, v5.4s, v4.4s\n"
- "fadd v18.4s, v3.4s, v2.4s\n"
- "fadd v23.4s, v1.4s, v0.4s\n"
- "fadd v17.4s, v17.4s, v16.4s\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "fadd v16.4s, v29.4s, v28.4s\n"
- "fadd v21.4s, v18.4s, v17.4s\n"
- "fadd v19.4s, v27.4s, v23.4s\n"
- "fadd v20.4s, v16.4s, v17.4s\n"
- "fadd v18.4s, v26.4s, v22.4s\n"
- "fadd v17.4s, v25.4s, v23.4s\n"
- "fadd v16.4s, v24.4s, v22.4s\n"
+ "fadd v17.4s, v6.4s, v5.4s\n"
+ "fadd v16.4s, v4.4s, v3.4s\n"
+ "fadd v19.4s, v17.4s, v16.4s\n"
+ "fadd v18.4s, v2.4s, v1.4s\n"
+ "fadd v17.4s, v0.4s, v31.4s\n"
+ "fadd v22.4s, v30.4s, v29.4s\n"
+ "fadd v16.4s, v28.4s, v27.4s\n"
+ "fadd v21.4s, v18.4s, v19.4s\n"
+ "fadd v20.4s, v16.4s, v19.4s\n"
+ "fadd v19.4s, v26.4s, v17.4s\n"
+ "fadd v18.4s, v25.4s, v22.4s\n"
+ "fadd v17.4s, v24.4s, v17.4s\n"
+ "fadd v16.4s, v23.4s, v22.4s\n"
"fadd v19.4s, v21.4s, v19.4s\n"
"fadd v18.4s, v21.4s, v18.4s\n"
"fadd v17.4s, v17.4s, v20.4s\n"
- "fadd v16.4s, v20.4s, v16.4s\n"
- "fmul v19.4s, v19.4s, v8.s[0]\n"
- "str q19, [x7, x6]\n"
- "fmul v18.4s, v18.4s, v8.s[1]\n"
- "fmul v17.4s, v17.4s, v8.s[2]\n"
- "str q18, [x8, x6]\n"
- "fmul v16.4s, v16.4s, v8.s[3]\n"
- "str q17, [x17, x6]\n"
- "str q16, [x16, x6]\n"
- "add x6, x6, #0x10\n"
- "cbz x4, 4f\n"
+ "fadd v16.4s, v16.4s, v20.4s\n"
+ "fmul v19.4s, v19.4s, v7.s[0]\n"
+ "str q19, [x6, x5]\n"
+ "fmul v18.4s, v18.4s, v7.s[1]\n"
+ "fmul v17.4s, v17.4s, v7.s[2]\n"
+ "str q18, [x7, x5]\n"
+ "fmul v16.4s, v16.4s, v7.s[3]\n"
+ "str q17, [x8, x5]\n"
+ "str q16, [x17, x5]\n"
+ "add x5, x5, #0x10\n"
+ "cbz x3, 4f\n"
"3:" // Oddments
- "ldr s7, [x10, x5]\n"
- "subs x4, x4, #0x1\n"
- "ldr s6, [x9, x5]\n"
- "fadd v17.4s, v7.4s, v6.4s\n"
- "ldr s5, [x26, x5]\n"
- "ldr s4, [x25, x5]\n"
- "fadd v16.4s, v5.4s, v4.4s\n"
- "ldr s3, [x14, x5]\n"
- "ldr s2, [x13, x5]\n"
- "fadd v17.4s, v17.4s, v16.4s\n"
- "ldr s1, [x11, x5]\n"
- "ldr s0, [x27, x5]\n"
- "fadd v18.4s, v3.4s, v2.4s\n"
- "ldr s31, [x28, x5]\n"
- "fadd v23.4s, v1.4s, v0.4s\n"
- "ldr s30, [x24, x5]\n"
- "fadd v21.4s, v18.4s, v17.4s\n"
- "ldr s29, [x22, x5]\n"
- "ldr s28, [x21, x5]\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "ldr s27, [x15, x5]\n"
- "ldr s26, [x12, x5]\n"
- "fadd v16.4s, v29.4s, v28.4s\n"
- "ldr s25, [x23, x5]\n"
- "fadd v20.4s, v16.4s, v17.4s\n"
- "ldr s24, [x20, x5]\n"
- "add x5, x5, #0x4\n"
- "fadd v19.4s, v27.4s, v23.4s\n"
- "fadd v18.4s, v26.4s, v22.4s\n"
- "fadd v17.4s, v25.4s, v23.4s\n"
- "fadd v16.4s, v24.4s, v22.4s\n"
- "fadd v19.4s, v21.4s, v19.4s\n"
- "fadd v18.4s, v21.4s, v18.4s\n"
+ "ldr s17, [x11, x4]\n"
+ "ldr s16, [x10, x4]\n"
+ "fadd v18.4s, v17.4s, v16.4s\n"
+ "subs x3, x3, #0x1\n"
+ "ldr s17, [x27, x4]\n"
+ "ldr s16, [x26, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v18.4s, v18.4s, v16.4s\n"
+ "ldr s17, [x15, x4]\n"
+ "ldr s16, [x14, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v23.4s, v16.4s, v18.4s\n"
+ "ldr s17, [x12, x4]\n"
+ "ldr s16, [x28, x4]\n"
+ "fadd v22.4s, v17.4s, v16.4s\n"
+ "ldr s17, [x9, x4]\n"
+ "ldr s16, [x25, x4]\n"
+ "fadd v21.4s, v17.4s, v16.4s\n"
+ "ldr s17, [x23, x4]\n"
+ "ldr s16, [x22, x4]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v20.4s, v16.4s, v18.4s\n"
+ "ldr s17, [x16, x4]\n"
+ "ldr s16, [x13, x4]\n"
+ "fadd v19.4s, v17.4s, v22.4s\n"
+ "fadd v18.4s, v16.4s, v21.4s\n"
+ "ldr s17, [x24, x4]\n"
+ "ldr s16, [x21, x4]\n"
+ "fadd v17.4s, v17.4s, v22.4s\n"
+ "fadd v16.4s, v16.4s, v21.4s\n"
+ "fadd v19.4s, v23.4s, v19.4s\n"
+ "fadd v18.4s, v23.4s, v18.4s\n"
+ "add x4, x4, #0x4\n"
"fadd v17.4s, v17.4s, v20.4s\n"
- "fadd v16.4s, v20.4s, v16.4s\n"
- "fmul v19.4s, v19.4s, v8.s[0]\n"
- "str s19, [x7, x6]\n"
- "fmul v18.4s, v18.4s, v8.s[1]\n"
- "fmul v17.4s, v17.4s, v8.s[2]\n"
- "str s18, [x8, x6]\n"
- "fmul v16.4s, v16.4s, v8.s[3]\n"
- "str s17, [x17, x6]\n"
- "str s16, [x16, x6]\n"
- "add x6, x6, #0x4\n"
+ "fadd v16.4s, v16.4s, v20.4s\n"
+ "fmul v19.4s, v19.4s, v7.s[0]\n"
+ "fmul v18.4s, v18.4s, v7.s[1]\n"
+ "str s19, [x6, x5]\n"
+ "fmul v17.4s, v17.4s, v7.s[2]\n"
+ "fmul v16.4s, v16.4s, v7.s[3]\n"
+ "str s18, [x7, x5]\n"
+ "str s17, [x8, x5]\n"
+ "str s16, [x17, x5]\n"
+ "add x5, x5, #0x4\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp
index 4ef26318d4..26895e610d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_fp32_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-struct a64_fp32_nhwc_avg_generic_depthfirst
+struct a64_fp32_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_fp32_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<float, float>;
a64_fp32_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_fp32_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index 21f705451a..d236f07b1c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -41,260 +42,258 @@ void a64_fp32_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<float>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
- "ld1r { v8.4s }, [%x[rescale_ptr]]\n"
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
+ "ld1r { v9.4s }, [%x[rescale_ptr]]\n"
"cmp %x[n_channels], #0x10\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v5.16b, #0x0\n"
- "movi v4.16b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd v23.4s, v3.4s, v2.4s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "ldr q3, [x23, x28]\n"
- "fadd v18.4s, v29.4s, v28.4s\n"
- "fadd v21.4s, v27.4s, v21.4s\n"
- "ldr q2, [x22, x28]\n"
- "fadd v17.4s, v26.4s, v17.4s\n"
- "ldr q1, [x21, x28]\n"
- "fadd v20.4s, v25.4s, v20.4s\n"
- "ldr q0, [x20, x28]\n"
- "fadd v16.4s, v24.4s, v16.4s\n"
- "ldr q31, [x23, x27]\n"
+ "fadd v23.4s, v4.4s, v3.4s\n"
+ "fadd v19.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v22.4s, v2.4s, v1.4s\n"
+ "ldr q2, [x21, x26]\n"
+ "fadd v18.4s, v27.4s, v21.4s\n"
+ "ldr q1, [x20, x26]\n"
+ "fadd v21.4s, v0.4s, v31.4s\n"
+ "ldr q0, [x21, x24]\n"
+ "fadd v17.4s, v26.4s, v20.4s\n"
+ "ldr q31, [x20, x24]\n"
+ "fadd v20.4s, v30.4s, v29.4s\n"
+ "ldr q30, [x21, x23]\n"
+ "fadd v16.4s, v25.4s, v24.4s\n"
+ "ldr q29, [x20, x23]\n"
"fadd v19.4s, v23.4s, v19.4s\n"
- "ldr q30, [x22, x27]\n"
"fadd v18.4s, v22.4s, v18.4s\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fadd v17.4s, v21.4s, v17.4s\n"
- "ldr q28, [x20, x27]\n"
"fadd v16.4s, v20.4s, v16.4s\n"
- "ldr q27, [x23, x26]\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
- "ldr q21, [x22, x26]\n"
- "fadd v6.4s, v6.4s, v18.4s\n"
- "ldr q26, [x21, x26]\n"
- "fadd v5.4s, v5.4s, v17.4s\n"
- "ldr q17, [x20, x26]\n"
- "fadd v4.4s, v4.4s, v16.4s\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd v8.4s, v8.4s, v19.4s\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "fadd v7.4s, v7.4s, v18.4s\n"
+ "fadd v6.4s, v6.4s, v17.4s\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "fadd v5.4s, v5.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd v23.4s, v3.4s, v2.4s\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "fadd v22.4s, v31.4s, v30.4s\n"
- "fadd v18.4s, v29.4s, v28.4s\n"
- "fadd v21.4s, v27.4s, v21.4s\n"
- "fadd v17.4s, v26.4s, v17.4s\n"
- "fadd v20.4s, v25.4s, v20.4s\n"
- "fadd v16.4s, v24.4s, v16.4s\n"
+ "fadd v23.4s, v4.4s, v3.4s\n"
+ "fadd v19.4s, v28.4s, v22.4s\n"
+ "fadd v22.4s, v2.4s, v1.4s\n"
+ "fadd v18.4s, v27.4s, v21.4s\n"
+ "fadd v21.4s, v0.4s, v31.4s\n"
+ "fadd v17.4s, v26.4s, v20.4s\n"
+ "fadd v20.4s, v30.4s, v29.4s\n"
+ "fadd v16.4s, v25.4s, v24.4s\n"
"fadd v19.4s, v23.4s, v19.4s\n"
"fadd v18.4s, v22.4s, v18.4s\n"
"fadd v17.4s, v21.4s, v17.4s\n"
"fadd v16.4s, v20.4s, v16.4s\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
- "fadd v6.4s, v6.4s, v18.4s\n"
- "fadd v5.4s, v5.4s, v17.4s\n"
- "fadd v4.4s, v4.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v19.4s\n"
+ "fadd v7.4s, v7.4s, v18.4s\n"
+ "fadd v6.4s, v6.4s, v17.4s\n"
+ "fadd v5.4s, v5.4s, v16.4s\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fadd v7.4s, v7.4s, v3.4s\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "fadd v6.4s, v6.4s, v31.4s\n"
- "ldr q25, [x23, x25]\n"
- "fadd v5.4s, v5.4s, v27.4s\n"
- "fadd v4.4s, v4.4s, v25.4s\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fadd v7.4s, v7.4s, v17.4s\n"
+ "fadd v6.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x20, x23]\n"
+ "fadd v5.4s, v5.4s, v16.4s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul v7.4s, v7.4s, v8.4s\n"
- "str q7, [%x[outptr], x28]\n"
- "fmul v6.4s, v6.4s, v8.4s\n"
- "add x28, x28, #0x40\n"
- "fmul v5.4s, v5.4s, v8.4s\n"
- "str q6, [%x[outptr], x27]\n"
- "fmul v4.4s, v4.4s, v8.4s\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"cmp %x[n_channels], #0x10\n"
+ "fmul v8.4s, v8.4s, v9.4s\n"
+ "fmul v7.4s, v7.4s, v9.4s\n"
+ "fmul v6.4s, v6.4s, v9.4s\n"
+ "fmul v5.4s, v5.4s, v9.4s\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 25f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x4\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd v23.4s, v3.4s, v2.4s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd v23.4s, v3.4s, v2.4s\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fadd v7.4s, v7.4s, v3.4s\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul v7.4s, v7.4s, v8.4s\n"
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
"cmp %x[n_channels], #0x4\n"
+ "fmul v8.4s, v8.4s, v9.4s\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 25f\n"
"14:" // Oddments
- "movi v7.16b, #0x0\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 18f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 18f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #1, 16f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 17f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 1: End
- "fadd v23.4s, v3.4s, v2.4s\n"
- "subs x24, x24, #0x1\n"
- "fadd v19.4s, v1.4s, v0.4s\n"
- "fadd v19.4s, v23.4s, v19.4s\n"
- "fadd v7.4s, v7.4s, v19.4s\n"
+ "fadd v17.4s, v4.4s, v3.4s\n"
+ "fadd v16.4s, v28.4s, v22.4s\n"
+ "subs x25, x25, #0x1\n"
+ "fadd v16.4s, v17.4s, v16.4s\n"
+ "fadd v8.4s, v8.4s, v16.4s\n"
"bgt 15b\n"
"18:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 22f\n"
"19:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #1, 20f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"b 21f\n"
"20:" // Oddments: Single input loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 21f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"21:" // Oddments: Single input loop: Load: Bit 1: End
- "fadd v7.4s, v7.4s, v3.4s\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "fadd v8.4s, v8.4s, v4.4s\n"
"bgt 19b\n"
"22:" // Oddments: Single input loop: End
- "fmul v7.4s, v7.4s, v8.4s\n"
+ "fmul v8.4s, v8.4s, v9.4s\n"
"tbz %x[n_channels], #1, 23f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"b 24f\n"
"23:" // Oddments: Store: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"24:" // Oddments: Store: Bit 1: End
-
"25:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 9a22adf6f4..2f72b59d70 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,28 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst
+struct a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ using Parent = DepthfirstStrategy<float, float>;
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
+ a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index ea7e2195d1..f4202de1ed 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -61,114 +63,115 @@ void a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
- "mov x14, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x15, #0x4\n"
- "ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "ldp x28, x27, [x19, #0x0]\n"
- "ldp x26, x25, [x19, #0x10]\n"
- "ldp x24, x23, [x19, #0x20]\n"
- "ldp x22, x21, [x19, #0x30]\n"
- "ldr x20, [x19, #0x40]\n"
+ "ldr x16, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "cmp x16, #0x4\n"
+ "mov x15, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [x21, #0x10]\n"
+ "ldp x9, x28, [x20, #0x0]\n"
+ "ldp x27, x26, [x20, #0x10]\n"
+ "ldp x25, x24, [x20, #0x20]\n"
+ "ldp x23, x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x40]\n"
"blt 3f\n"
- "ldr q30, [x27, x14]\n"
- "lsr x19, x15, #0x2\n"
- "ldr q29, [x24, x14]\n"
- "sub x15, x15, x19, LSL #2\n"
- "ldr q28, [x21, x14]\n"
- "subs x19, x19, #0x1\n"
- "ldr q27, [x25, x14]\n"
- "ldr q26, [x28, x14]\n"
- "ldr q25, [x23, x14]\n"
- "ldr q24, [x26, x14]\n"
- "ldr q23, [x22, x14]\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q30, [x28, x15]\n"
+ "ldr q29, [x25, x15]\n"
+ "lsr x20, x16, #0x2\n"
+ "sub x16, x16, x20, LSL #2\n"
+ "ldr q28, [x22, x15]\n"
+ "ldr q27, [x26, x15]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q26, [x9, x15]\n"
+ "ldr q25, [x27, x15]\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "ldr q22, [x21, x15]\n"
+ "add x15, x15, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
"fmax v21.4s, v30.4s, v29.4s\n"
- "ldr q30, [x27, x14]\n"
- "subs x19, x19, #0x1\n"
+ "ldr q30, [x28, x15]\n"
"fmax v20.4s, v29.4s, v28.4s\n"
- "ldr q29, [x24, x14]\n"
+ "ldr q29, [x25, x15]\n"
+ "ldr q28, [x22, x15]\n"
"fmax v19.4s, v27.4s, v26.4s\n"
- "ldr q28, [x21, x14]\n"
+ "ldr q26, [x9, x15]\n"
"fmax v18.4s, v25.4s, v24.4s\n"
- "ldr q26, [x28, x14]\n"
- "fmax v17.4s, v23.4s, v27.4s\n"
- "ldr q27, [x25, x14]\n"
- "fmax v16.4s, v25.4s, v22.4s\n"
- "ldr q25, [x23, x14]\n"
+ "ldr q25, [x27, x15]\n"
+ "fmax v17.4s, v27.4s, v23.4s\n"
+ "ldr q27, [x26, x15]\n"
+ "fmax v16.4s, v24.4s, v22.4s\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "subs x20, x20, #0x1\n"
"fmax v19.4s, v21.4s, v19.4s\n"
- "ldr q24, [x26, x14]\n"
- "fmax v18.4s, v21.4s, v18.4s\n"
- "ldr q23, [x22, x14]\n"
- "fmax v17.4s, v20.4s, v17.4s\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q22, [x21, x15]\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "add x15, x15, #0x10\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "str q19, [x12, x13]\n"
- "str q18, [x11, x13]\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x14, x12]\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"fmax v21.4s, v30.4s, v29.4s\n"
"fmax v20.4s, v29.4s, v28.4s\n"
- "fmax v19.4s, v27.4s, v26.4s\n"
+ "fmax v16.4s, v27.4s, v26.4s\n"
"fmax v18.4s, v25.4s, v24.4s\n"
- "fmax v17.4s, v23.4s, v27.4s\n"
- "fmax v16.4s, v25.4s, v22.4s\n"
- "fmax v19.4s, v21.4s, v19.4s\n"
- "str q19, [x12, x13]\n"
- "fmax v18.4s, v21.4s, v18.4s\n"
- "fmax v17.4s, v20.4s, v17.4s\n"
- "str q18, [x11, x13]\n"
- "fmax v16.4s, v20.4s, v16.4s\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "cbz x15, 4f\n"
+ "fmax v17.4s, v27.4s, v23.4s\n"
+ "fmax v19.4s, v24.4s, v22.4s\n"
+ "fmax v16.4s, v21.4s, v16.4s\n"
+ "fmax v18.4s, v18.4s, v21.4s\n"
+ "str q16, [x14, x12]\n"
+ "fmax v17.4s, v17.4s, v20.4s\n"
+ "fmax v16.4s, v20.4s, v19.4s\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
+ "cbz x16, 4f\n"
"3:" // Oddments
- "ldr s30, [x27, x14]\n"
- "subs x15, x15, #0x1\n"
- "ldr s29, [x24, x14]\n"
- "fmax v21.4s, v30.4s, v29.4s\n"
- "ldr s28, [x21, x14]\n"
- "ldr s27, [x25, x14]\n"
- "fmax v20.4s, v29.4s, v28.4s\n"
- "ldr s26, [x28, x14]\n"
- "ldr s25, [x23, x14]\n"
- "fmax v19.4s, v27.4s, v26.4s\n"
- "ldr s24, [x26, x14]\n"
- "ldr s23, [x22, x14]\n"
- "fmax v19.4s, v21.4s, v19.4s\n"
- "ldr s22, [x20, x14]\n"
- "add x14, x14, #0x4\n"
- "fmax v18.4s, v25.4s, v24.4s\n"
- "str s19, [x12, x13]\n"
- "fmax v17.4s, v23.4s, v27.4s\n"
- "fmax v16.4s, v25.4s, v22.4s\n"
- "fmax v18.4s, v21.4s, v18.4s\n"
- "str s18, [x11, x13]\n"
- "fmax v17.4s, v20.4s, v17.4s\n"
- "fmax v16.4s, v20.4s, v16.4s\n"
- "str s17, [x10, x13]\n"
- "str s16, [x9, x13]\n"
- "add x13, x13, #0x4\n"
+ "ldr s16, [x28, x15]\n"
+ "ldr s17, [x25, x15]\n"
+ "fmax v23.4s, v16.4s, v17.4s\n"
+ "subs x16, x16, #0x1\n"
+ "ldr s16, [x22, x15]\n"
+ "ldr s22, [x26, x15]\n"
+ "fmax v21.4s, v17.4s, v16.4s\n"
+ "ldr s16, [x9, x15]\n"
+ "ldr s17, [x27, x15]\n"
+ "fmax v16.4s, v22.4s, v16.4s\n"
+ "fmax v20.4s, v23.4s, v16.4s\n"
+ "ldr s19, [x24, x15]\n"
+ "ldr s16, [x23, x15]\n"
+ "fmax v18.4s, v17.4s, v19.4s\n"
+ "fmax v17.4s, v22.4s, v16.4s\n"
+ "ldr s16, [x21, x15]\n"
+ "fmax v16.4s, v19.4s, v16.4s\n"
+ "add x15, x15, #0x4\n"
+ "fmax v18.4s, v18.4s, v23.4s\n"
+ "fmax v17.4s, v17.4s, v21.4s\n"
+ "fmax v16.4s, v21.4s, v16.4s\n"
+ "str s20, [x14, x12]\n"
+ "str s18, [x13, x12]\n"
+ "str s17, [x11, x12]\n"
+ "str s16, [x10, x12]\n"
+ "add x12, x12, #0x4\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp
index b20ffc20cf..7577b31d7d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_fp32_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-struct a64_fp32_nhwc_max_generic_depthfirst
+struct a64_fp32_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_fp32_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<float, float>;
a64_fp32_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_fp32_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
index e0acb7ac02..f4706635dc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -39,256 +40,254 @@ void a64_fp32_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x10\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"mov w20, #0xff800000\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.4s, w20\n"
"dup v7.4s, w20\n"
- "mov x19, %x[inptrs]\n"
"dup v6.4s, w20\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"dup v5.4s, w20\n"
- "dup v4.4s, w20\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fmax v23.4s, v3.4s, v2.4s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax v22.4s, v31.4s, v30.4s\n"
- "ldr q3, [x23, x28]\n"
- "fmax v18.4s, v29.4s, v28.4s\n"
- "fmax v21.4s, v27.4s, v21.4s\n"
- "ldr q2, [x22, x28]\n"
- "fmax v17.4s, v26.4s, v17.4s\n"
- "ldr q1, [x21, x28]\n"
- "fmax v20.4s, v25.4s, v20.4s\n"
- "ldr q0, [x20, x28]\n"
- "fmax v16.4s, v24.4s, v16.4s\n"
- "ldr q31, [x23, x27]\n"
+ "fmax v23.4s, v4.4s, v3.4s\n"
+ "fmax v19.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v22.4s, v2.4s, v1.4s\n"
+ "ldr q2, [x21, x26]\n"
+ "fmax v18.4s, v27.4s, v21.4s\n"
+ "ldr q1, [x20, x26]\n"
+ "fmax v21.4s, v0.4s, v31.4s\n"
+ "ldr q0, [x21, x24]\n"
+ "fmax v17.4s, v26.4s, v20.4s\n"
+ "ldr q31, [x20, x24]\n"
+ "fmax v20.4s, v30.4s, v29.4s\n"
+ "ldr q30, [x21, x23]\n"
+ "fmax v16.4s, v25.4s, v24.4s\n"
+ "ldr q29, [x20, x23]\n"
"fmax v19.4s, v23.4s, v19.4s\n"
- "ldr q30, [x22, x27]\n"
"fmax v18.4s, v22.4s, v18.4s\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"fmax v17.4s, v21.4s, v17.4s\n"
- "ldr q28, [x20, x27]\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "ldr q27, [x23, x26]\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
- "ldr q21, [x22, x26]\n"
- "fmax v6.4s, v6.4s, v18.4s\n"
- "ldr q26, [x21, x26]\n"
- "fmax v5.4s, v5.4s, v17.4s\n"
- "ldr q17, [x20, x26]\n"
- "fmax v4.4s, v4.4s, v16.4s\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax v8.4s, v8.4s, v19.4s\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "fmax v7.4s, v7.4s, v18.4s\n"
+ "fmax v6.4s, v6.4s, v17.4s\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "fmax v5.4s, v5.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fmax v23.4s, v3.4s, v2.4s\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "fmax v22.4s, v31.4s, v30.4s\n"
- "fmax v18.4s, v29.4s, v28.4s\n"
- "fmax v21.4s, v27.4s, v21.4s\n"
- "fmax v17.4s, v26.4s, v17.4s\n"
- "fmax v20.4s, v25.4s, v20.4s\n"
- "fmax v16.4s, v24.4s, v16.4s\n"
+ "fmax v23.4s, v4.4s, v3.4s\n"
+ "fmax v19.4s, v28.4s, v22.4s\n"
+ "fmax v22.4s, v2.4s, v1.4s\n"
+ "fmax v18.4s, v27.4s, v21.4s\n"
+ "fmax v21.4s, v0.4s, v31.4s\n"
+ "fmax v17.4s, v26.4s, v20.4s\n"
+ "fmax v20.4s, v30.4s, v29.4s\n"
+ "fmax v16.4s, v25.4s, v24.4s\n"
"fmax v19.4s, v23.4s, v19.4s\n"
"fmax v18.4s, v22.4s, v18.4s\n"
"fmax v17.4s, v21.4s, v17.4s\n"
"fmax v16.4s, v20.4s, v16.4s\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
- "fmax v6.4s, v6.4s, v18.4s\n"
- "fmax v5.4s, v5.4s, v17.4s\n"
- "fmax v4.4s, v4.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v19.4s\n"
+ "fmax v7.4s, v7.4s, v18.4s\n"
+ "fmax v6.4s, v6.4s, v17.4s\n"
+ "fmax v5.4s, v5.4s, v16.4s\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fmax v7.4s, v7.4s, v3.4s\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "fmax v6.4s, v6.4s, v31.4s\n"
- "ldr q25, [x23, x25]\n"
- "fmax v5.4s, v5.4s, v27.4s\n"
- "fmax v4.4s, v4.4s, v25.4s\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "fmax v7.4s, v7.4s, v17.4s\n"
+ "fmax v6.4s, v6.4s, v16.4s\n"
+ "ldr q16, [x20, x23]\n"
+ "fmax v5.4s, v5.4s, v16.4s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x27, x27, #0x40\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 25f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x4\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "mov w19, #0xff800000\n"
- "dup v7.4s, w19\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "mov w20, #0xff800000\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.4s, w20\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fmax v23.4s, v3.4s, v2.4s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fmax v23.4s, v3.4s, v2.4s\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "fmax v7.4s, v7.4s, v3.4s\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x4\n"
"cmp %x[n_channels], #0x4\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 25f\n"
"14:" // Oddments
- "add %x[outptr], %x[outptr], x28\n"
- "mov w19, #0xff800000\n"
- "dup v7.4s, w19\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 18f\n"
+ "mov w20, #0xff800000\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "dup v8.4s, w20\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 18f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #1, 16f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #0, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"b 17f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 17f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 1: End
- "fmax v23.4s, v3.4s, v2.4s\n"
- "subs x24, x24, #0x1\n"
- "fmax v19.4s, v1.4s, v0.4s\n"
- "fmax v19.4s, v23.4s, v19.4s\n"
- "fmax v7.4s, v7.4s, v19.4s\n"
+ "fmax v17.4s, v4.4s, v3.4s\n"
+ "fmax v16.4s, v28.4s, v22.4s\n"
+ "subs x25, x25, #0x1\n"
+ "fmax v16.4s, v17.4s, v16.4s\n"
+ "fmax v8.4s, v8.4s, v16.4s\n"
"bgt 15b\n"
"18:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 22f\n"
"19:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #1, 20f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #0, 21f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"b 21f\n"
"20:" // Oddments: Single input loop: Load: Bit 1: Unset
"tbz %x[n_channels], #0, 21f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"21:" // Oddments: Single input loop: Load: Bit 1: End
- "fmax v7.4s, v7.4s, v3.4s\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "fmax v8.4s, v8.4s, v4.4s\n"
"bgt 19b\n"
"22:" // Oddments: Single input loop: End
"tbz %x[n_channels], #1, 23f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"b 24f\n"
"23:" // Oddments: Store: Bit 1: Unset
"tbz %x[n_channels], #0, 24f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"24:" // Oddments: Store: Bit 1: End
-
"25:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp
index df66ab7a2c..de94ec0ec3 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_s8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-struct a64_s8_nhwc_avg_generic_depthfirst
+struct a64_s8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_s8_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
a64_s8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_s8_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
index 405ae66755..5d082102b3 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
@@ -83,27 +84,28 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
shift_value--;
f_rescale_value *= 2.0f;
}
- int64_t large_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
- if (large_rescale_value == (1ll << 31))
+
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- large_rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
- rescale_value = static_cast<int32_t>(large_rescale_value);
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
__asm__ __volatile__(
- "mov x26, #0x0\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x25, #0x20\n" // cntb _, ALL, #2
+ "mov x24, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -118,43 +120,43 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"movi v2.4s, #0x0\n"
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ldr q29, [x21, x25]\n"
- "ldr q28, [x20, x25]\n"
- "ldr q27, [x21, x24]\n"
- "ldr q26, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"saddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
"saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"saddl v21.8h, v29.8b, v28.8b\n"
- "subs x22, x22, #0x1\n"
"saddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q30, [x20, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"saddl v19.8h, v27.8b, v26.8b\n"
- "ldr q29, [x21, x25]\n"
"saddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q28, [x20, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"saddl v17.8h, v25.8b, v24.8b\n"
- "ldr q27, [x21, x24]\n"
"saddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q26, [x20, x24]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
+ "subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v23.4h\n"
- "ldr q25, [x21, x23]\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q24, [x20, x23]\n"
"saddw v13.4s, v13.4s, v22.4h\n"
"saddw2 v12.4s, v12.4s, v22.8h\n"
+ "add x22, x22, #0x10\n"
"saddw v11.4s, v11.4s, v21.4h\n"
"saddw2 v10.4s, v10.4s, v21.8h\n"
"saddw v9.4s, v9.4s, v20.4h\n"
@@ -194,23 +196,23 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"saddw v1.4s, v1.4s, v16.4h\n"
"saddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "sxtl v23.8h, v31.8b\n"
- "ldr q29, [x21, x25]\n"
- "sxtl2 v22.8h, v31.16b\n"
- "ldr q27, [x21, x24]\n"
- "ldr q25, [x21, x23]\n"
- "sxtl v21.8h, v29.8b\n"
- "sxtl2 v20.8h, v29.16b\n"
- "sxtl v19.8h, v27.8b\n"
- "sxtl2 v18.8h, v27.16b\n"
- "sxtl v17.8h, v25.8b\n"
- "sxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v23.8h, v16.8b\n"
+ "sxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "sxtl v21.8h, v16.8b\n"
+ "sxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "sxtl v19.8h, v17.8b\n"
+ "sxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
"saddw v13.4s, v13.4s, v22.4h\n"
@@ -229,195 +231,195 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"saddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "movi v19.4s, #0x7f\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[shift_ptr]]\n"
- "not v16.16b, v19.16b\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
"cmp %x[n_channels], #0x40\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
- "sqdmulh v11.4s, v11.4s, v18.4s\n"
- "sqdmulh v10.4s, v10.4s, v18.4s\n"
- "sqdmulh v9.4s, v9.4s, v18.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
- "srshl v11.4s, v11.4s, v17.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "sqdmulh v8.4s, v8.4s, v18.4s\n"
- "sqdmulh v7.4s, v7.4s, v18.4s\n"
- "sqdmulh v6.4s, v6.4s, v18.4s\n"
- "sqdmulh v5.4s, v5.4s, v18.4s\n"
- "srshl v8.4s, v8.4s, v17.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
- "srshl v6.4s, v6.4s, v17.4s\n"
- "srshl v5.4s, v5.4s, v17.4s\n"
- "sqdmulh v4.4s, v4.4s, v18.4s\n"
- "sqdmulh v3.4s, v3.4s, v18.4s\n"
- "sqdmulh v2.4s, v2.4s, v18.4s\n"
- "sqdmulh v1.4s, v1.4s, v18.4s\n"
- "srshl v4.4s, v4.4s, v17.4s\n"
- "srshl v3.4s, v3.4s, v17.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "srshl v1.4s, v1.4s, v17.4s\n"
- "sqdmulh v0.4s, v0.4s, v18.4s\n"
+ "sqdmulh v11.4s, v11.4s, v17.4s\n"
+ "sqdmulh v10.4s, v10.4s, v17.4s\n"
+ "sqdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqdmulh v8.4s, v8.4s, v17.4s\n"
+ "sqdmulh v7.4s, v7.4s, v17.4s\n"
+ "sqdmulh v6.4s, v6.4s, v17.4s\n"
+ "sqdmulh v5.4s, v5.4s, v17.4s\n"
+ "sqdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqdmulh v3.4s, v3.4s, v17.4s\n"
+ "sqdmulh v2.4s, v2.4s, v17.4s\n"
+ "sqdmulh v1.4s, v1.4s, v17.4s\n"
+ "sqdmulh v0.4s, v0.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "srshl v10.4s, v10.4s, v16.4s\n"
+ "srshl v9.4s, v9.4s, v16.4s\n"
+ "srshl v8.4s, v8.4s, v16.4s\n"
+ "srshl v7.4s, v7.4s, v16.4s\n"
+ "srshl v6.4s, v6.4s, v16.4s\n"
+ "srshl v5.4s, v5.4s, v16.4s\n"
+ "srshl v4.4s, v4.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v16.4s\n"
+ "srshl v2.4s, v2.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v16.4s\n"
+ "srshl v0.4s, v0.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
- "srshl v0.4s, v0.4s, v17.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
"smax v11.4s, v11.4s, v16.4s\n"
"smax v10.4s, v10.4s, v16.4s\n"
- "smin v12.4s, v12.4s, v19.4s\n"
- "smin v11.4s, v11.4s, v19.4s\n"
- "smin v10.4s, v10.4s, v19.4s\n"
"smax v9.4s, v9.4s, v16.4s\n"
"smax v8.4s, v8.4s, v16.4s\n"
"smax v7.4s, v7.4s, v16.4s\n"
- "smin v9.4s, v9.4s, v19.4s\n"
- "smin v8.4s, v8.4s, v19.4s\n"
- "smin v7.4s, v7.4s, v19.4s\n"
"smax v6.4s, v6.4s, v16.4s\n"
"smax v5.4s, v5.4s, v16.4s\n"
"smax v4.4s, v4.4s, v16.4s\n"
- "smin v6.4s, v6.4s, v19.4s\n"
- "smin v5.4s, v5.4s, v19.4s\n"
- "smin v4.4s, v4.4s, v19.4s\n"
"smax v3.4s, v3.4s, v16.4s\n"
"smax v2.4s, v2.4s, v16.4s\n"
"smax v1.4s, v1.4s, v16.4s\n"
- "smin v3.4s, v3.4s, v19.4s\n"
- "smin v2.4s, v2.4s, v19.4s\n"
- "smin v1.4s, v1.4s, v19.4s\n"
"smax v0.4s, v0.4s, v16.4s\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v8.4s, v8.4s, v17.4s\n"
+ "smin v7.4s, v7.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v17.4s\n"
+ "smin v5.4s, v5.4s, v17.4s\n"
+ "smin v4.4s, v4.4s, v17.4s\n"
+ "smin v3.4s, v3.4s, v17.4s\n"
+ "smin v2.4s, v2.4s, v17.4s\n"
+ "smin v1.4s, v1.4s, v17.4s\n"
+ "smin v0.4s, v0.4s, v17.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "smin v0.4s, v0.4s, v19.4s\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
- "uzp1 v21.16b, v9.16b, v8.16b\n"
- "uzp1 v20.16b, v7.16b, v6.16b\n"
+ "uzp1 v18.16b, v9.16b, v8.16b\n"
+ "uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
- "uzp1 v19.16b, v3.16b, v2.16b\n"
- "uzp1 v18.16b, v1.16b, v0.16b\n"
+ "uzp1 v20.16b, v3.16b, v2.16b\n"
+ "uzp1 v19.16b, v1.16b, v0.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x25]\n"
"add x25, x25, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
+ "str q16, [%x[outptr], x24]\n"
"add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "saddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "subs x22, x22, #0x1\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q30, [x20, x26]\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "movi v19.4s, #0x7f\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[shift_ptr]]\n"
- "not v16.16b, v19.16b\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
"cmp %x[n_channels], #0x10\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smin v12.4s, v12.4s, v19.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "add x26, x26, #0x10\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v15.4s, #0x0\n"
- "add %x[outptr], %x[outptr], x26\n"
"movi v14.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 24f\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
- "add x21, x21, x26\n"
- "add x20, x20, x26\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -478,21 +480,21 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"ldr b31, [x21], #0x1\n"
"ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "saddl v23.8h, v31.8b, v30.8b\n"
- "subs x22, x22, #0x1\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldr x21, [x19], #0x8\n"
- "add x21, x21, x26\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -538,38 +540,38 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 33f\n"
"ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "sxtl v23.8h, v31.8b\n"
- "subs x20, x20, #0x1\n"
- "sxtl2 v22.8h, v31.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "sxtl v17.8h, v31.8b\n"
+ "sxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "movi v19.4s, #0x7f\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
- "not v16.16b, v19.16b\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[shift_ptr]]\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
+ "ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smin v12.4s, v12.4s, v19.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -615,12 +617,10 @@ void a64_s8_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 7829ecc0e9..f8f1134866 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,28 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst
+struct a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ using Parent = DepthfirstStrategy<int8_t, int8_t>;
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
+ a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 298db96861..7e62ac1afc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -61,114 +63,115 @@ void a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
- "mov x14, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x15, #0x10\n"
- "ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "ldp x28, x27, [x19, #0x0]\n"
- "ldp x26, x25, [x19, #0x10]\n"
- "ldp x24, x23, [x19, #0x20]\n"
- "ldp x22, x21, [x19, #0x30]\n"
- "ldr x20, [x19, #0x40]\n"
+ "ldr x16, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "cmp x16, #0x10\n"
+ "mov x15, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [x21, #0x10]\n"
+ "ldp x9, x28, [x20, #0x0]\n"
+ "ldp x27, x26, [x20, #0x10]\n"
+ "ldp x25, x24, [x20, #0x20]\n"
+ "ldp x23, x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x40]\n"
"blt 3f\n"
- "ldr q30, [x27, x14]\n"
- "lsr x19, x15, #0x4\n"
- "ldr q29, [x24, x14]\n"
- "sub x15, x15, x19, LSL #4\n"
- "ldr q28, [x21, x14]\n"
- "subs x19, x19, #0x1\n"
- "ldr q27, [x25, x14]\n"
- "ldr q26, [x28, x14]\n"
- "ldr q25, [x23, x14]\n"
- "ldr q24, [x26, x14]\n"
- "ldr q23, [x22, x14]\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q30, [x28, x15]\n"
+ "ldr q29, [x25, x15]\n"
+ "lsr x20, x16, #0x4\n"
+ "sub x16, x16, x20, LSL #4\n"
+ "ldr q28, [x22, x15]\n"
+ "ldr q27, [x26, x15]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q26, [x9, x15]\n"
+ "ldr q25, [x27, x15]\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "ldr q22, [x21, x15]\n"
+ "add x15, x15, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
"smax v21.16b, v30.16b, v29.16b\n"
- "ldr q30, [x27, x14]\n"
- "subs x19, x19, #0x1\n"
+ "ldr q30, [x28, x15]\n"
"smax v20.16b, v29.16b, v28.16b\n"
- "ldr q29, [x24, x14]\n"
+ "ldr q29, [x25, x15]\n"
+ "ldr q28, [x22, x15]\n"
"smax v19.16b, v27.16b, v26.16b\n"
- "ldr q28, [x21, x14]\n"
+ "ldr q26, [x9, x15]\n"
"smax v18.16b, v25.16b, v24.16b\n"
- "ldr q26, [x28, x14]\n"
- "smax v17.16b, v23.16b, v27.16b\n"
- "ldr q27, [x25, x14]\n"
- "smax v16.16b, v25.16b, v22.16b\n"
- "ldr q25, [x23, x14]\n"
+ "ldr q25, [x27, x15]\n"
+ "smax v17.16b, v27.16b, v23.16b\n"
+ "ldr q27, [x26, x15]\n"
+ "smax v16.16b, v24.16b, v22.16b\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "subs x20, x20, #0x1\n"
"smax v19.16b, v21.16b, v19.16b\n"
- "ldr q24, [x26, x14]\n"
- "smax v18.16b, v21.16b, v18.16b\n"
- "ldr q23, [x22, x14]\n"
- "smax v17.16b, v20.16b, v17.16b\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q22, [x21, x15]\n"
+ "smax v18.16b, v18.16b, v21.16b\n"
+ "smax v17.16b, v17.16b, v20.16b\n"
+ "add x15, x15, #0x10\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "str q19, [x12, x13]\n"
- "str q18, [x11, x13]\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x14, x12]\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"smax v21.16b, v30.16b, v29.16b\n"
"smax v20.16b, v29.16b, v28.16b\n"
- "smax v19.16b, v27.16b, v26.16b\n"
+ "smax v16.16b, v27.16b, v26.16b\n"
"smax v18.16b, v25.16b, v24.16b\n"
- "smax v17.16b, v23.16b, v27.16b\n"
- "smax v16.16b, v25.16b, v22.16b\n"
- "smax v19.16b, v21.16b, v19.16b\n"
- "str q19, [x12, x13]\n"
- "smax v18.16b, v21.16b, v18.16b\n"
- "smax v17.16b, v20.16b, v17.16b\n"
- "str q18, [x11, x13]\n"
- "smax v16.16b, v20.16b, v16.16b\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "cbz x15, 4f\n"
+ "smax v17.16b, v27.16b, v23.16b\n"
+ "smax v19.16b, v24.16b, v22.16b\n"
+ "smax v16.16b, v21.16b, v16.16b\n"
+ "smax v18.16b, v18.16b, v21.16b\n"
+ "str q16, [x14, x12]\n"
+ "smax v17.16b, v17.16b, v20.16b\n"
+ "smax v16.16b, v20.16b, v19.16b\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
+ "cbz x16, 4f\n"
"3:" // Oddments
- "ldr b30, [x27, x14]\n"
- "subs x15, x15, #0x1\n"
- "ldr b29, [x24, x14]\n"
- "smax v21.16b, v30.16b, v29.16b\n"
- "ldr b28, [x21, x14]\n"
- "ldr b27, [x25, x14]\n"
- "smax v20.16b, v29.16b, v28.16b\n"
- "ldr b26, [x28, x14]\n"
- "ldr b25, [x23, x14]\n"
- "smax v19.16b, v27.16b, v26.16b\n"
- "ldr b24, [x26, x14]\n"
- "ldr b23, [x22, x14]\n"
- "smax v19.16b, v21.16b, v19.16b\n"
- "ldr b22, [x20, x14]\n"
- "add x14, x14, #0x1\n"
- "smax v18.16b, v25.16b, v24.16b\n"
- "str b19, [x12, x13]\n"
- "smax v17.16b, v23.16b, v27.16b\n"
- "smax v16.16b, v25.16b, v22.16b\n"
- "smax v18.16b, v21.16b, v18.16b\n"
- "str b18, [x11, x13]\n"
- "smax v17.16b, v20.16b, v17.16b\n"
- "smax v16.16b, v20.16b, v16.16b\n"
- "str b17, [x10, x13]\n"
- "str b16, [x9, x13]\n"
- "add x13, x13, #0x1\n"
+ "ldr b16, [x28, x15]\n"
+ "ldr b17, [x25, x15]\n"
+ "smax v23.16b, v16.16b, v17.16b\n"
+ "subs x16, x16, #0x1\n"
+ "ldr b16, [x22, x15]\n"
+ "ldr b22, [x26, x15]\n"
+ "smax v21.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x9, x15]\n"
+ "ldr b17, [x27, x15]\n"
+ "smax v16.16b, v22.16b, v16.16b\n"
+ "smax v20.16b, v23.16b, v16.16b\n"
+ "ldr b19, [x24, x15]\n"
+ "ldr b16, [x23, x15]\n"
+ "smax v18.16b, v17.16b, v19.16b\n"
+ "smax v17.16b, v22.16b, v16.16b\n"
+ "ldr b16, [x21, x15]\n"
+ "smax v16.16b, v19.16b, v16.16b\n"
+ "add x15, x15, #0x1\n"
+ "smax v18.16b, v18.16b, v23.16b\n"
+ "smax v17.16b, v17.16b, v21.16b\n"
+ "smax v16.16b, v21.16b, v16.16b\n"
+ "str b20, [x14, x12]\n"
+ "str b18, [x13, x12]\n"
+ "str b17, [x11, x12]\n"
+ "str b16, [x10, x12]\n"
+ "add x12, x12, #0x1\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp
index 6c4cd1467f..ba6d52f570 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_s8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-struct a64_s8_nhwc_max_generic_depthfirst
+struct a64_s8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_s8_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
a64_s8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_s8_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
index 5e4c84d23e..411fd11460 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -39,397 +40,395 @@ void a64_s8_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x80\n"
"movi v7.16b, #0x80\n"
- "mov x19, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x80\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v5.16b, #0x80\n"
- "movi v4.16b, #0x80\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "smax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax v22.16b, v31.16b, v30.16b\n"
- "ldr q3, [x23, x28]\n"
- "smax v18.16b, v29.16b, v28.16b\n"
- "smax v21.16b, v27.16b, v21.16b\n"
- "ldr q2, [x22, x28]\n"
- "smax v17.16b, v26.16b, v17.16b\n"
- "ldr q1, [x21, x28]\n"
- "smax v20.16b, v25.16b, v20.16b\n"
- "ldr q0, [x20, x28]\n"
- "smax v16.16b, v24.16b, v16.16b\n"
- "ldr q31, [x23, x27]\n"
+ "smax v23.16b, v4.16b, v3.16b\n"
+ "smax v19.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v22.16b, v2.16b, v1.16b\n"
+ "ldr q2, [x21, x26]\n"
+ "smax v18.16b, v27.16b, v21.16b\n"
+ "ldr q1, [x20, x26]\n"
+ "smax v21.16b, v0.16b, v31.16b\n"
+ "ldr q0, [x21, x24]\n"
+ "smax v17.16b, v26.16b, v20.16b\n"
+ "ldr q31, [x20, x24]\n"
+ "smax v20.16b, v30.16b, v29.16b\n"
+ "ldr q30, [x21, x23]\n"
+ "smax v16.16b, v25.16b, v24.16b\n"
+ "ldr q29, [x20, x23]\n"
"smax v19.16b, v23.16b, v19.16b\n"
- "ldr q30, [x22, x27]\n"
"smax v18.16b, v22.16b, v18.16b\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"smax v17.16b, v21.16b, v17.16b\n"
- "ldr q28, [x20, x27]\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x23, x26]\n"
- "smax v7.16b, v7.16b, v19.16b\n"
- "ldr q21, [x22, x26]\n"
- "smax v6.16b, v6.16b, v18.16b\n"
- "ldr q26, [x21, x26]\n"
- "smax v5.16b, v5.16b, v17.16b\n"
- "ldr q17, [x20, x26]\n"
- "smax v4.16b, v4.16b, v16.16b\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "smax v8.16b, v8.16b, v19.16b\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v18.16b\n"
+ "smax v6.16b, v6.16b, v17.16b\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "smax v23.16b, v3.16b, v2.16b\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v22.16b, v31.16b, v30.16b\n"
- "smax v18.16b, v29.16b, v28.16b\n"
- "smax v21.16b, v27.16b, v21.16b\n"
- "smax v17.16b, v26.16b, v17.16b\n"
- "smax v20.16b, v25.16b, v20.16b\n"
- "smax v16.16b, v24.16b, v16.16b\n"
+ "smax v23.16b, v4.16b, v3.16b\n"
+ "smax v19.16b, v28.16b, v22.16b\n"
+ "smax v22.16b, v2.16b, v1.16b\n"
+ "smax v18.16b, v27.16b, v21.16b\n"
+ "smax v21.16b, v0.16b, v31.16b\n"
+ "smax v17.16b, v26.16b, v20.16b\n"
+ "smax v20.16b, v30.16b, v29.16b\n"
+ "smax v16.16b, v25.16b, v24.16b\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
"smax v17.16b, v21.16b, v17.16b\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "smax v7.16b, v7.16b, v19.16b\n"
- "smax v6.16b, v6.16b, v18.16b\n"
- "smax v5.16b, v5.16b, v17.16b\n"
- "smax v4.16b, v4.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v7.16b, v7.16b, v18.16b\n"
+ "smax v6.16b, v6.16b, v17.16b\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "smax v7.16b, v7.16b, v3.16b\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "smax v6.16b, v6.16b, v31.16b\n"
- "ldr q25, [x23, x25]\n"
- "smax v5.16b, v5.16b, v27.16b\n"
- "smax v4.16b, v4.16b, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x27, x27, #0x40\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v7.16b, #0x80\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x80\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "smax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "smax v7.16b, v7.16b, v19.16b\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "smax v23.16b, v3.16b, v2.16b\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v7.16b, v7.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "smax v7.16b, v7.16b, v3.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
- "movi v7.16b, #0x80\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 24f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v2.b }[14], [x22], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
- "ld1 { v0.b }[14], [x20], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v2.b }[12], [x22], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
- "ld1 { v0.b }[12], [x20], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v2.b }[10], [x22], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
- "ld1 { v0.b }[10], [x20], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v2.b }[8], [x22], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
- "ld1 { v0.b }[8], [x20], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v2.b }[6], [x22], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
- "ld1 { v0.b }[6], [x20], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v2.b }[4], [x22], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
- "ld1 { v0.b }[4], [x20], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v2.b }[2], [x22], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
- "ld1 { v0.b }[2], [x20], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b2, [x22], #0x1\n"
- "ldr b1, [x21], #0x1\n"
- "ldr b0, [x20], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "smax v23.16b, v3.16b, v2.16b\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v7.16b, v7.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "subs x25, x25, #0x1\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b3, [x23], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "smax v7.16b, v7.16b, v3.16b\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"tbz %x[n_channels], #3, 38f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 35f\n"
- "st1 { v7.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[14], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[14], [%x[outptr]], #0x1\n"
"b 42f\n"
"35:" // Oddments: Store: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[12], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[12], [%x[outptr]], #0x1\n"
"b 42f\n"
"36:" // Oddments: Store: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 37f\n"
- "st1 { v7.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[10], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[10], [%x[outptr]], #0x1\n"
"b 42f\n"
"37:" // Oddments: Store: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[8], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[8], [%x[outptr]], #0x1\n"
"b 42f\n"
"38:" // Oddments: Store: Bit 3: Unset
"tbz %x[n_channels], #2, 40f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 39f\n"
- "st1 { v7.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[6], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[6], [%x[outptr]], #0x1\n"
"b 42f\n"
"39:" // Oddments: Store: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[4], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[4], [%x[outptr]], #0x1\n"
"b 42f\n"
"40:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 41f\n"
- "st1 { v7.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[2], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[2], [%x[outptr]], #0x1\n"
"b 42f\n"
"41:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[0], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp
index a50e99a009..d5d7313a90 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_s8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-struct a64_s8q_nhwc_avg_generic_depthfirst
+struct a64_s8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_s8q_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
a64_s8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_s8q_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index f288a4119c..019f402911 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
@@ -86,12 +87,13 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
// Combine together the rescale value for the requantization and the scaling
@@ -112,17 +114,17 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
);
__asm__ __volatile__(
- "mov x26, #0x0\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x25, #0x20\n" // cntb _, ALL, #2
+ "mov x24, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -137,43 +139,43 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"movi v2.4s, #0x0\n"
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ldr q29, [x21, x25]\n"
- "ldr q28, [x20, x25]\n"
- "ldr q27, [x21, x24]\n"
- "ldr q26, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"saddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
"saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"saddl v21.8h, v29.8b, v28.8b\n"
- "subs x22, x22, #0x1\n"
"saddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q30, [x20, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"saddl v19.8h, v27.8b, v26.8b\n"
- "ldr q29, [x21, x25]\n"
"saddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q28, [x20, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"saddl v17.8h, v25.8b, v24.8b\n"
- "ldr q27, [x21, x24]\n"
"saddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q26, [x20, x24]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
+ "subs x23, x23, #0x1\n"
"saddw v15.4s, v15.4s, v23.4h\n"
- "ldr q25, [x21, x23]\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q24, [x20, x23]\n"
"saddw v13.4s, v13.4s, v22.4h\n"
"saddw2 v12.4s, v12.4s, v22.8h\n"
+ "add x22, x22, #0x10\n"
"saddw v11.4s, v11.4s, v21.4h\n"
"saddw2 v10.4s, v10.4s, v21.8h\n"
"saddw v9.4s, v9.4s, v20.4h\n"
@@ -213,23 +215,23 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"saddw v1.4s, v1.4s, v16.4h\n"
"saddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "sxtl v23.8h, v31.8b\n"
- "ldr q29, [x21, x25]\n"
- "sxtl2 v22.8h, v31.16b\n"
- "ldr q27, [x21, x24]\n"
- "ldr q25, [x21, x23]\n"
- "sxtl v21.8h, v29.8b\n"
- "sxtl2 v20.8h, v29.16b\n"
- "sxtl v19.8h, v27.8b\n"
- "sxtl2 v18.8h, v27.16b\n"
- "sxtl v17.8h, v25.8b\n"
- "sxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v23.8h, v16.8b\n"
+ "sxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "sxtl v21.8h, v16.8b\n"
+ "sxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "sxtl v19.8h, v17.8b\n"
+ "sxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
"saddw v15.4s, v15.4s, v23.4h\n"
"saddw2 v14.4s, v14.4s, v23.8h\n"
"saddw v13.4s, v13.4s, v22.4h\n"
@@ -248,217 +250,217 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"saddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "movi v20.4s, #0x7f\n"
- "ld1r { v19.4s }, [%x[combined_rescale_value]]\n"
- "sub %x[n_channels], %x[n_channels], #0x40\n"
"ld1r { v18.4s }, [%x[left_shift]]\n"
+ "ld1r { v17.4s }, [%x[combined_rescale_value]]\n"
"srshl v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[right_shift]]\n"
- "not v16.16b, v20.16b\n"
"srshl v14.4s, v14.4s, v18.4s\n"
- "cmp %x[n_channels], #0x40\n"
+ "ld1r { v16.4s }, [%x[right_shift]]\n"
"srshl v13.4s, v13.4s, v18.4s\n"
"srshl v12.4s, v12.4s, v18.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x40\n"
"srshl v11.4s, v11.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "sqrdmulh v14.4s, v14.4s, v19.4s\n"
- "sqrdmulh v13.4s, v13.4s, v19.4s\n"
- "sqrdmulh v12.4s, v12.4s, v19.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
- "sqrdmulh v11.4s, v11.4s, v19.4s\n"
"srshl v10.4s, v10.4s, v18.4s\n"
+ "cmp %x[n_channels], #0x40\n"
"srshl v9.4s, v9.4s, v18.4s\n"
"srshl v8.4s, v8.4s, v18.4s\n"
- "srshl v11.4s, v11.4s, v17.4s\n"
- "sqrdmulh v10.4s, v10.4s, v19.4s\n"
- "sqrdmulh v9.4s, v9.4s, v19.4s\n"
- "sqrdmulh v8.4s, v8.4s, v19.4s\n"
"srshl v7.4s, v7.4s, v18.4s\n"
- "srshl v10.4s, v10.4s, v17.4s\n"
- "srshl v9.4s, v9.4s, v17.4s\n"
- "srshl v8.4s, v8.4s, v17.4s\n"
- "sqrdmulh v7.4s, v7.4s, v19.4s\n"
"srshl v6.4s, v6.4s, v18.4s\n"
"srshl v5.4s, v5.4s, v18.4s\n"
"srshl v4.4s, v4.4s, v18.4s\n"
- "srshl v7.4s, v7.4s, v17.4s\n"
- "sqrdmulh v6.4s, v6.4s, v19.4s\n"
- "sqrdmulh v5.4s, v5.4s, v19.4s\n"
- "sqrdmulh v4.4s, v4.4s, v19.4s\n"
"srshl v3.4s, v3.4s, v18.4s\n"
- "srshl v6.4s, v6.4s, v17.4s\n"
- "srshl v5.4s, v5.4s, v17.4s\n"
- "srshl v4.4s, v4.4s, v17.4s\n"
- "sqrdmulh v3.4s, v3.4s, v19.4s\n"
"srshl v2.4s, v2.4s, v18.4s\n"
"srshl v1.4s, v1.4s, v18.4s\n"
"srshl v0.4s, v0.4s, v18.4s\n"
- "srshl v3.4s, v3.4s, v17.4s\n"
- "sqrdmulh v2.4s, v2.4s, v19.4s\n"
- "sqrdmulh v1.4s, v1.4s, v19.4s\n"
- "sqrdmulh v0.4s, v0.4s, v19.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v17.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v17.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v17.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v17.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v17.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v17.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v17.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v17.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v17.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "srshl v10.4s, v10.4s, v16.4s\n"
+ "srshl v9.4s, v9.4s, v16.4s\n"
+ "srshl v8.4s, v8.4s, v16.4s\n"
+ "srshl v7.4s, v7.4s, v16.4s\n"
+ "srshl v6.4s, v6.4s, v16.4s\n"
+ "srshl v5.4s, v5.4s, v16.4s\n"
+ "srshl v4.4s, v4.4s, v16.4s\n"
+ "srshl v3.4s, v3.4s, v16.4s\n"
+ "srshl v2.4s, v2.4s, v16.4s\n"
+ "srshl v1.4s, v1.4s, v16.4s\n"
+ "srshl v0.4s, v0.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
- "srshl v2.4s, v2.4s, v17.4s\n"
- "srshl v1.4s, v1.4s, v17.4s\n"
- "srshl v0.4s, v0.4s, v17.4s\n"
- "smin v15.4s, v15.4s, v20.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v14.4s, v14.4s, v20.4s\n"
- "smin v13.4s, v13.4s, v20.4s\n"
- "smin v12.4s, v12.4s, v20.4s\n"
"smax v11.4s, v11.4s, v16.4s\n"
"smax v10.4s, v10.4s, v16.4s\n"
"smax v9.4s, v9.4s, v16.4s\n"
- "smin v11.4s, v11.4s, v20.4s\n"
- "smin v10.4s, v10.4s, v20.4s\n"
- "smin v9.4s, v9.4s, v20.4s\n"
"smax v8.4s, v8.4s, v16.4s\n"
"smax v7.4s, v7.4s, v16.4s\n"
"smax v6.4s, v6.4s, v16.4s\n"
- "smin v8.4s, v8.4s, v20.4s\n"
- "smin v7.4s, v7.4s, v20.4s\n"
- "smin v6.4s, v6.4s, v20.4s\n"
"smax v5.4s, v5.4s, v16.4s\n"
"smax v4.4s, v4.4s, v16.4s\n"
"smax v3.4s, v3.4s, v16.4s\n"
- "smin v5.4s, v5.4s, v20.4s\n"
- "smin v4.4s, v4.4s, v20.4s\n"
- "smin v3.4s, v3.4s, v20.4s\n"
"smax v2.4s, v2.4s, v16.4s\n"
"smax v1.4s, v1.4s, v16.4s\n"
"smax v0.4s, v0.4s, v16.4s\n"
- "smin v2.4s, v2.4s, v20.4s\n"
- "smin v1.4s, v1.4s, v20.4s\n"
- "smin v0.4s, v0.4s, v20.4s\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "smin v11.4s, v11.4s, v17.4s\n"
+ "smin v10.4s, v10.4s, v17.4s\n"
+ "smin v9.4s, v9.4s, v17.4s\n"
+ "smin v8.4s, v8.4s, v17.4s\n"
+ "smin v7.4s, v7.4s, v17.4s\n"
+ "smin v6.4s, v6.4s, v17.4s\n"
+ "smin v5.4s, v5.4s, v17.4s\n"
+ "smin v4.4s, v4.4s, v17.4s\n"
+ "smin v3.4s, v3.4s, v17.4s\n"
+ "smin v2.4s, v2.4s, v17.4s\n"
+ "smin v1.4s, v1.4s, v17.4s\n"
+ "smin v0.4s, v0.4s, v17.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
- "uzp1 v21.16b, v9.16b, v8.16b\n"
- "uzp1 v20.16b, v7.16b, v6.16b\n"
+ "uzp1 v18.16b, v9.16b, v8.16b\n"
+ "uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
- "uzp1 v19.16b, v3.16b, v2.16b\n"
- "uzp1 v18.16b, v1.16b, v0.16b\n"
+ "uzp1 v20.16b, v3.16b, v2.16b\n"
+ "uzp1 v19.16b, v1.16b, v0.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x25]\n"
"add x25, x25, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
+ "str q16, [%x[outptr], x24]\n"
"add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "saddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "subs x22, x22, #0x1\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q30, [x20, x26]\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "saddl v23.8h, v31.8b, v30.8b\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "sxtl v23.8h, v31.8b\n"
- "sxtl2 v22.8h, v31.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "sxtl v17.8h, v16.8b\n"
+ "sxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "movi v20.4s, #0x7f\n"
- "ld1r { v19.4s }, [%x[combined_rescale_value]]\n"
- "sub %x[n_channels], %x[n_channels], #0x10\n"
"ld1r { v18.4s }, [%x[left_shift]]\n"
+ "ld1r { v17.4s }, [%x[combined_rescale_value]]\n"
"srshl v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[right_shift]]\n"
- "not v16.16b, v20.16b\n"
"srshl v14.4s, v14.4s, v18.4s\n"
- "cmp %x[n_channels], #0x10\n"
+ "ld1r { v16.4s }, [%x[right_shift]]\n"
"srshl v13.4s, v13.4s, v18.4s\n"
"srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "sqrdmulh v14.4s, v14.4s, v19.4s\n"
- "sqrdmulh v13.4s, v13.4s, v19.4s\n"
- "sqrdmulh v12.4s, v12.4s, v19.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x10\n"
+ "sqrdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v17.4s\n"
+ "cmp %x[n_channels], #0x10\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v20.4s\n"
- "smin v14.4s, v14.4s, v20.4s\n"
- "smin v13.4s, v13.4s, v20.4s\n"
- "smin v12.4s, v12.4s, v20.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "add x26, x26, #0x10\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v15.4s, #0x0\n"
- "add %x[outptr], %x[outptr], x26\n"
"movi v14.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 24f\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
- "add x21, x21, x26\n"
- "add x20, x20, x26\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -519,21 +521,21 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"ldr b31, [x21], #0x1\n"
"ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "saddl v23.8h, v31.8b, v30.8b\n"
- "subs x22, x22, #0x1\n"
- "saddl2 v22.8h, v31.16b, v30.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "saddl v17.8h, v31.8b, v30.8b\n"
+ "saddl2 v16.8h, v31.16b, v30.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldr x21, [x19], #0x8\n"
- "add x21, x21, x26\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -579,43 +581,43 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 33f\n"
"ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "sxtl v23.8h, v31.8b\n"
- "subs x20, x20, #0x1\n"
- "sxtl2 v22.8h, v31.16b\n"
- "saddw v15.4s, v15.4s, v23.4h\n"
- "saddw2 v14.4s, v14.4s, v23.8h\n"
- "saddw v13.4s, v13.4s, v22.4h\n"
- "saddw2 v12.4s, v12.4s, v22.8h\n"
+ "sxtl v17.8h, v31.8b\n"
+ "sxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "saddw v15.4s, v15.4s, v17.4h\n"
+ "saddw2 v14.4s, v14.4s, v17.8h\n"
+ "saddw v13.4s, v13.4s, v16.4h\n"
+ "saddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "movi v20.4s, #0x7f\n"
- "ld1r { v19.4s }, [%x[combined_rescale_value]]\n"
- "not v16.16b, v20.16b\n"
"ld1r { v18.4s }, [%x[left_shift]]\n"
+ "ld1r { v17.4s }, [%x[combined_rescale_value]]\n"
"srshl v15.4s, v15.4s, v18.4s\n"
- "ld1r { v17.4s }, [%x[right_shift]]\n"
"srshl v14.4s, v14.4s, v18.4s\n"
+ "ld1r { v16.4s }, [%x[right_shift]]\n"
"srshl v13.4s, v13.4s, v18.4s\n"
"srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v19.4s\n"
- "sqrdmulh v14.4s, v14.4s, v19.4s\n"
- "sqrdmulh v13.4s, v13.4s, v19.4s\n"
- "sqrdmulh v12.4s, v12.4s, v19.4s\n"
- "srshl v15.4s, v15.4s, v17.4s\n"
- "srshl v14.4s, v14.4s, v17.4s\n"
- "srshl v13.4s, v13.4s, v17.4s\n"
- "srshl v12.4s, v12.4s, v17.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
"smax v15.4s, v15.4s, v16.4s\n"
"smax v14.4s, v14.4s, v16.4s\n"
"smax v13.4s, v13.4s, v16.4s\n"
"smax v12.4s, v12.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v20.4s\n"
- "smin v14.4s, v14.4s, v20.4s\n"
- "smin v13.4s, v13.4s, v20.4s\n"
- "smin v12.4s, v12.4s, v20.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "smin v15.4s, v15.4s, v17.4s\n"
+ "smin v14.4s, v14.4s, v17.4s\n"
+ "smin v13.4s, v13.4s, v17.4s\n"
+ "smin v12.4s, v12.4s, v17.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -661,12 +663,10 @@ void a64_s8q_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_valid_cells] "r" (n_valid_cells), [right_shift] "r" (&right_shift)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp
index ea7f7f89fe..68e7a98d0a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_s8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-struct a64_s8q_nhwc_max_generic_depthfirst
+struct a64_s8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_s8q_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
a64_s8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_s8q_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
index a077121991..f7b8dc761c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,8 @@
*/
#include "pooling.hpp"
-#include <cstddef>
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -42,88 +42,88 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
- "mov x19, %x[inptrs]\n"
"movi v7.16b, #0x80\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x80\n"
"movi v5.16b, #0x80\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "smax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax v22.16b, v31.16b, v30.16b\n"
- "ldr q3, [x23, x28]\n"
- "smax v18.16b, v29.16b, v28.16b\n"
- "smax v21.16b, v27.16b, v21.16b\n"
- "ldr q2, [x22, x28]\n"
- "smax v17.16b, v26.16b, v17.16b\n"
- "ldr q1, [x21, x28]\n"
- "smax v20.16b, v25.16b, v20.16b\n"
- "ldr q0, [x20, x28]\n"
- "smax v16.16b, v24.16b, v16.16b\n"
- "ldr q31, [x23, x27]\n"
+ "smax v23.16b, v4.16b, v3.16b\n"
+ "smax v19.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v22.16b, v2.16b, v1.16b\n"
+ "ldr q2, [x21, x26]\n"
+ "smax v18.16b, v27.16b, v21.16b\n"
+ "ldr q1, [x20, x26]\n"
+ "smax v21.16b, v0.16b, v31.16b\n"
+ "ldr q0, [x21, x24]\n"
+ "smax v17.16b, v26.16b, v20.16b\n"
+ "ldr q31, [x20, x24]\n"
+ "smax v20.16b, v30.16b, v29.16b\n"
+ "ldr q30, [x21, x23]\n"
+ "smax v16.16b, v25.16b, v24.16b\n"
+ "ldr q29, [x20, x23]\n"
"smax v19.16b, v23.16b, v19.16b\n"
- "ldr q30, [x22, x27]\n"
"smax v18.16b, v22.16b, v18.16b\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"smax v17.16b, v21.16b, v17.16b\n"
- "ldr q28, [x20, x27]\n"
"smax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x23, x26]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
"smax v8.16b, v8.16b, v19.16b\n"
- "ldr q21, [x22, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
"smax v7.16b, v7.16b, v18.16b\n"
- "ldr q26, [x21, x26]\n"
"smax v6.16b, v6.16b, v17.16b\n"
- "ldr q17, [x20, x26]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"smax v5.16b, v5.16b, v16.16b\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "smax v23.16b, v3.16b, v2.16b\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v22.16b, v31.16b, v30.16b\n"
- "smax v18.16b, v29.16b, v28.16b\n"
- "smax v21.16b, v27.16b, v21.16b\n"
- "smax v17.16b, v26.16b, v17.16b\n"
- "smax v20.16b, v25.16b, v20.16b\n"
- "smax v16.16b, v24.16b, v16.16b\n"
+ "smax v23.16b, v4.16b, v3.16b\n"
+ "smax v19.16b, v28.16b, v22.16b\n"
+ "smax v22.16b, v2.16b, v1.16b\n"
+ "smax v18.16b, v27.16b, v21.16b\n"
+ "smax v21.16b, v0.16b, v31.16b\n"
+ "smax v17.16b, v26.16b, v20.16b\n"
+ "smax v20.16b, v30.16b, v29.16b\n"
+ "smax v16.16b, v25.16b, v24.16b\n"
"smax v19.16b, v23.16b, v19.16b\n"
"smax v18.16b, v22.16b, v18.16b\n"
"smax v17.16b, v21.16b, v17.16b\n"
@@ -133,453 +133,453 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"smax v6.16b, v6.16b, v17.16b\n"
"smax v5.16b, v5.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "smax v8.16b, v8.16b, v3.16b\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "smax v7.16b, v7.16b, v31.16b\n"
- "ldr q25, [x23, x25]\n"
- "smax v6.16b, v6.16b, v27.16b\n"
- "smax v5.16b, v5.16b, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "smax v7.16b, v7.16b, v17.16b\n"
+ "smax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "smax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
"sxtl v23.8h, v8.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
"sxtl2 v22.8h, v8.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v4.4s }, [x20]\n"
"sxtl v21.8h, v7.8b\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "sxtl2 v20.8h, v7.16b\n"
- "ld1r { v2.4s }, [x19]\n"
- "sub %x[n_channels], %x[n_channels], #0x40\n"
- "sxtl v19.8h, v6.8b\n"
- "cmp %x[n_channels], #0x40\n"
- "sxtl2 v18.8h, v6.16b\n"
+ "sxtl2 v18.8h, v7.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "sxtl v20.8h, v6.8b\n"
+ "sxtl2 v19.8h, v6.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v2.4s }, [x20]\n"
"sxtl v17.8h, v5.8b\n"
"sxtl2 v16.8h, v5.16b\n"
+ "sub %x[n_channels], %x[n_channels], #0x40\n"
+ "cmp %x[n_channels], #0x40\n"
"sxtl v1.4s, v23.4h\n"
"sxtl2 v23.4s, v23.8h\n"
"sxtl v0.4s, v22.4h\n"
"sxtl2 v31.4s, v22.8h\n"
"sxtl v30.4s, v21.4h\n"
"sxtl2 v22.4s, v21.8h\n"
- "sxtl v29.4s, v20.4h\n"
+ "sxtl v29.4s, v18.4h\n"
+ "sxtl2 v18.4s, v18.8h\n"
+ "sxtl v28.4s, v20.4h\n"
"sxtl2 v21.4s, v20.8h\n"
- "sxtl v28.4s, v19.4h\n"
- "sxtl2 v20.4s, v19.8h\n"
- "sxtl v27.4s, v18.4h\n"
- "sxtl2 v26.4s, v18.8h\n"
+ "sxtl v27.4s, v19.4h\n"
+ "sxtl2 v26.4s, v19.8h\n"
"sxtl v25.4s, v17.4h\n"
- "sxtl2 v19.4s, v17.8h\n"
+ "sxtl2 v20.4s, v17.8h\n"
"sxtl v24.4s, v16.4h\n"
- "sxtl2 v18.4s, v16.8h\n"
- "srshl v1.4s, v1.4s, v3.4s\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v1.4s, v1.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v0.4s, v0.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "sxtl2 v19.4s, v16.8h\n"
+ "srshl v1.4s, v1.4s, v4.4s\n"
+ "srshl v23.4s, v23.4s, v4.4s\n"
+ "srshl v0.4s, v0.4s, v4.4s\n"
+ "srshl v31.4s, v31.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v4.4s\n"
+ "srshl v29.4s, v29.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v4.4s\n"
+ "srshl v27.4s, v27.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v4.4s\n"
+ "srshl v25.4s, v25.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v4.4s\n"
+ "srshl v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v3.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v3.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v3.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v3.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v3.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v3.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v3.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v3.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v3.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v3.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v3.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v3.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v3.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v3.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v3.4s\n"
+ "movi v17.4s, #0x7f\n"
"srshl v1.4s, v1.4s, v2.4s\n"
"srshl v23.4s, v23.4s, v2.4s\n"
"srshl v0.4s, v0.4s, v2.4s\n"
"srshl v31.4s, v31.4s, v2.4s\n"
- "srshl v30.4s, v30.4s, v3.4s\n"
- "srshl v22.4s, v22.4s, v3.4s\n"
- "srshl v29.4s, v29.4s, v3.4s\n"
- "srshl v21.4s, v21.4s, v3.4s\n"
- "sqrdmulh v30.4s, v30.4s, v4.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v29.4s, v29.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
"srshl v30.4s, v30.4s, v2.4s\n"
"srshl v22.4s, v22.4s, v2.4s\n"
"srshl v29.4s, v29.4s, v2.4s\n"
- "srshl v21.4s, v21.4s, v2.4s\n"
- "srshl v28.4s, v28.4s, v3.4s\n"
- "srshl v20.4s, v20.4s, v3.4s\n"
- "srshl v27.4s, v27.4s, v3.4s\n"
- "srshl v26.4s, v26.4s, v3.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
"srshl v28.4s, v28.4s, v2.4s\n"
- "srshl v20.4s, v20.4s, v2.4s\n"
+ "srshl v21.4s, v21.4s, v2.4s\n"
"srshl v27.4s, v27.4s, v2.4s\n"
"srshl v26.4s, v26.4s, v2.4s\n"
- "srshl v25.4s, v25.4s, v3.4s\n"
- "srshl v19.4s, v19.4s, v3.4s\n"
- "srshl v24.4s, v24.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "sqrdmulh v24.4s, v24.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
"srshl v25.4s, v25.4s, v2.4s\n"
- "srshl v19.4s, v19.4s, v2.4s\n"
+ "srshl v20.4s, v20.4s, v2.4s\n"
"srshl v24.4s, v24.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "movi v17.4s, #0x7f\n"
+ "srshl v19.4s, v19.4s, v2.4s\n"
"not v16.16b, v17.16b\n"
"smax v1.4s, v1.4s, v16.4s\n"
"smax v23.4s, v23.4s, v16.4s\n"
"smax v0.4s, v0.4s, v16.4s\n"
"smax v31.4s, v31.4s, v16.4s\n"
+ "smax v30.4s, v30.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v29.4s, v29.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smax v28.4s, v28.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v27.4s, v27.4s, v16.4s\n"
+ "smax v26.4s, v26.4s, v16.4s\n"
+ "smax v25.4s, v25.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v24.4s, v24.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
"smin v1.4s, v1.4s, v17.4s\n"
"smin v23.4s, v23.4s, v17.4s\n"
"smin v0.4s, v0.4s, v17.4s\n"
"smin v31.4s, v31.4s, v17.4s\n"
- "smax v30.4s, v30.4s, v16.4s\n"
- "smax v22.4s, v22.4s, v16.4s\n"
- "smax v29.4s, v29.4s, v16.4s\n"
"smin v30.4s, v30.4s, v17.4s\n"
"smin v22.4s, v22.4s, v17.4s\n"
"smin v29.4s, v29.4s, v17.4s\n"
- "smax v21.4s, v21.4s, v16.4s\n"
- "smax v28.4s, v28.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
"smin v28.4s, v28.4s, v17.4s\n"
- "smin v20.4s, v20.4s, v17.4s\n"
- "smax v27.4s, v27.4s, v16.4s\n"
- "smax v26.4s, v26.4s, v16.4s\n"
- "smax v25.4s, v25.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
"smin v27.4s, v27.4s, v17.4s\n"
"smin v26.4s, v26.4s, v17.4s\n"
"smin v25.4s, v25.4s, v17.4s\n"
- "smax v19.4s, v19.4s, v16.4s\n"
- "smax v24.4s, v24.4s, v16.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
"smin v24.4s, v24.4s, v17.4s\n"
- "smin v18.4s, v18.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
"uzp1 v23.16b, v1.16b, v23.16b\n"
"uzp1 v16.16b, v0.16b, v31.16b\n"
"uzp1 v22.16b, v30.16b, v22.16b\n"
- "uzp1 v21.16b, v29.16b, v21.16b\n"
- "uzp1 v20.16b, v28.16b, v20.16b\n"
+ "uzp1 v18.16b, v29.16b, v18.16b\n"
+ "uzp1 v21.16b, v28.16b, v21.16b\n"
"uzp1 v17.16b, v27.16b, v26.16b\n"
- "uzp1 v19.16b, v25.16b, v19.16b\n"
- "uzp1 v18.16b, v24.16b, v18.16b\n"
+ "uzp1 v20.16b, v25.16b, v20.16b\n"
+ "uzp1 v19.16b, v24.16b, v19.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
- "add x28, x28, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
"str q16, [%x[outptr], x27]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
"add x27, x27, #0x40\n"
- "str q17, [%x[outptr], x26]\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "str q16, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
+ "str q17, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q16, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x80\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "smax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "smax v8.16b, v8.16b, v19.16b\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "smax v23.16b, v3.16b, v2.16b\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "smax v8.16b, v8.16b, v3.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "sxtl v23.8h, v8.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
- "sxtl2 v22.8h, v8.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "movi v17.4s, #0x7f\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "sxtl v1.4s, v23.4h\n"
- "ld1r { v2.4s }, [x19]\n"
- "not v16.16b, v17.16b\n"
- "sxtl2 v23.4s, v23.8h\n"
+ "sxtl v17.8h, v8.8b\n"
+ "sxtl2 v16.8h, v8.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "sxtl v21.4s, v17.4h\n"
+ "sxtl2 v20.4s, v17.8h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "sxtl v19.4s, v16.4h\n"
+ "sxtl2 v18.4s, v16.8h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v20.4s, v20.4s, v22.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "sxtl v0.4s, v22.4h\n"
"cmp %x[n_channels], #0x10\n"
- "sxtl2 v31.4s, v22.8h\n"
- "srshl v1.4s, v1.4s, v3.4s\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v1.4s, v1.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v0.4s, v0.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "srshl v1.4s, v1.4s, v2.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v2.4s\n"
- "srshl v31.4s, v31.4s, v2.4s\n"
- "smax v1.4s, v1.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v23.4s, v23.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
- "smin v31.4s, v31.4s, v17.4s\n"
- "uzp1 v23.16b, v1.16b, v23.16b\n"
- "uzp1 v16.16b, v0.16b, v31.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v17.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
+ "movi v17.4s, #0x7f\n"
+ "srshl v21.4s, v21.4s, v16.4s\n"
+ "srshl v20.4s, v20.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v16.4s\n"
+ "not v16.16b, v17.16b\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
+ "uzp1 v17.16b, v21.16b, v20.16b\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v8.16b, #0x80\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 24f\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v2.b }[14], [x22], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
- "ld1 { v0.b }[14], [x20], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v2.b }[12], [x22], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
- "ld1 { v0.b }[12], [x20], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v2.b }[10], [x22], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
- "ld1 { v0.b }[10], [x20], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v2.b }[8], [x22], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
- "ld1 { v0.b }[8], [x20], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v2.b }[6], [x22], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
- "ld1 { v0.b }[6], [x20], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v2.b }[4], [x22], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
- "ld1 { v0.b }[4], [x20], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v2.b }[2], [x22], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
- "ld1 { v0.b }[2], [x20], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b2, [x22], #0x1\n"
- "ldr b1, [x21], #0x1\n"
- "ldr b0, [x20], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "smax v23.16b, v3.16b, v2.16b\n"
- "subs x24, x24, #0x1\n"
- "smax v19.16b, v1.16b, v0.16b\n"
- "smax v19.16b, v23.16b, v19.16b\n"
- "smax v8.16b, v8.16b, v19.16b\n"
+ "smax v17.16b, v4.16b, v3.16b\n"
+ "smax v16.16b, v28.16b, v22.16b\n"
+ "subs x25, x25, #0x1\n"
+ "smax v16.16b, v17.16b, v16.16b\n"
+ "smax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b3, [x23], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "smax v8.16b, v8.16b, v3.16b\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "smax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "sxtl v23.8h, v8.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1r { v4.4s }, [x19]\n"
- "sxtl2 v22.8h, v8.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "sxtl v17.8h, v8.8b\n"
+ "sxtl2 v16.8h, v8.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v22.4s }, [x20]\n"
+ "sxtl v21.4s, v17.4h\n"
+ "sxtl2 v20.4s, v17.8h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "sxtl v19.4s, v16.4h\n"
+ "sxtl2 v18.4s, v16.8h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v22.4s\n"
+ "srshl v20.4s, v20.4s, v22.4s\n"
+ "srshl v19.4s, v19.4s, v22.4s\n"
+ "srshl v18.4s, v18.4s, v22.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v17.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v17.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v17.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v17.4s\n"
"movi v17.4s, #0x7f\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "sxtl v1.4s, v23.4h\n"
- "ld1r { v2.4s }, [x19]\n"
+ "srshl v21.4s, v21.4s, v16.4s\n"
+ "srshl v20.4s, v20.4s, v16.4s\n"
+ "srshl v19.4s, v19.4s, v16.4s\n"
+ "srshl v18.4s, v18.4s, v16.4s\n"
"not v16.16b, v17.16b\n"
- "sxtl2 v23.4s, v23.8h\n"
- "sxtl v0.4s, v22.4h\n"
- "sxtl2 v31.4s, v22.8h\n"
- "srshl v1.4s, v1.4s, v3.4s\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v0.4s, v0.4s, v3.4s\n"
- "srshl v31.4s, v31.4s, v3.4s\n"
- "sqrdmulh v1.4s, v1.4s, v4.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v0.4s, v0.4s, v4.4s\n"
- "sqrdmulh v31.4s, v31.4s, v4.4s\n"
- "srshl v1.4s, v1.4s, v2.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v0.4s, v0.4s, v2.4s\n"
- "srshl v31.4s, v31.4s, v2.4s\n"
- "smax v1.4s, v1.4s, v16.4s\n"
- "smax v23.4s, v23.4s, v16.4s\n"
- "smax v0.4s, v0.4s, v16.4s\n"
- "smax v31.4s, v31.4s, v16.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v23.4s, v23.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
- "smin v31.4s, v31.4s, v17.4s\n"
- "uzp1 v23.16b, v1.16b, v23.16b\n"
- "uzp1 v16.16b, v0.16b, v31.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v17.4s\n"
+ "smin v20.4s, v20.4s, v17.4s\n"
+ "smin v19.4s, v19.4s, v17.4s\n"
+ "smin v18.4s, v18.4s, v17.4s\n"
+ "uzp1 v17.16b, v21.16b, v20.16b\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -625,12 +625,10 @@ void a64_s8q_nhwc_max_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [quant_params] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp
index 230952452b..97818595e8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_u8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-struct a64_u8_nhwc_avg_generic_depthfirst
+struct a64_u8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_u8_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
a64_u8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_u8_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
index 2c8a29248d..f8984c451c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
@@ -84,26 +85,27 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
__asm__ __volatile__(
- "mov x26, #0x0\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x25, #0x20\n" // cntb _, ALL, #2
+ "mov x24, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
"movi v11.4s, #0x0\n"
@@ -118,43 +120,43 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"movi v2.4s, #0x0\n"
"movi v1.4s, #0x0\n"
"movi v0.4s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ldr q29, [x21, x25]\n"
- "ldr q28, [x20, x25]\n"
- "ldr q27, [x21, x24]\n"
- "ldr q26, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"uaddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
- "subs x22, x22, #0x1\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q30, [x20, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
- "ldr q29, [x21, x25]\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q28, [x20, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"uaddl v17.8h, v25.8b, v24.8b\n"
- "ldr q27, [x21, x24]\n"
"uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q26, [x20, x24]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
+ "subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
- "ldr q25, [x21, x23]\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q24, [x20, x23]\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -194,23 +196,23 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"uaddw v1.4s, v1.4s, v16.4h\n"
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "uxtl v23.8h, v31.8b\n"
- "ldr q29, [x21, x25]\n"
- "uxtl2 v22.8h, v31.16b\n"
- "ldr q27, [x21, x24]\n"
- "ldr q25, [x21, x23]\n"
- "uxtl v21.8h, v29.8b\n"
- "uxtl2 v20.8h, v29.16b\n"
- "uxtl v19.8h, v27.8b\n"
- "uxtl2 v18.8h, v27.16b\n"
- "uxtl v17.8h, v25.8b\n"
- "uxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v23.8h, v16.8b\n"
+ "uxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "uxtl v21.8h, v16.8b\n"
+ "uxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "uxtl v19.8h, v17.8b\n"
+ "uxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
@@ -229,195 +231,195 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "movi v19.4s, #0x0\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
- "sub %x[n_channels], %x[n_channels], #0x40\n"
- "movi v17.4s, #0xff\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
"ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
- "sqdmulh v11.4s, v11.4s, v18.4s\n"
+ "sqdmulh v11.4s, v11.4s, v17.4s\n"
+ "sqdmulh v10.4s, v10.4s, v17.4s\n"
+ "sqdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqdmulh v8.4s, v8.4s, v17.4s\n"
+ "sqdmulh v7.4s, v7.4s, v17.4s\n"
+ "sqdmulh v6.4s, v6.4s, v17.4s\n"
+ "sqdmulh v5.4s, v5.4s, v17.4s\n"
+ "sqdmulh v4.4s, v4.4s, v17.4s\n"
+ "sqdmulh v3.4s, v3.4s, v17.4s\n"
+ "sqdmulh v2.4s, v2.4s, v17.4s\n"
+ "sqdmulh v1.4s, v1.4s, v17.4s\n"
+ "sqdmulh v0.4s, v0.4s, v17.4s\n"
"srshl v15.4s, v15.4s, v16.4s\n"
"srshl v14.4s, v14.4s, v16.4s\n"
"srshl v13.4s, v13.4s, v16.4s\n"
"srshl v12.4s, v12.4s, v16.4s\n"
"srshl v11.4s, v11.4s, v16.4s\n"
- "sqdmulh v10.4s, v10.4s, v18.4s\n"
- "sqdmulh v9.4s, v9.4s, v18.4s\n"
- "sqdmulh v8.4s, v8.4s, v18.4s\n"
- "sqdmulh v7.4s, v7.4s, v18.4s\n"
"srshl v10.4s, v10.4s, v16.4s\n"
"srshl v9.4s, v9.4s, v16.4s\n"
"srshl v8.4s, v8.4s, v16.4s\n"
"srshl v7.4s, v7.4s, v16.4s\n"
- "sqdmulh v6.4s, v6.4s, v18.4s\n"
- "sqdmulh v5.4s, v5.4s, v18.4s\n"
- "sqdmulh v4.4s, v4.4s, v18.4s\n"
- "sqdmulh v3.4s, v3.4s, v18.4s\n"
"srshl v6.4s, v6.4s, v16.4s\n"
"srshl v5.4s, v5.4s, v16.4s\n"
"srshl v4.4s, v4.4s, v16.4s\n"
"srshl v3.4s, v3.4s, v16.4s\n"
- "sqdmulh v2.4s, v2.4s, v18.4s\n"
- "sqdmulh v1.4s, v1.4s, v18.4s\n"
- "sqdmulh v0.4s, v0.4s, v18.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
"srshl v2.4s, v2.4s, v16.4s\n"
"srshl v1.4s, v1.4s, v16.4s\n"
"srshl v0.4s, v0.4s, v16.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
- "smax v14.4s, v14.4s, v19.4s\n"
- "smax v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
- "smax v11.4s, v11.4s, v19.4s\n"
- "smax v10.4s, v10.4s, v19.4s\n"
- "smax v9.4s, v9.4s, v19.4s\n"
- "smin v11.4s, v11.4s, v17.4s\n"
- "smin v10.4s, v10.4s, v17.4s\n"
- "smin v9.4s, v9.4s, v17.4s\n"
- "smax v8.4s, v8.4s, v19.4s\n"
- "smax v7.4s, v7.4s, v19.4s\n"
- "smax v6.4s, v6.4s, v19.4s\n"
- "smin v8.4s, v8.4s, v17.4s\n"
- "smin v7.4s, v7.4s, v17.4s\n"
- "smin v6.4s, v6.4s, v17.4s\n"
- "smax v5.4s, v5.4s, v19.4s\n"
- "smax v4.4s, v4.4s, v19.4s\n"
- "smax v3.4s, v3.4s, v19.4s\n"
- "smin v5.4s, v5.4s, v17.4s\n"
- "smin v4.4s, v4.4s, v17.4s\n"
- "smin v3.4s, v3.4s, v17.4s\n"
- "smax v2.4s, v2.4s, v19.4s\n"
- "smax v1.4s, v1.4s, v19.4s\n"
- "smax v0.4s, v0.4s, v19.4s\n"
- "smin v2.4s, v2.4s, v17.4s\n"
- "smin v1.4s, v1.4s, v17.4s\n"
- "smin v0.4s, v0.4s, v17.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v7.4s, v7.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v16.4s\n"
+ "smax v5.4s, v5.4s, v16.4s\n"
+ "smax v4.4s, v4.4s, v16.4s\n"
+ "smax v3.4s, v3.4s, v16.4s\n"
+ "smax v2.4s, v2.4s, v16.4s\n"
+ "smax v1.4s, v1.4s, v16.4s\n"
+ "smax v0.4s, v0.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "smin v11.4s, v11.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v16.4s\n"
+ "smin v8.4s, v8.4s, v16.4s\n"
+ "smin v7.4s, v7.4s, v16.4s\n"
+ "smin v6.4s, v6.4s, v16.4s\n"
+ "smin v5.4s, v5.4s, v16.4s\n"
+ "smin v4.4s, v4.4s, v16.4s\n"
+ "smin v3.4s, v3.4s, v16.4s\n"
+ "smin v2.4s, v2.4s, v16.4s\n"
+ "smin v1.4s, v1.4s, v16.4s\n"
+ "smin v0.4s, v0.4s, v16.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
- "uzp1 v21.16b, v9.16b, v8.16b\n"
- "uzp1 v20.16b, v7.16b, v6.16b\n"
+ "uzp1 v18.16b, v9.16b, v8.16b\n"
+ "uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
- "uzp1 v19.16b, v3.16b, v2.16b\n"
- "uzp1 v18.16b, v1.16b, v0.16b\n"
+ "uzp1 v20.16b, v3.16b, v2.16b\n"
+ "uzp1 v19.16b, v1.16b, v0.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x25]\n"
"add x25, x25, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
+ "str q16, [%x[outptr], x24]\n"
"add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"movi v15.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v14.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "subs x22, x22, #0x1\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q30, [x20, x26]\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "movi v19.4s, #0x0\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
- "sub %x[n_channels], %x[n_channels], #0x10\n"
- "movi v17.4s, #0xff\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
"ld1r { v16.4s }, [%x[shift_ptr]]\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
"srshl v15.4s, v15.4s, v16.4s\n"
"srshl v14.4s, v14.4s, v16.4s\n"
"srshl v13.4s, v13.4s, v16.4s\n"
"srshl v12.4s, v12.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
- "smax v14.4s, v14.4s, v19.4s\n"
- "smax v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v19.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "add x26, x26, #0x10\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "add %x[outptr], %x[outptr], x27\n"
"movi v15.4s, #0x0\n"
- "add %x[outptr], %x[outptr], x26\n"
"movi v14.4s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"movi v13.4s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"movi v12.4s, #0x0\n"
- "cbz x22, 24f\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
- "add x21, x21, x26\n"
- "add x20, x20, x26\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -478,21 +480,21 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"ldr b31, [x21], #0x1\n"
"ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "subs x22, x22, #0x1\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldr x21, [x19], #0x8\n"
- "add x21, x21, x26\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -538,38 +540,38 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 33f\n"
"ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "uxtl v23.8h, v31.8b\n"
- "subs x20, x20, #0x1\n"
- "uxtl2 v22.8h, v31.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uxtl v17.8h, v31.8b\n"
+ "uxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "movi v19.4s, #0x0\n"
- "ld1r { v18.4s }, [%x[rescale_ptr]]\n"
- "movi v17.4s, #0xff\n"
+ "ld1r { v17.4s }, [%x[rescale_ptr]]\n"
"ld1r { v16.4s }, [%x[shift_ptr]]\n"
- "sqdmulh v15.4s, v15.4s, v18.4s\n"
- "sqdmulh v14.4s, v14.4s, v18.4s\n"
- "sqdmulh v13.4s, v13.4s, v18.4s\n"
- "sqdmulh v12.4s, v12.4s, v18.4s\n"
+ "sqdmulh v15.4s, v15.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "sqdmulh v13.4s, v13.4s, v17.4s\n"
+ "sqdmulh v12.4s, v12.4s, v17.4s\n"
"srshl v15.4s, v15.4s, v16.4s\n"
"srshl v14.4s, v14.4s, v16.4s\n"
"srshl v13.4s, v13.4s, v16.4s\n"
"srshl v12.4s, v12.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
- "smax v14.4s, v14.4s, v19.4s\n"
- "smax v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v19.4s\n"
- "smin v15.4s, v15.4s, v17.4s\n"
- "smin v14.4s, v14.4s, v17.4s\n"
- "smin v13.4s, v13.4s, v17.4s\n"
- "smin v12.4s, v12.4s, v17.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -615,12 +617,10 @@ void a64_u8_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 0103de812d..9d160bf8f8 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,28 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst
+struct a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ using Parent = DepthfirstStrategy<uint8_t, uint8_t>;
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
+ a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 02c43ccaba..66cdb7f849 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstddef>
#include <cstdint>
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace pooling {
@@ -61,114 +63,115 @@ void a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
- "mov x14, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "cmp x15, #0x10\n"
- "ldp x12, x11, [x20, #0x0]\n"
- "ldp x10, x9, [x20, #0x10]\n"
- "ldp x28, x27, [x19, #0x0]\n"
- "ldp x26, x25, [x19, #0x10]\n"
- "ldp x24, x23, [x19, #0x20]\n"
- "ldp x22, x21, [x19, #0x30]\n"
- "ldr x20, [x19, #0x40]\n"
+ "ldr x16, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "cmp x16, #0x10\n"
+ "mov x15, #0x0\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x14, x13, [x21, #0x0]\n"
+ "mov x12, #0x0\n"
+ "ldp x11, x10, [x21, #0x10]\n"
+ "ldp x9, x28, [x20, #0x0]\n"
+ "ldp x27, x26, [x20, #0x10]\n"
+ "ldp x25, x24, [x20, #0x20]\n"
+ "ldp x23, x22, [x20, #0x30]\n"
+ "ldr x21, [x20, #0x40]\n"
"blt 3f\n"
- "ldr q30, [x27, x14]\n"
- "lsr x19, x15, #0x4\n"
- "ldr q29, [x24, x14]\n"
- "sub x15, x15, x19, LSL #4\n"
- "ldr q28, [x21, x14]\n"
- "subs x19, x19, #0x1\n"
- "ldr q27, [x25, x14]\n"
- "ldr q26, [x28, x14]\n"
- "ldr q25, [x23, x14]\n"
- "ldr q24, [x26, x14]\n"
- "ldr q23, [x22, x14]\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q30, [x28, x15]\n"
+ "ldr q29, [x25, x15]\n"
+ "lsr x20, x16, #0x4\n"
+ "sub x16, x16, x20, LSL #4\n"
+ "ldr q28, [x22, x15]\n"
+ "ldr q27, [x26, x15]\n"
+ "subs x20, x20, #0x1\n"
+ "ldr q26, [x9, x15]\n"
+ "ldr q25, [x27, x15]\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "ldr q22, [x21, x15]\n"
+ "add x15, x15, #0x10\n"
"beq 2f\n"
"1:" // Vector: Loop
"umax v21.16b, v30.16b, v29.16b\n"
- "ldr q30, [x27, x14]\n"
- "subs x19, x19, #0x1\n"
+ "ldr q30, [x28, x15]\n"
"umax v20.16b, v29.16b, v28.16b\n"
- "ldr q29, [x24, x14]\n"
+ "ldr q29, [x25, x15]\n"
+ "ldr q28, [x22, x15]\n"
"umax v19.16b, v27.16b, v26.16b\n"
- "ldr q28, [x21, x14]\n"
+ "ldr q26, [x9, x15]\n"
"umax v18.16b, v25.16b, v24.16b\n"
- "ldr q26, [x28, x14]\n"
- "umax v17.16b, v23.16b, v27.16b\n"
- "ldr q27, [x25, x14]\n"
- "umax v16.16b, v25.16b, v22.16b\n"
- "ldr q25, [x23, x14]\n"
+ "ldr q25, [x27, x15]\n"
+ "umax v17.16b, v27.16b, v23.16b\n"
+ "ldr q27, [x26, x15]\n"
+ "umax v16.16b, v24.16b, v22.16b\n"
+ "ldr q24, [x24, x15]\n"
+ "ldr q23, [x23, x15]\n"
+ "subs x20, x20, #0x1\n"
"umax v19.16b, v21.16b, v19.16b\n"
- "ldr q24, [x26, x14]\n"
- "umax v18.16b, v21.16b, v18.16b\n"
- "ldr q23, [x22, x14]\n"
- "umax v17.16b, v20.16b, v17.16b\n"
- "ldr q22, [x20, x14]\n"
- "add x14, x14, #0x10\n"
+ "ldr q22, [x21, x15]\n"
+ "umax v18.16b, v18.16b, v21.16b\n"
+ "umax v17.16b, v17.16b, v20.16b\n"
+ "add x15, x15, #0x10\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "str q19, [x12, x13]\n"
- "str q18, [x11, x13]\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
+ "str q19, [x14, x12]\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
"bgt 1b\n"
"2:" // Vector: Tail
"umax v21.16b, v30.16b, v29.16b\n"
"umax v20.16b, v29.16b, v28.16b\n"
- "umax v19.16b, v27.16b, v26.16b\n"
+ "umax v16.16b, v27.16b, v26.16b\n"
"umax v18.16b, v25.16b, v24.16b\n"
- "umax v17.16b, v23.16b, v27.16b\n"
- "umax v16.16b, v25.16b, v22.16b\n"
- "umax v19.16b, v21.16b, v19.16b\n"
- "str q19, [x12, x13]\n"
- "umax v18.16b, v21.16b, v18.16b\n"
- "umax v17.16b, v20.16b, v17.16b\n"
- "str q18, [x11, x13]\n"
- "umax v16.16b, v20.16b, v16.16b\n"
- "str q17, [x10, x13]\n"
- "str q16, [x9, x13]\n"
- "add x13, x13, #0x10\n"
- "cbz x15, 4f\n"
+ "umax v17.16b, v27.16b, v23.16b\n"
+ "umax v19.16b, v24.16b, v22.16b\n"
+ "umax v16.16b, v21.16b, v16.16b\n"
+ "umax v18.16b, v18.16b, v21.16b\n"
+ "str q16, [x14, x12]\n"
+ "umax v17.16b, v17.16b, v20.16b\n"
+ "umax v16.16b, v20.16b, v19.16b\n"
+ "str q18, [x13, x12]\n"
+ "str q17, [x11, x12]\n"
+ "str q16, [x10, x12]\n"
+ "add x12, x12, #0x10\n"
+ "cbz x16, 4f\n"
"3:" // Oddments
- "ldr b30, [x27, x14]\n"
- "subs x15, x15, #0x1\n"
- "ldr b29, [x24, x14]\n"
- "umax v21.16b, v30.16b, v29.16b\n"
- "ldr b28, [x21, x14]\n"
- "ldr b27, [x25, x14]\n"
- "umax v20.16b, v29.16b, v28.16b\n"
- "ldr b26, [x28, x14]\n"
- "ldr b25, [x23, x14]\n"
- "umax v19.16b, v27.16b, v26.16b\n"
- "ldr b24, [x26, x14]\n"
- "ldr b23, [x22, x14]\n"
- "umax v19.16b, v21.16b, v19.16b\n"
- "ldr b22, [x20, x14]\n"
- "add x14, x14, #0x1\n"
- "umax v18.16b, v25.16b, v24.16b\n"
- "str b19, [x12, x13]\n"
- "umax v17.16b, v23.16b, v27.16b\n"
- "umax v16.16b, v25.16b, v22.16b\n"
- "umax v18.16b, v21.16b, v18.16b\n"
- "str b18, [x11, x13]\n"
- "umax v17.16b, v20.16b, v17.16b\n"
- "umax v16.16b, v20.16b, v16.16b\n"
- "str b17, [x10, x13]\n"
- "str b16, [x9, x13]\n"
- "add x13, x13, #0x1\n"
+ "ldr b16, [x28, x15]\n"
+ "ldr b17, [x25, x15]\n"
+ "umax v23.16b, v16.16b, v17.16b\n"
+ "subs x16, x16, #0x1\n"
+ "ldr b16, [x22, x15]\n"
+ "ldr b22, [x26, x15]\n"
+ "umax v21.16b, v17.16b, v16.16b\n"
+ "ldr b16, [x9, x15]\n"
+ "ldr b17, [x27, x15]\n"
+ "umax v16.16b, v22.16b, v16.16b\n"
+ "umax v20.16b, v23.16b, v16.16b\n"
+ "ldr b19, [x24, x15]\n"
+ "ldr b16, [x23, x15]\n"
+ "umax v18.16b, v17.16b, v19.16b\n"
+ "umax v17.16b, v22.16b, v16.16b\n"
+ "ldr b16, [x21, x15]\n"
+ "umax v16.16b, v19.16b, v16.16b\n"
+ "add x15, x15, #0x1\n"
+ "umax v18.16b, v18.16b, v23.16b\n"
+ "umax v17.16b, v17.16b, v21.16b\n"
+ "umax v16.16b, v21.16b, v16.16b\n"
+ "str b20, [x14, x12]\n"
+ "str b18, [x13, x12]\n"
+ "str b17, [x11, x12]\n"
+ "str b16, [x10, x12]\n"
+ "add x12, x12, #0x1\n"
"bgt 3b\n"
"4:" // End
-
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
} // namespace pooling
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp
index 391af31d03..7d528ccc65 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_u8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-struct a64_u8_nhwc_max_generic_depthfirst
+struct a64_u8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_u8_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
a64_u8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_u8_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
index f9bbfd8b90..2ceef125ca 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,7 @@
*/
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -39,397 +40,395 @@ void a64_u8_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
"movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v5.16b, #0x0\n"
- "movi v4.16b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "umax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax v22.16b, v31.16b, v30.16b\n"
- "ldr q3, [x23, x28]\n"
- "umax v18.16b, v29.16b, v28.16b\n"
- "umax v21.16b, v27.16b, v21.16b\n"
- "ldr q2, [x22, x28]\n"
- "umax v17.16b, v26.16b, v17.16b\n"
- "ldr q1, [x21, x28]\n"
- "umax v20.16b, v25.16b, v20.16b\n"
- "ldr q0, [x20, x28]\n"
- "umax v16.16b, v24.16b, v16.16b\n"
- "ldr q31, [x23, x27]\n"
+ "umax v23.16b, v4.16b, v3.16b\n"
+ "umax v19.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v22.16b, v2.16b, v1.16b\n"
+ "ldr q2, [x21, x26]\n"
+ "umax v18.16b, v27.16b, v21.16b\n"
+ "ldr q1, [x20, x26]\n"
+ "umax v21.16b, v0.16b, v31.16b\n"
+ "ldr q0, [x21, x24]\n"
+ "umax v17.16b, v26.16b, v20.16b\n"
+ "ldr q31, [x20, x24]\n"
+ "umax v20.16b, v30.16b, v29.16b\n"
+ "ldr q30, [x21, x23]\n"
+ "umax v16.16b, v25.16b, v24.16b\n"
+ "ldr q29, [x20, x23]\n"
"umax v19.16b, v23.16b, v19.16b\n"
- "ldr q30, [x22, x27]\n"
"umax v18.16b, v22.16b, v18.16b\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"umax v17.16b, v21.16b, v17.16b\n"
- "ldr q28, [x20, x27]\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x23, x26]\n"
- "umax v7.16b, v7.16b, v19.16b\n"
- "ldr q21, [x22, x26]\n"
- "umax v6.16b, v6.16b, v18.16b\n"
- "ldr q26, [x21, x26]\n"
- "umax v5.16b, v5.16b, v17.16b\n"
- "ldr q17, [x20, x26]\n"
- "umax v4.16b, v4.16b, v16.16b\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "umax v8.16b, v8.16b, v19.16b\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v18.16b\n"
+ "umax v6.16b, v6.16b, v17.16b\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "umax v23.16b, v3.16b, v2.16b\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v22.16b, v31.16b, v30.16b\n"
- "umax v18.16b, v29.16b, v28.16b\n"
- "umax v21.16b, v27.16b, v21.16b\n"
- "umax v17.16b, v26.16b, v17.16b\n"
- "umax v20.16b, v25.16b, v20.16b\n"
- "umax v16.16b, v24.16b, v16.16b\n"
+ "umax v23.16b, v4.16b, v3.16b\n"
+ "umax v19.16b, v28.16b, v22.16b\n"
+ "umax v22.16b, v2.16b, v1.16b\n"
+ "umax v18.16b, v27.16b, v21.16b\n"
+ "umax v21.16b, v0.16b, v31.16b\n"
+ "umax v17.16b, v26.16b, v20.16b\n"
+ "umax v20.16b, v30.16b, v29.16b\n"
+ "umax v16.16b, v25.16b, v24.16b\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
"umax v17.16b, v21.16b, v17.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "umax v7.16b, v7.16b, v19.16b\n"
- "umax v6.16b, v6.16b, v18.16b\n"
- "umax v5.16b, v5.16b, v17.16b\n"
- "umax v4.16b, v4.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v7.16b, v7.16b, v18.16b\n"
+ "umax v6.16b, v6.16b, v17.16b\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "umax v7.16b, v7.16b, v3.16b\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "umax v6.16b, v6.16b, v31.16b\n"
- "ldr q25, [x23, x25]\n"
- "umax v5.16b, v5.16b, v27.16b\n"
- "umax v4.16b, v4.16b, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v17.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x40\n"
- "str q6, [%x[outptr], x27]\n"
- "add x27, x27, #0x40\n"
- "str q5, [%x[outptr], x26]\n"
- "add x26, x26, #0x40\n"
- "str q4, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
"cmp %x[n_channels], #0x40\n"
+ "str q8, [%x[outptr], x27]\n"
+ "str q7, [%x[outptr], x26]\n"
+ "add x27, x27, #0x40\n"
+ "add x26, x26, #0x40\n"
+ "str q6, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q5, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v7.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "umax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "umax v7.16b, v7.16b, v19.16b\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "umax v23.16b, v3.16b, v2.16b\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v7.16b, v7.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "umax v7.16b, v7.16b, v3.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "str q7, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
"cmp %x[n_channels], #0x10\n"
+ "str q8, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
- "movi v7.16b, #0x0\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 24f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v2.b }[14], [x22], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
- "ld1 { v0.b }[14], [x20], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v2.b }[12], [x22], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
- "ld1 { v0.b }[12], [x20], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v2.b }[10], [x22], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
- "ld1 { v0.b }[10], [x20], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v2.b }[8], [x22], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
- "ld1 { v0.b }[8], [x20], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v2.b }[6], [x22], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
- "ld1 { v0.b }[6], [x20], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v2.b }[4], [x22], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
- "ld1 { v0.b }[4], [x20], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v2.b }[2], [x22], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
- "ld1 { v0.b }[2], [x20], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b2, [x22], #0x1\n"
- "ldr b1, [x21], #0x1\n"
- "ldr b0, [x20], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "umax v23.16b, v3.16b, v2.16b\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v7.16b, v7.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "subs x25, x25, #0x1\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b3, [x23], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "umax v7.16b, v7.16b, v3.16b\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
"tbz %x[n_channels], #3, 38f\n"
- "st1 { v7.d }[0], [%x[outptr]], #0x8\n"
+ "st1 { v8.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
- "st1 { v7.s }[2], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[2], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 35f\n"
- "st1 { v7.h }[6], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[6], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[14], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[14], [%x[outptr]], #0x1\n"
"b 42f\n"
"35:" // Oddments: Store: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[12], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[12], [%x[outptr]], #0x1\n"
"b 42f\n"
"36:" // Oddments: Store: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 37f\n"
- "st1 { v7.h }[4], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[4], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[10], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[10], [%x[outptr]], #0x1\n"
"b 42f\n"
"37:" // Oddments: Store: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[8], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[8], [%x[outptr]], #0x1\n"
"b 42f\n"
"38:" // Oddments: Store: Bit 3: Unset
"tbz %x[n_channels], #2, 40f\n"
- "st1 { v7.s }[0], [%x[outptr]], #0x4\n"
+ "st1 { v8.s }[0], [%x[outptr]], #0x4\n"
"tbz %x[n_channels], #1, 39f\n"
- "st1 { v7.h }[2], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[2], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[6], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[6], [%x[outptr]], #0x1\n"
"b 42f\n"
"39:" // Oddments: Store: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[4], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[4], [%x[outptr]], #0x1\n"
"b 42f\n"
"40:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 41f\n"
- "st1 { v7.h }[0], [%x[outptr]], #0x2\n"
+ "st1 { v8.h }[0], [%x[outptr]], #0x2\n"
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[2], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[2], [%x[outptr]], #0x1\n"
"b 42f\n"
"41:" // Oddments: Store: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 42f\n"
- "st1 { v7.b }[0], [%x[outptr]], #0x1\n"
+ "st1 { v8.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp
index d46658f080..daf836f5d6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_u8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-struct a64_u8q_nhwc_avg_generic_depthfirst
+struct a64_u8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = a64_u8q_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
a64_u8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_u8q_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index a57fe6df68..31a3489e5c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,8 @@
*/
#include "pooling.hpp"
-#include <cstddef>
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
@@ -87,12 +87,13 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
@@ -118,20 +119,20 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
);
__asm__ __volatile__(
- "mov x26, #0x0\n"
- "mov x25, #0x10\n" // cntb _, ALL, #1
- "mov x24, #0x20\n" // cntb _, ALL, #2
- "mov x23, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x25, #0x20\n" // cntb _, ALL, #2
+ "mov x24, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
"ld1r { v15.4s }, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov v14.16b, v15.16b\n"
- "mov x19, %x[inptrs]\n"
"mov v13.16b, v15.16b\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"mov v12.16b, v15.16b\n"
"mov v11.16b, v15.16b\n"
+ "mov x22, %x[inptrs]\n"
"mov v10.16b, v15.16b\n"
"mov v9.16b, v15.16b\n"
"mov v8.16b, v15.16b\n"
@@ -143,43 +144,43 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"mov v2.16b, v15.16b\n"
"mov v1.16b, v15.16b\n"
"mov v0.16b, v15.16b\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ldr q29, [x21, x25]\n"
- "ldr q28, [x20, x25]\n"
- "ldr q27, [x21, x24]\n"
- "ldr q26, [x20, x24]\n"
- "ldr q25, [x21, x23]\n"
- "ldr q24, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
"uaddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
"uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
"uaddl v21.8h, v29.8b, v28.8b\n"
- "subs x22, x22, #0x1\n"
"uaddl2 v20.8h, v29.16b, v28.16b\n"
- "ldr q30, [x20, x26]\n"
+ "ldr q29, [x21, x26]\n"
+ "ldr q28, [x20, x26]\n"
"uaddl v19.8h, v27.8b, v26.8b\n"
- "ldr q29, [x21, x25]\n"
"uaddl2 v18.8h, v27.16b, v26.16b\n"
- "ldr q28, [x20, x25]\n"
+ "ldr q27, [x21, x25]\n"
+ "ldr q26, [x20, x25]\n"
"uaddl v17.8h, v25.8b, v24.8b\n"
- "ldr q27, [x21, x24]\n"
"uaddl2 v16.8h, v25.16b, v24.16b\n"
- "ldr q26, [x20, x24]\n"
+ "ldr q25, [x21, x24]\n"
+ "ldr q24, [x20, x24]\n"
+ "subs x23, x23, #0x1\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
- "ldr q25, [x21, x23]\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q24, [x20, x23]\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
"uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "add x22, x22, #0x10\n"
"uaddw v11.4s, v11.4s, v21.4h\n"
"uaddw2 v10.4s, v10.4s, v21.8h\n"
"uaddw v9.4s, v9.4s, v20.4h\n"
@@ -219,23 +220,23 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"uaddw v1.4s, v1.4s, v16.4h\n"
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "uxtl v23.8h, v31.8b\n"
- "ldr q29, [x21, x25]\n"
- "uxtl2 v22.8h, v31.16b\n"
- "ldr q27, [x21, x24]\n"
- "ldr q25, [x21, x23]\n"
- "uxtl v21.8h, v29.8b\n"
- "uxtl2 v20.8h, v29.16b\n"
- "uxtl v19.8h, v27.8b\n"
- "uxtl2 v18.8h, v27.16b\n"
- "uxtl v17.8h, v25.8b\n"
- "uxtl2 v16.8h, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v23.8h, v16.8b\n"
+ "uxtl2 v22.8h, v16.16b\n"
+ "ldr q16, [x20, x26]\n"
+ "ldr q17, [x20, x25]\n"
+ "uxtl v21.8h, v16.8b\n"
+ "uxtl2 v20.8h, v16.16b\n"
+ "ldr q16, [x20, x24]\n"
+ "uxtl v19.8h, v17.8b\n"
+ "uxtl2 v18.8h, v17.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
"uaddw v15.4s, v15.4s, v23.4h\n"
"uaddw2 v14.4s, v14.4s, v23.8h\n"
"uaddw v13.4s, v13.4s, v22.4h\n"
@@ -254,64 +255,62 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"uaddw2 v0.4s, v0.4s, v16.8h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "movi v21.4s, #0x0\n"
- "ld1r { v20.4s }, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "movi v19.4s, #0xff\n"
- "ld1r { v18.4s }, [%x[left_shift]]\n"
- "sub %x[n_channels], %x[n_channels], #0x40\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
+ "ld1r { v19.4s }, [%x[left_shift]]\n"
+ "ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "srshl v14.4s, v14.4s, v19.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
+ "srshl v13.4s, v13.4s, v19.4s\n"
+ "srshl v12.4s, v12.4s, v19.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v11.4s, v11.4s, v19.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x40\n"
+ "srshl v9.4s, v9.4s, v19.4s\n"
+ "srshl v8.4s, v8.4s, v19.4s\n"
"cmp %x[n_channels], #0x40\n"
- "srshl v14.4s, v14.4s, v18.4s\n"
- "ld1r { v16.4s }, [x19]\n"
- "srshl v13.4s, v13.4s, v18.4s\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "srshl v11.4s, v11.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v20.4s\n"
- "sqrdmulh v14.4s, v14.4s, v20.4s\n"
- "sqrdmulh v13.4s, v13.4s, v20.4s\n"
- "sqrdmulh v12.4s, v12.4s, v20.4s\n"
+ "srshl v7.4s, v7.4s, v19.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "srshl v5.4s, v5.4s, v19.4s\n"
+ "srshl v4.4s, v4.4s, v19.4s\n"
+ "srshl v3.4s, v3.4s, v19.4s\n"
+ "srshl v2.4s, v2.4s, v19.4s\n"
+ "srshl v1.4s, v1.4s, v19.4s\n"
+ "srshl v0.4s, v0.4s, v19.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v18.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v18.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v18.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v18.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v18.4s\n"
+ "sqrdmulh v8.4s, v8.4s, v18.4s\n"
+ "sqrdmulh v7.4s, v7.4s, v18.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v18.4s\n"
+ "sqrdmulh v5.4s, v5.4s, v18.4s\n"
+ "sqrdmulh v4.4s, v4.4s, v18.4s\n"
+ "sqrdmulh v3.4s, v3.4s, v18.4s\n"
+ "sqrdmulh v2.4s, v2.4s, v18.4s\n"
+ "sqrdmulh v1.4s, v1.4s, v18.4s\n"
+ "sqrdmulh v0.4s, v0.4s, v18.4s\n"
"srshl v15.4s, v15.4s, v17.4s\n"
"srshl v14.4s, v14.4s, v17.4s\n"
"srshl v13.4s, v13.4s, v17.4s\n"
"srshl v12.4s, v12.4s, v17.4s\n"
- "sqrdmulh v11.4s, v11.4s, v20.4s\n"
- "srshl v10.4s, v10.4s, v18.4s\n"
- "srshl v9.4s, v9.4s, v18.4s\n"
- "srshl v8.4s, v8.4s, v18.4s\n"
"srshl v11.4s, v11.4s, v17.4s\n"
- "sqrdmulh v10.4s, v10.4s, v20.4s\n"
- "sqrdmulh v9.4s, v9.4s, v20.4s\n"
- "sqrdmulh v8.4s, v8.4s, v20.4s\n"
- "srshl v7.4s, v7.4s, v18.4s\n"
"srshl v10.4s, v10.4s, v17.4s\n"
"srshl v9.4s, v9.4s, v17.4s\n"
"srshl v8.4s, v8.4s, v17.4s\n"
- "sqrdmulh v7.4s, v7.4s, v20.4s\n"
- "srshl v6.4s, v6.4s, v18.4s\n"
- "srshl v5.4s, v5.4s, v18.4s\n"
- "srshl v4.4s, v4.4s, v18.4s\n"
"srshl v7.4s, v7.4s, v17.4s\n"
- "sqrdmulh v6.4s, v6.4s, v20.4s\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sqrdmulh v4.4s, v4.4s, v20.4s\n"
- "srshl v3.4s, v3.4s, v18.4s\n"
"srshl v6.4s, v6.4s, v17.4s\n"
"srshl v5.4s, v5.4s, v17.4s\n"
"srshl v4.4s, v4.4s, v17.4s\n"
- "sqrdmulh v3.4s, v3.4s, v20.4s\n"
- "srshl v2.4s, v2.4s, v18.4s\n"
- "srshl v1.4s, v1.4s, v18.4s\n"
- "srshl v0.4s, v0.4s, v18.4s\n"
"srshl v3.4s, v3.4s, v17.4s\n"
- "sqrdmulh v2.4s, v2.4s, v20.4s\n"
- "sqrdmulh v1.4s, v1.4s, v20.4s\n"
- "sqrdmulh v0.4s, v0.4s, v20.4s\n"
- "add v15.4s, v15.4s, v16.4s\n"
"srshl v2.4s, v2.4s, v17.4s\n"
"srshl v1.4s, v1.4s, v17.4s\n"
"srshl v0.4s, v0.4s, v17.4s\n"
+ "add v15.4s, v15.4s, v16.4s\n"
"add v14.4s, v14.4s, v16.4s\n"
"add v13.4s, v13.4s, v16.4s\n"
"add v12.4s, v12.4s, v16.4s\n"
@@ -327,58 +326,60 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"add v2.4s, v2.4s, v16.4s\n"
"add v1.4s, v1.4s, v16.4s\n"
"add v0.4s, v0.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v21.4s\n"
- "smax v14.4s, v14.4s, v21.4s\n"
- "smax v13.4s, v13.4s, v21.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v21.4s\n"
- "smax v11.4s, v11.4s, v21.4s\n"
- "smax v10.4s, v10.4s, v21.4s\n"
- "smin v12.4s, v12.4s, v19.4s\n"
- "smin v11.4s, v11.4s, v19.4s\n"
- "smin v10.4s, v10.4s, v19.4s\n"
- "smax v9.4s, v9.4s, v21.4s\n"
- "smax v8.4s, v8.4s, v21.4s\n"
- "smax v7.4s, v7.4s, v21.4s\n"
- "smin v9.4s, v9.4s, v19.4s\n"
- "smin v8.4s, v8.4s, v19.4s\n"
- "smin v7.4s, v7.4s, v19.4s\n"
- "smax v6.4s, v6.4s, v21.4s\n"
- "smax v5.4s, v5.4s, v21.4s\n"
- "smax v4.4s, v4.4s, v21.4s\n"
- "smin v6.4s, v6.4s, v19.4s\n"
- "smin v5.4s, v5.4s, v19.4s\n"
- "smin v4.4s, v4.4s, v19.4s\n"
- "smax v3.4s, v3.4s, v21.4s\n"
- "smax v2.4s, v2.4s, v21.4s\n"
- "smax v1.4s, v1.4s, v21.4s\n"
- "smin v3.4s, v3.4s, v19.4s\n"
- "smin v2.4s, v2.4s, v19.4s\n"
- "smin v1.4s, v1.4s, v19.4s\n"
- "smax v0.4s, v0.4s, v21.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "smax v11.4s, v11.4s, v16.4s\n"
+ "smax v10.4s, v10.4s, v16.4s\n"
+ "smax v9.4s, v9.4s, v16.4s\n"
+ "smax v8.4s, v8.4s, v16.4s\n"
+ "smax v7.4s, v7.4s, v16.4s\n"
+ "smax v6.4s, v6.4s, v16.4s\n"
+ "smax v5.4s, v5.4s, v16.4s\n"
+ "smax v4.4s, v4.4s, v16.4s\n"
+ "smax v3.4s, v3.4s, v16.4s\n"
+ "smax v2.4s, v2.4s, v16.4s\n"
+ "smax v1.4s, v1.4s, v16.4s\n"
+ "smax v0.4s, v0.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "smin v11.4s, v11.4s, v16.4s\n"
+ "smin v10.4s, v10.4s, v16.4s\n"
+ "smin v9.4s, v9.4s, v16.4s\n"
+ "smin v8.4s, v8.4s, v16.4s\n"
+ "smin v7.4s, v7.4s, v16.4s\n"
+ "smin v6.4s, v6.4s, v16.4s\n"
+ "smin v5.4s, v5.4s, v16.4s\n"
+ "smin v4.4s, v4.4s, v16.4s\n"
+ "smin v3.4s, v3.4s, v16.4s\n"
+ "smin v2.4s, v2.4s, v16.4s\n"
+ "smin v1.4s, v1.4s, v16.4s\n"
+ "smin v0.4s, v0.4s, v16.4s\n"
"uzp1 v23.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "smin v0.4s, v0.4s, v19.4s\n"
"uzp1 v22.16b, v11.16b, v10.16b\n"
- "uzp1 v21.16b, v9.16b, v8.16b\n"
- "uzp1 v20.16b, v7.16b, v6.16b\n"
+ "uzp1 v18.16b, v9.16b, v8.16b\n"
+ "uzp1 v21.16b, v7.16b, v6.16b\n"
"uzp1 v17.16b, v5.16b, v4.16b\n"
- "uzp1 v19.16b, v3.16b, v2.16b\n"
- "uzp1 v18.16b, v1.16b, v0.16b\n"
+ "uzp1 v20.16b, v3.16b, v2.16b\n"
+ "uzp1 v19.16b, v1.16b, v0.16b\n"
"uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v18.16b, v22.16b, v18.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x40\n"
+ "uzp1 v17.16b, v21.16b, v17.16b\n"
+ "uzp1 v16.16b, v20.16b, v19.16b\n"
+ "str q18, [%x[outptr], x26]\n"
"add x26, x26, #0x40\n"
- "uzp1 v17.16b, v20.16b, v17.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x25]\n"
"add x25, x25, #0x40\n"
- "str q17, [%x[outptr], x24]\n"
+ "str q16, [%x[outptr], x24]\n"
"add x24, x24, #0x40\n"
- "str q16, [%x[outptr], x23]\n"
- "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
@@ -386,70 +387,68 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"blt 14f\n"
"8:" // Single vector of channels: Loop
"ld1r { v15.4s }, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov v14.16b, v15.16b\n"
- "mov x19, %x[inptrs]\n"
"mov v13.16b, v15.16b\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"mov v12.16b, v15.16b\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ldr q31, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ldr q30, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ldr q30, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "ldr q31, [x21, x26]\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "subs x22, x22, #0x1\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "ldr q30, [x20, x26]\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q31, [x21, x27]\n"
+ "ldr q30, [x20, x27]\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
+ "add x22, x22, #0x10\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q31, [x21, x26]\n"
- "uxtl v23.8h, v31.8b\n"
- "uxtl2 v22.8h, v31.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "uxtl v17.8h, v16.8b\n"
+ "uxtl2 v16.8h, v16.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "movi v21.4s, #0x0\n"
- "ld1r { v20.4s }, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "movi v19.4s, #0xff\n"
- "ld1r { v18.4s }, [%x[left_shift]]\n"
- "sub %x[n_channels], %x[n_channels], #0x10\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
+ "ld1r { v16.4s }, [%x[left_shift]]\n"
+ "ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "sqrdmulh v15.4s, v15.4s, v18.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sub %x[n_channels], %x[n_channels], #0x10\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v18.4s\n"
"cmp %x[n_channels], #0x10\n"
- "srshl v14.4s, v14.4s, v18.4s\n"
- "ld1r { v16.4s }, [x19]\n"
- "srshl v13.4s, v13.4s, v18.4s\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v20.4s\n"
- "sqrdmulh v14.4s, v14.4s, v20.4s\n"
- "sqrdmulh v13.4s, v13.4s, v20.4s\n"
- "sqrdmulh v12.4s, v12.4s, v20.4s\n"
"srshl v15.4s, v15.4s, v17.4s\n"
"srshl v14.4s, v14.4s, v17.4s\n"
"srshl v13.4s, v13.4s, v17.4s\n"
@@ -458,37 +457,39 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"add v14.4s, v14.4s, v16.4s\n"
"add v13.4s, v13.4s, v16.4s\n"
"add v12.4s, v12.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v21.4s\n"
- "smax v14.4s, v14.4s, v21.4s\n"
- "smax v13.4s, v13.4s, v21.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v21.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
- "smin v12.4s, v12.4s, v19.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
- "str q16, [%x[outptr], x26]\n"
- "add x26, x26, #0x10\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
"ld1r { v15.4s }, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "add %x[outptr], %x[outptr], x27\n"
"mov v14.16b, v15.16b\n"
- "add %x[outptr], %x[outptr], x26\n"
"mov v13.16b, v15.16b\n"
- "mov x19, %x[inptrs]\n"
"mov v12.16b, v15.16b\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
- "cbz x22, 24f\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 24f\n"
"15:" // Oddments: 2 inputs loop
+ "ldp x21, x20, [x22, #0x0]\n"
+ "add x22, x22, #0x10\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
+ "add x20, x20, x27\n"
"movi v30.16b, #0x0\n"
- "add x21, x21, x26\n"
- "add x20, x20, x26\n"
"tbz %x[n_channels], #3, 19f\n"
"ldr d31, [x21], #0x8\n"
"ldr d30, [x20], #0x8\n"
@@ -549,21 +550,21 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"ldr b31, [x21], #0x1\n"
"ldr b30, [x20], #0x1\n"
"23:" // Oddments: 2 inputs loop: Load: Bit 3: End
- "uaddl v23.8h, v31.8b, v30.8b\n"
- "subs x22, x22, #0x1\n"
- "uaddl2 v22.8h, v31.16b, v30.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uaddl v17.8h, v31.8b, v30.8b\n"
+ "uaddl2 v16.8h, v31.16b, v30.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x23, %x[n_valid_cells], #0x1\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
+ "ldr x21, [x22], #0x8\n"
+ "add x21, x21, x27\n"
"movi v31.16b, #0x0\n"
- "ldr x21, [x19], #0x8\n"
- "add x21, x21, x26\n"
"tbz %x[n_channels], #3, 29f\n"
"ldr d31, [x21], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
@@ -609,30 +610,28 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 33f\n"
"ldr b31, [x21], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "uxtl v23.8h, v31.8b\n"
- "subs x20, x20, #0x1\n"
- "uxtl2 v22.8h, v31.16b\n"
- "uaddw v15.4s, v15.4s, v23.4h\n"
- "uaddw2 v14.4s, v14.4s, v23.8h\n"
- "uaddw v13.4s, v13.4s, v22.4h\n"
- "uaddw2 v12.4s, v12.4s, v22.8h\n"
+ "uxtl v17.8h, v31.8b\n"
+ "uxtl2 v16.8h, v31.16b\n"
+ "subs x23, x23, #0x1\n"
+ "uaddw v15.4s, v15.4s, v17.4h\n"
+ "uaddw2 v14.4s, v14.4s, v17.8h\n"
+ "uaddw v13.4s, v13.4s, v16.4h\n"
+ "uaddw2 v12.4s, v12.4s, v16.8h\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "movi v21.4s, #0x0\n"
- "ld1r { v20.4s }, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "movi v19.4s, #0xff\n"
- "ld1r { v18.4s }, [%x[left_shift]]\n"
+ "ld1r { v16.4s }, [%x[left_shift]]\n"
+ "ld1r { v18.4s }, [%x[combined_rescale_value]]\n"
+ "srshl v15.4s, v15.4s, v16.4s\n"
+ "srshl v14.4s, v14.4s, v16.4s\n"
"ld1r { v17.4s }, [%x[right_shift]]\n"
- "srshl v15.4s, v15.4s, v18.4s\n"
- "ld1r { v16.4s }, [x19]\n"
- "srshl v14.4s, v14.4s, v18.4s\n"
- "srshl v13.4s, v13.4s, v18.4s\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "sqrdmulh v15.4s, v15.4s, v20.4s\n"
- "sqrdmulh v14.4s, v14.4s, v20.4s\n"
- "sqrdmulh v13.4s, v13.4s, v20.4s\n"
- "sqrdmulh v12.4s, v12.4s, v20.4s\n"
+ "srshl v13.4s, v13.4s, v16.4s\n"
+ "srshl v12.4s, v12.4s, v16.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "sqrdmulh v15.4s, v15.4s, v18.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v18.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v18.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v18.4s\n"
"srshl v15.4s, v15.4s, v17.4s\n"
"srshl v14.4s, v14.4s, v17.4s\n"
"srshl v13.4s, v13.4s, v17.4s\n"
@@ -641,17 +640,19 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"add v14.4s, v14.4s, v16.4s\n"
"add v13.4s, v13.4s, v16.4s\n"
"add v12.4s, v12.4s, v16.4s\n"
- "smax v15.4s, v15.4s, v21.4s\n"
- "smax v14.4s, v14.4s, v21.4s\n"
- "smax v13.4s, v13.4s, v21.4s\n"
- "smin v15.4s, v15.4s, v19.4s\n"
- "smin v14.4s, v14.4s, v19.4s\n"
- "smin v13.4s, v13.4s, v19.4s\n"
- "smax v12.4s, v12.4s, v21.4s\n"
- "uzp1 v23.16b, v15.16b, v14.16b\n"
- "smin v12.4s, v12.4s, v19.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v15.4s, v15.4s, v16.4s\n"
+ "smax v14.4s, v14.4s, v16.4s\n"
+ "smax v13.4s, v13.4s, v16.4s\n"
+ "smax v12.4s, v12.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v15.4s, v15.4s, v16.4s\n"
+ "smin v14.4s, v14.4s, v16.4s\n"
+ "smin v13.4s, v13.4s, v16.4s\n"
+ "smin v12.4s, v12.4s, v16.4s\n"
+ "uzp1 v17.16b, v15.16b, v14.16b\n"
"uzp1 v16.16b, v13.16b, v12.16b\n"
- "uzp1 v16.16b, v23.16b, v16.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -697,12 +698,10 @@ void a64_u8q_nhwc_avg_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [accumulator_init] "r" (&accumulator_init), [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [quant_params] "r" (&qp), [right_shift] "r" (&right_shift)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp
index 1b97b458c0..fa9600f83d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,19 +33,11 @@ namespace pooling {
void a64_u8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-struct a64_u8q_nhwc_max_generic_depthfirst
+struct a64_u8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = a64_u8q_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
a64_u8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return a64_u8q_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
index 0d196e097e..f4927c5536 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,8 @@
*/
#include "pooling.hpp"
-#include <cstddef>
#include <cstdint>
+#include <cstddef>
#if defined(__aarch64__)
@@ -42,583 +42,583 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "mov x28, #0x0\n"
- "mov x27, #0x10\n" // cntb _, ALL, #1
- "mov x26, #0x20\n" // cntb _, ALL, #2
- "mov x25, #0x30\n" // cntb _, ALL, #3
"cmp %x[n_channels], #0x40\n"
+ "mov x27, #0x0\n"
+ "mov x26, #0x10\n" // cntb _, ALL, #1
+ "mov x24, #0x20\n" // cntb _, ALL, #2
+ "mov x23, #0x30\n" // cntb _, ALL, #3
"blt 7f\n"
"1:" // 4-vectors of channels
- "movi v4.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"movi v8.16b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"movi v7.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
"movi v6.16b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
- "ldr q31, [x23, x27]\n"
- "ldr q30, [x22, x27]\n"
- "ldr q29, [x21, x27]\n"
- "ldr q28, [x20, x27]\n"
- "ldr q27, [x23, x26]\n"
- "ldr q21, [x22, x26]\n"
- "ldr q26, [x21, x26]\n"
- "ldr q17, [x20, x26]\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "movi v5.16b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldr q2, [x21, x26]\n"
+ "ldr q1, [x20, x26]\n"
+ "ldr q0, [x21, x24]\n"
+ "ldr q31, [x20, x24]\n"
+ "ldr q30, [x21, x23]\n"
+ "ldr q29, [x20, x23]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "umax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax v22.16b, v31.16b, v30.16b\n"
- "ldr q3, [x23, x28]\n"
- "umax v18.16b, v29.16b, v28.16b\n"
- "umax v21.16b, v27.16b, v21.16b\n"
- "ldr q2, [x22, x28]\n"
- "umax v17.16b, v26.16b, v17.16b\n"
- "ldr q1, [x21, x28]\n"
- "umax v20.16b, v25.16b, v20.16b\n"
- "ldr q0, [x20, x28]\n"
- "umax v16.16b, v24.16b, v16.16b\n"
- "ldr q31, [x23, x27]\n"
+ "umax v23.16b, v4.16b, v3.16b\n"
+ "umax v19.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v22.16b, v2.16b, v1.16b\n"
+ "ldr q2, [x21, x26]\n"
+ "umax v18.16b, v27.16b, v21.16b\n"
+ "ldr q1, [x20, x26]\n"
+ "umax v21.16b, v0.16b, v31.16b\n"
+ "ldr q0, [x21, x24]\n"
+ "umax v17.16b, v26.16b, v20.16b\n"
+ "ldr q31, [x20, x24]\n"
+ "umax v20.16b, v30.16b, v29.16b\n"
+ "ldr q30, [x21, x23]\n"
+ "umax v16.16b, v25.16b, v24.16b\n"
+ "ldr q29, [x20, x23]\n"
"umax v19.16b, v23.16b, v19.16b\n"
- "ldr q30, [x22, x27]\n"
"umax v18.16b, v22.16b, v18.16b\n"
- "ldr q29, [x21, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"umax v17.16b, v21.16b, v17.16b\n"
- "ldr q28, [x20, x27]\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "ldr q27, [x23, x26]\n"
- "umax v4.16b, v4.16b, v19.16b\n"
- "ldr q21, [x22, x26]\n"
- "umax v8.16b, v8.16b, v18.16b\n"
- "ldr q26, [x21, x26]\n"
- "umax v7.16b, v7.16b, v17.16b\n"
- "ldr q17, [x20, x26]\n"
- "umax v6.16b, v6.16b, v16.16b\n"
- "ldr q25, [x23, x25]\n"
- "ldr q20, [x22, x25]\n"
- "ldr q24, [x21, x25]\n"
- "ldr q16, [x20, x25]\n"
+ "ldr q27, [x21, x26]\n"
+ "ldr q21, [x20, x26]\n"
+ "subs x25, x25, #0x1\n"
+ "umax v8.16b, v8.16b, v19.16b\n"
+ "ldr q26, [x21, x24]\n"
+ "ldr q20, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v18.16b\n"
+ "umax v6.16b, v6.16b, v17.16b\n"
+ "ldr q25, [x21, x23]\n"
+ "ldr q24, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "umax v23.16b, v3.16b, v2.16b\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v22.16b, v31.16b, v30.16b\n"
- "umax v18.16b, v29.16b, v28.16b\n"
- "umax v21.16b, v27.16b, v21.16b\n"
- "umax v17.16b, v26.16b, v17.16b\n"
- "umax v20.16b, v25.16b, v20.16b\n"
- "umax v16.16b, v24.16b, v16.16b\n"
+ "umax v23.16b, v4.16b, v3.16b\n"
+ "umax v19.16b, v28.16b, v22.16b\n"
+ "umax v22.16b, v2.16b, v1.16b\n"
+ "umax v18.16b, v27.16b, v21.16b\n"
+ "umax v21.16b, v0.16b, v31.16b\n"
+ "umax v17.16b, v26.16b, v20.16b\n"
+ "umax v20.16b, v30.16b, v29.16b\n"
+ "umax v16.16b, v25.16b, v24.16b\n"
"umax v19.16b, v23.16b, v19.16b\n"
"umax v18.16b, v22.16b, v18.16b\n"
"umax v17.16b, v21.16b, v17.16b\n"
"umax v16.16b, v20.16b, v16.16b\n"
- "umax v4.16b, v4.16b, v19.16b\n"
- "umax v8.16b, v8.16b, v18.16b\n"
- "umax v7.16b, v7.16b, v17.16b\n"
- "umax v6.16b, v6.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v19.16b\n"
+ "umax v7.16b, v7.16b, v18.16b\n"
+ "umax v6.16b, v6.16b, v17.16b\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "umax v4.16b, v4.16b, v3.16b\n"
- "ldr q31, [x23, x27]\n"
- "ldr q27, [x23, x26]\n"
- "umax v8.16b, v8.16b, v31.16b\n"
- "ldr q25, [x23, x25]\n"
- "umax v7.16b, v7.16b, v27.16b\n"
- "umax v6.16b, v6.16b, v25.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "ldr q17, [x20, x26]\n"
+ "ldr q16, [x20, x24]\n"
+ "umax v7.16b, v7.16b, v17.16b\n"
+ "umax v6.16b, v6.16b, v16.16b\n"
+ "ldr q16, [x20, x23]\n"
+ "umax v5.16b, v5.16b, v16.16b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "uxtl v17.8h, v4.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v5.4s }, [x19]\n"
- "uxtl2 v16.8h, v4.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "uxtl v21.8h, v8.8b\n"
- "ld1r { v4.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "uxtl2 v20.8h, v8.16b\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "uxtl v19.8h, v7.8b\n"
- "ld1r { v2.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "uxtl2 v24.8h, v7.16b\n"
- "ld1r { v1.4s }, [x19]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1r { v4.4s }, [x20]\n"
+ "uxtl v23.8h, v8.8b\n"
+ "uxtl2 v24.8h, v8.16b\n"
+ "uxtl v22.8h, v7.8b\n"
+ "uxtl2 v21.8h, v7.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v3.4s }, [x20]\n"
+ "uxtl v20.8h, v6.8b\n"
+ "uxtl2 v17.8h, v6.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v2.4s }, [x20]\n"
+ "uxtl v19.8h, v5.8b\n"
+ "uxtl2 v18.8h, v5.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v1.4s }, [x20]\n"
+ "neg v4.4s, v4.4s\n"
+ "saddw v0.4s, v4.4s, v23.4h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "saddw2 v23.4s, v4.4s, v23.8h\n"
+ "saddw v31.4s, v4.4s, v24.4h\n"
"sub %x[n_channels], %x[n_channels], #0x40\n"
- "uxtl v0.8h, v6.8b\n"
"cmp %x[n_channels], #0x40\n"
- "uxtl2 v31.8h, v6.16b\n"
- "neg v5.4s, v5.4s\n"
- "movi v30.4s, #0x0\n"
- "movi v29.4s, #0xff\n"
- "saddw v23.4s, v5.4s, v17.4h\n"
- "saddw2 v18.4s, v5.4s, v17.8h\n"
- "saddw v17.4s, v5.4s, v16.4h\n"
- "saddw2 v16.4s, v5.4s, v16.8h\n"
- "saddw v22.4s, v5.4s, v21.4h\n"
- "saddw2 v21.4s, v5.4s, v21.8h\n"
- "saddw v28.4s, v5.4s, v20.4h\n"
- "saddw2 v20.4s, v5.4s, v20.8h\n"
- "saddw v27.4s, v5.4s, v19.4h\n"
- "saddw2 v19.4s, v5.4s, v19.8h\n"
+ "saddw2 v30.4s, v4.4s, v24.8h\n"
+ "saddw v29.4s, v4.4s, v22.4h\n"
+ "saddw2 v22.4s, v4.4s, v22.8h\n"
+ "saddw v28.4s, v4.4s, v21.4h\n"
+ "saddw2 v21.4s, v4.4s, v21.8h\n"
+ "saddw v27.4s, v4.4s, v20.4h\n"
+ "saddw2 v20.4s, v4.4s, v20.8h\n"
+ "saddw v26.4s, v4.4s, v17.4h\n"
+ "saddw2 v17.4s, v4.4s, v17.8h\n"
+ "saddw v25.4s, v4.4s, v19.4h\n"
+ "saddw2 v19.4s, v4.4s, v19.8h\n"
+ "saddw v24.4s, v4.4s, v18.4h\n"
+ "saddw2 v18.4s, v4.4s, v18.8h\n"
+ "srshl v0.4s, v0.4s, v3.4s\n"
"srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "srshl v17.4s, v17.4s, v3.4s\n"
- "srshl v16.4s, v16.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "srshl v17.4s, v17.4s, v2.4s\n"
- "srshl v16.4s, v16.4s, v2.4s\n"
+ "srshl v31.4s, v31.4s, v3.4s\n"
+ "srshl v30.4s, v30.4s, v3.4s\n"
+ "srshl v29.4s, v29.4s, v3.4s\n"
"srshl v22.4s, v22.4s, v3.4s\n"
- "srshl v21.4s, v21.4s, v3.4s\n"
"srshl v28.4s, v28.4s, v3.4s\n"
- "srshl v20.4s, v20.4s, v3.4s\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "sqrdmulh v28.4s, v28.4s, v4.4s\n"
- "sqrdmulh v20.4s, v20.4s, v4.4s\n"
- "srshl v22.4s, v22.4s, v2.4s\n"
- "srshl v21.4s, v21.4s, v2.4s\n"
- "srshl v28.4s, v28.4s, v2.4s\n"
- "srshl v20.4s, v20.4s, v2.4s\n"
+ "srshl v21.4s, v21.4s, v3.4s\n"
"srshl v27.4s, v27.4s, v3.4s\n"
- "srshl v19.4s, v19.4s, v3.4s\n"
- "add v23.4s, v23.4s, v1.4s\n"
- "add v18.4s, v18.4s, v1.4s\n"
- "sqrdmulh v27.4s, v27.4s, v4.4s\n"
- "sqrdmulh v19.4s, v19.4s, v4.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v16.4s, v16.4s, v1.4s\n"
- "srshl v27.4s, v27.4s, v2.4s\n"
- "srshl v19.4s, v19.4s, v2.4s\n"
- "add v22.4s, v22.4s, v1.4s\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "add v28.4s, v28.4s, v1.4s\n"
- "add v20.4s, v20.4s, v1.4s\n"
- "add v27.4s, v27.4s, v1.4s\n"
- "add v19.4s, v19.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v30.4s\n"
- "smax v18.4s, v18.4s, v30.4s\n"
- "smax v17.4s, v17.4s, v30.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smax v16.4s, v16.4s, v30.4s\n"
- "smax v22.4s, v22.4s, v30.4s\n"
- "smax v21.4s, v21.4s, v30.4s\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "smin v22.4s, v22.4s, v29.4s\n"
- "smin v21.4s, v21.4s, v29.4s\n"
- "smax v28.4s, v28.4s, v30.4s\n"
- "smax v20.4s, v20.4s, v30.4s\n"
- "smax v27.4s, v27.4s, v30.4s\n"
- "smin v28.4s, v28.4s, v29.4s\n"
- "smin v20.4s, v20.4s, v29.4s\n"
- "smin v27.4s, v27.4s, v29.4s\n"
- "smax v19.4s, v19.4s, v30.4s\n"
- "uzp1 v26.16b, v23.16b, v18.16b\n"
- "saddw v25.4s, v5.4s, v24.4h\n"
- "saddw2 v18.4s, v5.4s, v24.8h\n"
- "smin v19.4s, v19.4s, v29.4s\n"
+ "srshl v20.4s, v20.4s, v3.4s\n"
+ "srshl v26.4s, v26.4s, v3.4s\n"
+ "srshl v17.4s, v17.4s, v3.4s\n"
"srshl v25.4s, v25.4s, v3.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "srshl v24.4s, v24.4s, v3.4s\n"
"srshl v18.4s, v18.4s, v3.4s\n"
- "uzp1 v24.16b, v17.16b, v16.16b\n"
- "saddw v17.4s, v5.4s, v0.4h\n"
- "saddw2 v16.4s, v5.4s, v0.8h\n"
- "sqrdmulh v25.4s, v25.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "srshl v17.4s, v17.4s, v3.4s\n"
- "srshl v16.4s, v16.4s, v3.4s\n"
- "srshl v25.4s, v25.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "add v25.4s, v25.4s, v1.4s\n"
- "add v18.4s, v18.4s, v1.4s\n"
- "srshl v17.4s, v17.4s, v2.4s\n"
- "srshl v16.4s, v16.4s, v2.4s\n"
- "smax v25.4s, v25.4s, v30.4s\n"
- "smax v18.4s, v18.4s, v30.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v16.4s, v16.4s, v1.4s\n"
- "smin v25.4s, v25.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smax v17.4s, v17.4s, v30.4s\n"
- "smax v16.4s, v16.4s, v30.4s\n"
- "uzp1 v23.16b, v22.16b, v21.16b\n"
- "saddw v22.4s, v5.4s, v31.4h\n"
- "saddw2 v21.4s, v5.4s, v31.8h\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "srshl v22.4s, v22.4s, v3.4s\n"
- "srshl v21.4s, v21.4s, v3.4s\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "uzp1 v20.16b, v28.16b, v20.16b\n"
- "sqrdmulh v22.4s, v22.4s, v4.4s\n"
- "sqrdmulh v21.4s, v21.4s, v4.4s\n"
- "uzp1 v19.16b, v27.16b, v19.16b\n"
- "uzp1 v18.16b, v25.16b, v18.16b\n"
- "srshl v22.4s, v22.4s, v2.4s\n"
- "srshl v21.4s, v21.4s, v2.4s\n"
- "uzp1 v17.16b, v17.16b, v16.16b\n"
- "uzp1 v16.16b, v26.16b, v24.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "add v22.4s, v22.4s, v1.4s\n"
- "add x28, x28, #0x40\n"
- "add v21.4s, v21.4s, v1.4s\n"
- "uzp1 v16.16b, v23.16b, v20.16b\n"
+ "sqrdmulh v0.4s, v0.4s, v2.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v2.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v2.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v2.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v2.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v2.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v2.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v2.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v2.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v2.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v2.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v2.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v2.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v2.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v2.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v2.4s\n"
+ "srshl v0.4s, v0.4s, v1.4s\n"
+ "srshl v23.4s, v23.4s, v1.4s\n"
+ "srshl v31.4s, v31.4s, v1.4s\n"
+ "srshl v30.4s, v30.4s, v1.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "srshl v22.4s, v22.4s, v1.4s\n"
+ "srshl v28.4s, v28.4s, v1.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "srshl v27.4s, v27.4s, v1.4s\n"
+ "srshl v20.4s, v20.4s, v1.4s\n"
+ "srshl v26.4s, v26.4s, v1.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "srshl v19.4s, v19.4s, v1.4s\n"
+ "srshl v24.4s, v24.4s, v1.4s\n"
+ "srshl v18.4s, v18.4s, v1.4s\n"
+ "add v0.4s, v0.4s, v16.4s\n"
+ "add v23.4s, v23.4s, v16.4s\n"
+ "add v31.4s, v31.4s, v16.4s\n"
+ "add v30.4s, v30.4s, v16.4s\n"
+ "add v29.4s, v29.4s, v16.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v28.4s, v28.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v27.4s, v27.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v26.4s, v26.4s, v16.4s\n"
+ "add v17.4s, v17.4s, v16.4s\n"
+ "add v25.4s, v25.4s, v16.4s\n"
+ "add v19.4s, v19.4s, v16.4s\n"
+ "add v24.4s, v24.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v0.4s, v0.4s, v16.4s\n"
+ "smax v23.4s, v23.4s, v16.4s\n"
+ "smax v31.4s, v31.4s, v16.4s\n"
+ "smax v30.4s, v30.4s, v16.4s\n"
+ "smax v29.4s, v29.4s, v16.4s\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v28.4s, v28.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v27.4s, v27.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v26.4s, v26.4s, v16.4s\n"
+ "smax v17.4s, v17.4s, v16.4s\n"
+ "smax v25.4s, v25.4s, v16.4s\n"
+ "smax v19.4s, v19.4s, v16.4s\n"
+ "smax v24.4s, v24.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v0.4s, v0.4s, v16.4s\n"
+ "smin v23.4s, v23.4s, v16.4s\n"
+ "smin v31.4s, v31.4s, v16.4s\n"
+ "smin v30.4s, v30.4s, v16.4s\n"
+ "smin v29.4s, v29.4s, v16.4s\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v28.4s, v28.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v27.4s, v27.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v26.4s, v26.4s, v16.4s\n"
+ "smin v17.4s, v17.4s, v16.4s\n"
+ "smin v25.4s, v25.4s, v16.4s\n"
+ "smin v19.4s, v19.4s, v16.4s\n"
+ "smin v24.4s, v24.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v23.16b, v0.16b, v23.16b\n"
+ "uzp1 v16.16b, v31.16b, v30.16b\n"
+ "uzp1 v22.16b, v29.16b, v22.16b\n"
+ "uzp1 v21.16b, v28.16b, v21.16b\n"
+ "uzp1 v20.16b, v27.16b, v20.16b\n"
+ "uzp1 v17.16b, v26.16b, v17.16b\n"
+ "uzp1 v19.16b, v25.16b, v19.16b\n"
+ "uzp1 v18.16b, v24.16b, v18.16b\n"
+ "uzp1 v16.16b, v23.16b, v16.16b\n"
"str q16, [%x[outptr], x27]\n"
- "smax v22.4s, v22.4s, v30.4s\n"
"add x27, x27, #0x40\n"
- "smax v21.4s, v21.4s, v30.4s\n"
- "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "uzp1 v16.16b, v22.16b, v21.16b\n"
+ "uzp1 v17.16b, v20.16b, v17.16b\n"
"str q16, [%x[outptr], x26]\n"
- "smin v22.4s, v22.4s, v29.4s\n"
"add x26, x26, #0x40\n"
- "smin v21.4s, v21.4s, v29.4s\n"
- "uzp1 v16.16b, v22.16b, v21.16b\n"
- "uzp1 v16.16b, v17.16b, v16.16b\n"
- "str q16, [%x[outptr], x25]\n"
- "add x25, x25, #0x40\n"
+ "uzp1 v16.16b, v19.16b, v18.16b\n"
+ "str q17, [%x[outptr], x24]\n"
+ "add x24, x24, #0x40\n"
+ "str q16, [%x[outptr], x23]\n"
+ "add x23, x23, #0x40\n"
"bge 1b\n"
"cbz %x[n_channels], 43f\n"
"7:" // Single vector of channels
"cmp %x[n_channels], #0x10\n"
"blt 14f\n"
"8:" // Single vector of channels: Loop
- "movi v4.16b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "movi v8.16b, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q3, [x20, x27]\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "add x22, x22, #0x20\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "umax v23.16b, v3.16b, v2.16b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "ldr q3, [x23, x28]\n"
- "ldr q2, [x22, x28]\n"
- "umax v4.16b, v4.16b, v19.16b\n"
- "ldr q1, [x21, x28]\n"
- "ldr q0, [x20, x28]\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "ldr q4, [x21, x27]\n"
+ "ldr q3, [x20, x27]\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "ldp x21, x20, [x22, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "ldr q28, [x21, x27]\n"
+ "ldr q22, [x20, x27]\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
+ "add x22, x22, #0x20\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "umax v23.16b, v3.16b, v2.16b\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v4.16b, v4.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ldr q3, [x23, x28]\n"
- "umax v4.16b, v4.16b, v3.16b\n"
+ "ldr x20, [x22], #0x8\n"
+ "ldr q16, [x20, x27]\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "uxtl v17.8h, v4.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v5.4s }, [x19]\n"
- "uxtl2 v16.8h, v4.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "movi v30.4s, #0x0\n"
- "ld1r { v4.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "movi v29.4s, #0xff\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "neg v5.4s, v5.4s\n"
- "ld1r { v2.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "saddw v23.4s, v5.4s, v17.4h\n"
- "ld1r { v1.4s }, [x19]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "uxtl v17.8h, v8.8b\n"
+ "uxtl2 v16.8h, v8.16b\n"
+ "neg v18.4s, v18.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "saddw v22.4s, v18.4s, v17.4h\n"
+ "saddw2 v21.4s, v18.4s, v17.8h\n"
+ "saddw v20.4s, v18.4s, v16.4h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "saddw2 v18.4s, v18.4s, v16.8h\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
"sub %x[n_channels], %x[n_channels], #0x10\n"
- "saddw2 v18.4s, v5.4s, v17.8h\n"
"cmp %x[n_channels], #0x10\n"
- "saddw v17.4s, v5.4s, v16.4h\n"
- "saddw2 v16.4s, v5.4s, v16.8h\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "srshl v17.4s, v17.4s, v3.4s\n"
- "srshl v16.4s, v16.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "srshl v17.4s, v17.4s, v2.4s\n"
- "srshl v16.4s, v16.4s, v2.4s\n"
- "add v23.4s, v23.4s, v1.4s\n"
- "add v18.4s, v18.4s, v1.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v16.4s, v16.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v30.4s\n"
- "smax v18.4s, v18.4s, v30.4s\n"
- "smax v17.4s, v17.4s, v30.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smax v16.4s, v16.4s, v30.4s\n"
- "uzp1 v26.16b, v23.16b, v18.16b\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "uzp1 v24.16b, v17.16b, v16.16b\n"
- "uzp1 v16.16b, v26.16b, v24.16b\n"
- "str q16, [%x[outptr], x28]\n"
- "add x28, x28, #0x10\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v17.4s\n"
+ "srshl v21.4s, v21.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v17.4s\n"
+ "srshl v18.4s, v18.4s, v17.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v17.16b, v22.16b, v21.16b\n"
+ "uzp1 v16.16b, v20.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
+ "str q16, [%x[outptr], x27]\n"
+ "add x27, x27, #0x10\n"
"bge 8b\n"
"cbz %x[n_channels], 43f\n"
"14:" // Oddments
- "movi v4.16b, #0x0\n"
- "add %x[outptr], %x[outptr], x28\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 24f\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "add %x[outptr], %x[outptr], x27\n"
+ "movi v8.16b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 24f\n"
"15:" // Oddments: 4 inputs loop
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "add x23, x23, x27\n"
+ "add x22, x22, x27\n"
+ "add x21, x21, x27\n"
+ "movi v4.16b, #0x0\n"
"movi v3.16b, #0x0\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "add x23, x23, x28\n"
- "movi v2.16b, #0x0\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "movi v1.16b, #0x0\n"
- "add x19, x19, #0x20\n"
- "movi v0.16b, #0x0\n"
- "add x22, x22, x28\n"
- "add x21, x21, x28\n"
- "add x20, x20, x28\n"
+ "add x20, x20, x27\n"
+ "movi v28.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
"tbz %x[n_channels], #3, 19f\n"
- "ldr d3, [x23], #0x8\n"
- "ldr d2, [x22], #0x8\n"
- "ldr d1, [x21], #0x8\n"
- "ldr d0, [x20], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
+ "ldr d3, [x22], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
"tbz %x[n_channels], #2, 17f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
- "ld1 { v2.s }[2], [x22], #0x4\n"
- "ld1 { v1.s }[2], [x21], #0x4\n"
- "ld1 { v0.s }[2], [x20], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
+ "ld1 { v3.s }[2], [x22], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
"tbz %x[n_channels], #1, 16f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
- "ld1 { v2.h }[6], [x22], #0x2\n"
- "ld1 { v1.h }[6], [x21], #0x2\n"
- "ld1 { v0.h }[6], [x20], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
+ "ld1 { v3.h }[6], [x22], #0x2\n"
+ "ld1 { v28.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
- "ld1 { v2.b }[14], [x22], #0x1\n"
- "ld1 { v1.b }[14], [x21], #0x1\n"
- "ld1 { v0.b }[14], [x20], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
+ "ld1 { v3.b }[14], [x22], #0x1\n"
+ "ld1 { v28.b }[14], [x21], #0x1\n"
+ "ld1 { v22.b }[14], [x20], #0x1\n"
"b 23f\n"
"16:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
- "ld1 { v2.b }[12], [x22], #0x1\n"
- "ld1 { v1.b }[12], [x21], #0x1\n"
- "ld1 { v0.b }[12], [x20], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
+ "ld1 { v3.b }[12], [x22], #0x1\n"
+ "ld1 { v28.b }[12], [x21], #0x1\n"
+ "ld1 { v22.b }[12], [x20], #0x1\n"
"b 23f\n"
"17:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 18f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
- "ld1 { v2.h }[4], [x22], #0x2\n"
- "ld1 { v1.h }[4], [x21], #0x2\n"
- "ld1 { v0.h }[4], [x20], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
+ "ld1 { v3.h }[4], [x22], #0x2\n"
+ "ld1 { v28.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
- "ld1 { v2.b }[10], [x22], #0x1\n"
- "ld1 { v1.b }[10], [x21], #0x1\n"
- "ld1 { v0.b }[10], [x20], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
+ "ld1 { v3.b }[10], [x22], #0x1\n"
+ "ld1 { v28.b }[10], [x21], #0x1\n"
+ "ld1 { v22.b }[10], [x20], #0x1\n"
"b 23f\n"
"18:" // Oddments: 4 inputs loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
- "ld1 { v2.b }[8], [x22], #0x1\n"
- "ld1 { v1.b }[8], [x21], #0x1\n"
- "ld1 { v0.b }[8], [x20], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
+ "ld1 { v3.b }[8], [x22], #0x1\n"
+ "ld1 { v28.b }[8], [x21], #0x1\n"
+ "ld1 { v22.b }[8], [x20], #0x1\n"
"b 23f\n"
"19:" // Oddments: 4 inputs loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 21f\n"
- "ldr s3, [x23], #0x4\n"
- "ldr s2, [x22], #0x4\n"
- "ldr s1, [x21], #0x4\n"
- "ldr s0, [x20], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
+ "ldr s3, [x22], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
"tbz %x[n_channels], #1, 20f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
- "ld1 { v2.h }[2], [x22], #0x2\n"
- "ld1 { v1.h }[2], [x21], #0x2\n"
- "ld1 { v0.h }[2], [x20], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
+ "ld1 { v3.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
- "ld1 { v2.b }[6], [x22], #0x1\n"
- "ld1 { v1.b }[6], [x21], #0x1\n"
- "ld1 { v0.b }[6], [x20], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
+ "ld1 { v3.b }[6], [x22], #0x1\n"
+ "ld1 { v28.b }[6], [x21], #0x1\n"
+ "ld1 { v22.b }[6], [x20], #0x1\n"
"b 23f\n"
"20:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
- "ld1 { v2.b }[4], [x22], #0x1\n"
- "ld1 { v1.b }[4], [x21], #0x1\n"
- "ld1 { v0.b }[4], [x20], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
+ "ld1 { v3.b }[4], [x22], #0x1\n"
+ "ld1 { v28.b }[4], [x21], #0x1\n"
+ "ld1 { v22.b }[4], [x20], #0x1\n"
"b 23f\n"
"21:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 22f\n"
- "ldr h3, [x23], #0x2\n"
- "ldr h2, [x22], #0x2\n"
- "ldr h1, [x21], #0x2\n"
- "ldr h0, [x20], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
+ "ldr h3, [x22], #0x2\n"
+ "ldr h28, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
"tbz %x[n_channels], #0, 23f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
- "ld1 { v2.b }[2], [x22], #0x1\n"
- "ld1 { v1.b }[2], [x21], #0x1\n"
- "ld1 { v0.b }[2], [x20], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
+ "ld1 { v3.b }[2], [x22], #0x1\n"
+ "ld1 { v28.b }[2], [x21], #0x1\n"
+ "ld1 { v22.b }[2], [x20], #0x1\n"
"b 23f\n"
"22:" // Oddments: 4 inputs loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 23f\n"
- "ldr b3, [x23], #0x1\n"
- "ldr b2, [x22], #0x1\n"
- "ldr b1, [x21], #0x1\n"
- "ldr b0, [x20], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
+ "ldr b3, [x22], #0x1\n"
+ "ldr b28, [x21], #0x1\n"
+ "ldr b22, [x20], #0x1\n"
"23:" // Oddments: 4 inputs loop: Load: Bit 3: End
- "umax v23.16b, v3.16b, v2.16b\n"
- "subs x24, x24, #0x1\n"
- "umax v19.16b, v1.16b, v0.16b\n"
- "umax v19.16b, v23.16b, v19.16b\n"
- "umax v4.16b, v4.16b, v19.16b\n"
+ "umax v17.16b, v4.16b, v3.16b\n"
+ "umax v16.16b, v28.16b, v22.16b\n"
+ "subs x25, x25, #0x1\n"
+ "umax v16.16b, v17.16b, v16.16b\n"
+ "umax v8.16b, v8.16b, v16.16b\n"
"bgt 15b\n"
"24:" // Oddments: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 34f\n"
"25:" // Oddments: Single input loop
- "movi v3.16b, #0x0\n"
- "ldr x23, [x19], #0x8\n"
- "add x23, x23, x28\n"
+ "ldr x23, [x24], #0x8\n"
+ "add x23, x23, x27\n"
+ "movi v4.16b, #0x0\n"
"tbz %x[n_channels], #3, 29f\n"
- "ldr d3, [x23], #0x8\n"
+ "ldr d4, [x23], #0x8\n"
"tbz %x[n_channels], #2, 27f\n"
- "ld1 { v3.s }[2], [x23], #0x4\n"
+ "ld1 { v4.s }[2], [x23], #0x4\n"
"tbz %x[n_channels], #1, 26f\n"
- "ld1 { v3.h }[6], [x23], #0x2\n"
+ "ld1 { v4.h }[6], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[14], [x23], #0x1\n"
+ "ld1 { v4.b }[14], [x23], #0x1\n"
"b 33f\n"
"26:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[12], [x23], #0x1\n"
+ "ld1 { v4.b }[12], [x23], #0x1\n"
"b 33f\n"
"27:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset
"tbz %x[n_channels], #1, 28f\n"
- "ld1 { v3.h }[4], [x23], #0x2\n"
+ "ld1 { v4.h }[4], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[10], [x23], #0x1\n"
+ "ld1 { v4.b }[10], [x23], #0x1\n"
"b 33f\n"
"28:" // Oddments: Single input loop: Load: Bit 3: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[8], [x23], #0x1\n"
+ "ld1 { v4.b }[8], [x23], #0x1\n"
"b 33f\n"
"29:" // Oddments: Single input loop: Load: Bit 3: Unset
"tbz %x[n_channels], #2, 31f\n"
- "ldr s3, [x23], #0x4\n"
+ "ldr s4, [x23], #0x4\n"
"tbz %x[n_channels], #1, 30f\n"
- "ld1 { v3.h }[2], [x23], #0x2\n"
+ "ld1 { v4.h }[2], [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[6], [x23], #0x1\n"
+ "ld1 { v4.b }[6], [x23], #0x1\n"
"b 33f\n"
"30:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[4], [x23], #0x1\n"
+ "ld1 { v4.b }[4], [x23], #0x1\n"
"b 33f\n"
"31:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset
"tbz %x[n_channels], #1, 32f\n"
- "ldr h3, [x23], #0x2\n"
+ "ldr h4, [x23], #0x2\n"
"tbz %x[n_channels], #0, 33f\n"
- "ld1 { v3.b }[2], [x23], #0x1\n"
+ "ld1 { v4.b }[2], [x23], #0x1\n"
"b 33f\n"
"32:" // Oddments: Single input loop: Load: Bit 3: Unset: Bit 2: Unset: Bit 1: Unset
"tbz %x[n_channels], #0, 33f\n"
- "ldr b3, [x23], #0x1\n"
+ "ldr b4, [x23], #0x1\n"
"33:" // Oddments: Single input loop: Load: Bit 3: End
- "umax v4.16b, v4.16b, v3.16b\n"
- "subs x20, x20, #0x1\n"
+ "subs x21, x21, #0x1\n"
+ "umax v8.16b, v8.16b, v4.16b\n"
"bgt 25b\n"
"34:" // Oddments: Single input loop: End
- "uxtl v17.8h, v4.8b\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1r { v5.4s }, [x19]\n"
- "uxtl2 v16.8h, v4.16b\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "movi v30.4s, #0x0\n"
- "ld1r { v4.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- "movi v29.4s, #0xff\n"
- "ld1r { v3.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "neg v5.4s, v5.4s\n"
- "ld1r { v2.4s }, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "saddw v23.4s, v5.4s, v17.4h\n"
- "ld1r { v1.4s }, [x19]\n"
- "saddw2 v18.4s, v5.4s, v17.8h\n"
- "saddw v17.4s, v5.4s, v16.4h\n"
- "saddw2 v16.4s, v5.4s, v16.8h\n"
- "srshl v23.4s, v23.4s, v3.4s\n"
- "srshl v18.4s, v18.4s, v3.4s\n"
- "srshl v17.4s, v17.4s, v3.4s\n"
- "srshl v16.4s, v16.4s, v3.4s\n"
- "sqrdmulh v23.4s, v23.4s, v4.4s\n"
- "sqrdmulh v18.4s, v18.4s, v4.4s\n"
- "sqrdmulh v17.4s, v17.4s, v4.4s\n"
- "sqrdmulh v16.4s, v16.4s, v4.4s\n"
- "srshl v23.4s, v23.4s, v2.4s\n"
- "srshl v18.4s, v18.4s, v2.4s\n"
- "srshl v17.4s, v17.4s, v2.4s\n"
- "srshl v16.4s, v16.4s, v2.4s\n"
- "add v23.4s, v23.4s, v1.4s\n"
- "add v18.4s, v18.4s, v1.4s\n"
- "add v17.4s, v17.4s, v1.4s\n"
- "add v16.4s, v16.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v30.4s\n"
- "smax v18.4s, v18.4s, v30.4s\n"
- "smax v17.4s, v17.4s, v30.4s\n"
- "smin v23.4s, v23.4s, v29.4s\n"
- "smin v18.4s, v18.4s, v29.4s\n"
- "smin v17.4s, v17.4s, v29.4s\n"
- "smax v16.4s, v16.4s, v30.4s\n"
- "uzp1 v26.16b, v23.16b, v18.16b\n"
- "smin v16.4s, v16.4s, v29.4s\n"
- "uzp1 v24.16b, v17.16b, v16.16b\n"
- "uzp1 v16.16b, v26.16b, v24.16b\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1r { v18.4s }, [x20]\n"
+ "uxtl v17.8h, v8.8b\n"
+ "uxtl2 v16.8h, v8.16b\n"
+ "neg v18.4s, v18.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1r { v23.4s }, [x20]\n"
+ "saddw v22.4s, v18.4s, v17.4h\n"
+ "saddw2 v21.4s, v18.4s, v17.8h\n"
+ "saddw v20.4s, v18.4s, v16.4h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1r { v19.4s }, [x20]\n"
+ "saddw2 v18.4s, v18.4s, v16.8h\n"
+ "srshl v22.4s, v22.4s, v23.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1r { v17.4s }, [x20]\n"
+ "srshl v21.4s, v21.4s, v23.4s\n"
+ "srshl v20.4s, v20.4s, v23.4s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1r { v16.4s }, [x20]\n"
+ "srshl v18.4s, v18.4s, v23.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v19.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v19.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v19.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v19.4s\n"
+ "srshl v22.4s, v22.4s, v17.4s\n"
+ "srshl v21.4s, v21.4s, v17.4s\n"
+ "srshl v20.4s, v20.4s, v17.4s\n"
+ "srshl v18.4s, v18.4s, v17.4s\n"
+ "add v22.4s, v22.4s, v16.4s\n"
+ "add v21.4s, v21.4s, v16.4s\n"
+ "add v20.4s, v20.4s, v16.4s\n"
+ "add v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0x0\n"
+ "smax v22.4s, v22.4s, v16.4s\n"
+ "smax v21.4s, v21.4s, v16.4s\n"
+ "smax v20.4s, v20.4s, v16.4s\n"
+ "smax v18.4s, v18.4s, v16.4s\n"
+ "movi v16.4s, #0xff\n"
+ "smin v22.4s, v22.4s, v16.4s\n"
+ "smin v21.4s, v21.4s, v16.4s\n"
+ "smin v20.4s, v20.4s, v16.4s\n"
+ "smin v18.4s, v18.4s, v16.4s\n"
+ "uzp1 v17.16b, v22.16b, v21.16b\n"
+ "uzp1 v16.16b, v20.16b, v18.16b\n"
+ "uzp1 v16.16b, v17.16b, v16.16b\n"
"tbz %x[n_channels], #3, 38f\n"
"st1 { v16.d }[0], [%x[outptr]], #0x8\n"
"tbz %x[n_channels], #2, 36f\n"
@@ -664,12 +664,10 @@ void a64_u8q_nhwc_max_generic_depthfirst_impl(
"tbz %x[n_channels], #0, 42f\n"
"st1 { v16.b }[0], [%x[outptr]], #0x1\n"
"42:" // Oddments: Store: Bit 3: End
-
"43:" // End
-
: [n_channels] "+&r" (n_channels), [outptr] "+&r" (outptr)
: [inptrs] "r" (inptrs), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [quant_params] "r" (&qp)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp
index 6dffdcf01c..225f1e42c9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,18 +33,11 @@ template <typename T>
void cpp_nhwc_1x1_stride_any_depthfirst_impl(const uint64_t, const uint64_t, uint64_t n_channels, const T *const *const inptrs, T *outptr);
template <typename T>
-struct cpp_nhwc_1x1_stride_any_depthfirst
+struct cpp_nhwc_1x1_stride_any_depthfirst : IGenericDepthfirstStrategy<T, T, Nothing>
{
- typedef T operand_type;
- typedef T return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t, uint64_t n_channels, const operand_type *const *const inptrs, return_type *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
- kern_type kernel = cpp_nhwc_1x1_stride_any_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<T, T, Nothing>;
cpp_nhwc_1x1_stride_any_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return cpp_nhwc_1x1_stride_any_depthfirst_impl<T>; }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
index 2bb22131f7..1f8f863de2 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,10 @@
#include <cstdint>
#include <cstring>
+#ifdef ARM_COMPUTE_ENABLE_BF16
+#include "bfloat.hpp"
+using arm_gemm::bfloat16;
+#endif
namespace arm_conv {
namespace pooling {
@@ -41,9 +45,15 @@ void cpp_nhwc_1x1_stride_any_depthfirst_impl(
}
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const float *const *, float *);
-#if defined(__ARM_FP16_ARGS)
+
+#ifdef __ARM_FP16_ARGS
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const __fp16 *const *, __fp16 *);
-#endif // defined(__ARM_FP16_ARGS)
+#endif
+
+#ifdef ARM_COMPUTE_ENABLE_BF16
+template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const bfloat16 *const *, bfloat16 *);
+#endif
+
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const int8_t *const *, int8_t *);
template void cpp_nhwc_1x1_stride_any_depthfirst_impl(uint64_t, uint64_t, uint64_t, const uint8_t *const *, uint8_t *);
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..f6682e75e2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
+{
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
+
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..67b07205cd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const __fp16 *const *const inptrs;
+ __fp16 *const *const outptrs;
+ __fp16 rescale_vals[4];
+
+ KernelArgs(
+ unsigned int channels,
+ const __fp16 *const *input_ptrs,
+ __fp16 *const * output_ptrs,
+ bool exclude_padding, unsigned int pad_left, unsigned int pad_top, unsigned int pad_right, unsigned int pad_bottom
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ for (unsigned int i = 0; i < 2; i++)
+ {
+ const int start_i = 1*i - static_cast<int>(pad_top);
+ const int end_i = std::min<int>(start_i + 3, 4 - pad_top - pad_bottom);
+ const int valid_rows = end_i - std::max<int>(0, start_i);
+
+ for (unsigned int j = 0; j < 2; j++)
+ {
+ const int start_j = 1*j - static_cast<int>(pad_left);
+ const int end_j = std::min<int>(start_j + 3, 4 - pad_left - pad_right);
+ const int valid_cols = end_j - std::max<int>(0, start_j);
+
+ rescale_vals[i*2 + j] = static_cast<__fp16>(1.0f / static_cast<float>(
+ exclude_padding ? valid_rows * valid_cols : 9
+ ));
+ }
+ }
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x3, #0x0\n"
+ "mov x20, #0x4\n"
+ "ldr x4, [%x[args], %[offsetof_inptrs]]\n"
+ "whilelt p0.h, XZR, x20\n"
+ "add x20, %x[args], %[offsetof_rescale]\n"
+ "ld1rqh { z4.h }, p0/Z, [x20]\n"
+ "ldr x5, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.h, x3, x5\n"
+ "mov x6, #0x0\n"
+ "ldp x7, x8, [x21, #0x0]\n"
+ "ldp x17, x16, [x21, #0x10]\n"
+ "ldp x15, x14, [x4, #0x0]\n"
+ "ld1h { z3.h }, p0/Z, [x14, x3, LSL #1]\n"
+ "ldp x13, x12, [x4, #0x10]\n"
+ "ld1h { z2.h }, p0/Z, [x13, x3, LSL #1]\n"
+ "ldp x11, x10, [x4, #0x20]\n"
+ "ld1h { z1.h }, p0/Z, [x10, x3, LSL #1]\n"
+ "ldp x9, x28, [x4, #0x30]\n"
+ "ld1h { z0.h }, p0/Z, [x9, x3, LSL #1]\n"
+ "ldp x27, x26, [x4, #0x40]\n"
+ "ld1h { z31.h }, p0/Z, [x26, x3, LSL #1]\n"
+ "ldp x25, x24, [x4, #0x50]\n"
+ "ld1h { z30.h }, p0/Z, [x25, x3, LSL #1]\n"
+ "ldp x23, x22, [x4, #0x60]\n"
+ "ld1h { z29.h }, p0/Z, [x11, x3, LSL #1]\n"
+ "ldp x21, x20, [x4, #0x70]\n"
+ "ld1h { z28.h }, p0/Z, [x27, x3, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z22.h }, p0/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z21.h }, p0/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x21, x3, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x15, x3, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
+ "whilelt p1.h, x3, x5\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "ld1h { z1.h }, p1/Z, [x10, x3, LSL #1]\n"
+ "whilelt p0.h, x6, x5\n"
+ "fadd z19.h, z17.h, z16.h\n"
+ "fadd z18.h, z3.h, z2.h\n"
+ "ld1h { z0.h }, p1/Z, [x9, x3, LSL #1]\n"
+ "fadd z17.h, z29.h, z28.h\n"
+ "fadd z22.h, z27.h, z22.h\n"
+ "ld1h { z31.h }, p1/Z, [x26, x3, LSL #1]\n"
+ "fadd z16.h, z21.h, z20.h\n"
+ "fadd z21.h, z18.h, z19.h\n"
+ "ld1h { z30.h }, p1/Z, [x25, x3, LSL #1]\n"
+ "fadd z20.h, z16.h, z19.h\n"
+ "fadd z19.h, z26.h, z17.h\n"
+ "ld1h { z3.h }, p1/Z, [x14, x3, LSL #1]\n"
+ "fadd z18.h, z25.h, z22.h\n"
+ "fadd z17.h, z24.h, z17.h\n"
+ "ld1h { z2.h }, p1/Z, [x13, x3, LSL #1]\n"
+ "fadd z16.h, z23.h, z22.h\n"
+ "fadd z19.h, z21.h, z19.h\n"
+ "ld1h { z29.h }, p1/Z, [x11, x3, LSL #1]\n"
+ "fadd z18.h, z21.h, z18.h\n"
+ "fadd z17.h, z17.h, z20.h\n"
+ "ld1h { z28.h }, p1/Z, [x27, x3, LSL #1]\n"
+ "fadd z16.h, z16.h, z20.h\n"
+ "ld1h { z27.h }, p1/Z, [x28, x3, LSL #1]\n"
+ "fmul z19.h, z19.h, z4.h[0]\n"
+ "ld1h { z22.h }, p1/Z, [x24, x3, LSL #1]\n"
+ "fmul z18.h, z18.h, z4.h[1]\n"
+ "fmul z17.h, z17.h, z4.h[2]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x3, LSL #1]\n"
+ "fmul z16.h, z16.h, z4.h[3]\n"
+ "st1h { z19.h }, p0, [x7, x6, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x3, LSL #1]\n"
+ "st1h { z18.h }, p0, [x8, x6, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x15, x3, LSL #1]\n"
+ "st1h { z17.h }, p0, [x17, x6, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x12, x3, LSL #1]\n"
+ "st1h { z16.h }, p0, [x16, x6, LSL #1]\n"
+ "incw x6\n"
+ "ld1h { z24.h }, p1/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p1/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
+ "whilelt p1.h, x3, x5\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "whilelt p0.h, x6, x5\n"
+ "fadd z19.h, z17.h, z16.h\n"
+ "fadd z18.h, z3.h, z2.h\n"
+ "fadd z17.h, z29.h, z28.h\n"
+ "fadd z22.h, z27.h, z22.h\n"
+ "fadd z16.h, z21.h, z20.h\n"
+ "fadd z21.h, z18.h, z19.h\n"
+ "fadd z20.h, z16.h, z19.h\n"
+ "fadd z19.h, z26.h, z17.h\n"
+ "fadd z18.h, z25.h, z22.h\n"
+ "fadd z17.h, z24.h, z17.h\n"
+ "fadd z16.h, z23.h, z22.h\n"
+ "fadd z19.h, z21.h, z19.h\n"
+ "fadd z18.h, z21.h, z18.h\n"
+ "fadd z17.h, z17.h, z20.h\n"
+ "fadd z16.h, z16.h, z20.h\n"
+ "fmul z19.h, z19.h, z4.h[0]\n"
+ "st1h { z19.h }, p0, [x7, x6, LSL #1]\n"
+ "fmul z18.h, z18.h, z4.h[1]\n"
+ "fmul z17.h, z17.h, z4.h[2]\n"
+ "st1h { z18.h }, p0, [x8, x6, LSL #1]\n"
+ "fmul z16.h, z16.h, z4.h[3]\n"
+ "st1h { z17.h }, p0, [x17, x6, LSL #1]\n"
+ "st1h { z16.h }, p0, [x16, x6, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..cf09f421c4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
+
+struct sme_fp16_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
+{
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
+ sme_fp16_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_fp16_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..60f17b7bc2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_fp16_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const __fp16 *const *const inptrs,
+ __fp16 *outptr
+)
+{
+ const auto rescale_value = static_cast<__fp16>(1.0f / static_cast<float>(window_cells));
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cnth x28\n"
+ "cnth x27, ALL, MUL #2\n"
+ "cnth x26, ALL, MUL #3\n"
+ "ptrue p0.b\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "ld1rh { z6.h }, p0/Z, [%x[rescale_ptr]]\n"
+ "whilelt p2.h, x28, %x[n_channels]\n"
+ "whilelt p1.h, x27, %x[n_channels]\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov z4.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z3.b, #0x0\n"
+ "mov z2.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "fadd z23.h, z1.h, z0.h\n"
+ "fadd z19.h, z31.h, z30.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z22.h, z29.h, z22.h\n"
+ "fadd z18.h, z28.h, z18.h\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "fadd z21.h, z27.h, z21.h\n"
+ "fadd z17.h, z26.h, z17.h\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "fadd z20.h, z25.h, z20.h\n"
+ "fadd z16.h, z24.h, z16.h\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "fadd z19.h, z23.h, z19.h\n"
+ "fadd z18.h, z22.h, z18.h\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "fadd z17.h, z21.h, z17.h\n"
+ "fadd z16.h, z20.h, z16.h\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "fadd z5.h, z5.h, z19.h\n"
+ "fadd z4.h, z4.h, z18.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "fadd z3.h, z3.h, z17.h\n"
+ "fadd z2.h, z2.h, z16.h\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z18.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "fadd z23.h, z1.h, z0.h\n"
+ "fadd z19.h, z31.h, z30.h\n"
+ "fadd z22.h, z29.h, z22.h\n"
+ "fadd z18.h, z28.h, z18.h\n"
+ "fadd z21.h, z27.h, z21.h\n"
+ "fadd z17.h, z26.h, z17.h\n"
+ "fadd z20.h, z25.h, z20.h\n"
+ "fadd z16.h, z24.h, z16.h\n"
+ "fadd z19.h, z23.h, z19.h\n"
+ "fadd z18.h, z22.h, z18.h\n"
+ "fadd z17.h, z21.h, z17.h\n"
+ "fadd z16.h, z20.h, z16.h\n"
+ "fadd z5.h, z5.h, z19.h\n"
+ "fadd z4.h, z4.h, z18.h\n"
+ "fadd z3.h, z3.h, z17.h\n"
+ "fadd z2.h, z2.h, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "ld1h { z16.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "fadd z4.h, z4.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "fadd z3.h, z3.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "fadd z2.h, z2.h, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "fmul z5.h, z5.h, z6.h\n"
+ "fmul z4.h, z4.h, z6.h\n"
+ "st1h { z5.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "inch x9, ALL, MUL #4\n"
+ "fmul z3.h, z3.h, z6.h\n"
+ "fmul z2.h, z2.h, z6.h\n"
+ "st1h { z4.h }, p2, [%x[outptr], x28, LSL #1]\n"
+ "inch x28, ALL, MUL #4\n"
+ "st1h { z3.h }, p1, [%x[outptr], x27, LSL #1]\n"
+ "inch x27, ALL, MUL #4\n"
+ "st1h { z2.h }, p0, [%x[outptr], x26, LSL #1]\n"
+ "inch x26, ALL, MUL #4\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1h { z1.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z1.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "fadd z17.h, z1.h, z0.h\n"
+ "fadd z16.h, z31.h, z30.h\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z5.h, z5.h, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "fmul z5.h, z5.h, z6.h\n"
+ "st1h { z5.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "inch x9\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..cd6c7449a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
+{
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
+
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..7fc776ed4e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const __fp16 *const *const inptrs;
+ __fp16 *const *const outptrs;
+ KernelArgs(
+ unsigned int channels,
+ const __fp16 *const *input_ptrs,
+ __fp16 *const * output_ptrs,
+ bool, unsigned int, unsigned int, unsigned int, unsigned int
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ptrue p2.b\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.h, x15, x13\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ld1h { z30.h }, p0/Z, [x27, x15, LSL #1]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ld1h { z29.h }, p0/Z, [x25, x15, LSL #1]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ld1h { z28.h }, p0/Z, [x24, x15, LSL #1]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ld1h { z27.h }, p0/Z, [x21, x15, LSL #1]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1h { z26.h }, p0/Z, [x28, x15, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x26, x15, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x15, LSL #1]\n"
+ "ld1h { z19.h }, p0/Z, [x22, x15, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x15, LSL #1]\n"
+ "incw x15\n"
+ "whilelt p1.h, x15, x13\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "movprfx z22, z30\n fmax z22.h, p2/M, z22.h, z28.h\n"
+ "movprfx z21, z28\n fmax z21.h, p2/M, z21.h, z27.h\n"
+ "ld1h { z30.h }, p1/Z, [x27, x15, LSL #1]\n"
+ "whilelt p0.h, x14, x13\n"
+ "movprfx z18, z29\n fmax z18.h, p2/M, z18.h, z26.h\n"
+ "movprfx z17, z25\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "ld1h { z28.h }, p1/Z, [x24, x15, LSL #1]\n"
+ "movprfx z16, z29\n fmax z16.h, p2/M, z16.h, z19.h\n"
+ "movprfx z20, z24\n fmax z20.h, p2/M, z20.h, z23.h\n"
+ "ld1h { z27.h }, p1/Z, [x21, x15, LSL #1]\n"
+ "ld1h { z29.h }, p1/Z, [x25, x15, LSL #1]\n"
+ "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
+ "movprfx z18, z17\n fmax z18.h, p2/M, z18.h, z22.h\n"
+ "ld1h { z26.h }, p1/Z, [x28, x15, LSL #1]\n"
+ "movprfx z17, z16\n fmax z17.h, p2/M, z17.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "ld1h { z25.h }, p1/Z, [x26, x15, LSL #1]\n"
+ "st1h { z19.h }, p0, [x12, x14, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x23, x15, LSL #1]\n"
+ "st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
+ "ld1h { z19.h }, p1/Z, [x22, x15, LSL #1]\n"
+ "st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
+ "ld1h { z23.h }, p1/Z, [x20, x15, LSL #1]\n"
+ "incw x15\n"
+ "whilelt p1.h, x15, x13\n"
+ "st1h { z16.h }, p0, [x9, x14, LSL #1]\n"
+ "incw x14\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "movprfx z22, z30\n fmax z22.h, p2/M, z22.h, z28.h\n"
+ "movprfx z21, z28\n fmax z21.h, p2/M, z21.h, z27.h\n"
+ "whilelt p0.h, x14, x13\n"
+ "movprfx z20, z29\n fmax z20.h, p2/M, z20.h, z26.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z24.h\n"
+ "movprfx z17, z29\n fmax z17.h, p2/M, z17.h, z19.h\n"
+ "movprfx z19, z24\n fmax z19.h, p2/M, z19.h, z23.h\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "fmax z18.h, p2/M, z18.h, z22.h\n"
+ "st1h { z16.h }, p0, [x12, x14, LSL #1]\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z19.h\n"
+ "st1h { z18.h }, p0, [x11, x14, LSL #1]\n"
+ "st1h { z17.h }, p0, [x10, x14, LSL #1]\n"
+ "st1h { z16.h }, p0, [x9, x14, LSL #1]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..bfb3bf5b1a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp16_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
+
+struct sme_fp16_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>  // Strategy wrapper exposing the SME fp16 NHWC generic max-pool kernel to the depthfirst driver.
+{
+  using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
+  sme_fp16_nhwc_max_generic_depthfirst(const CPUInfo *) {}  // CPUInfo is accepted for interface uniformity but not used here.
+  typename Parent::KernelType get_kernel(void) const override { return sme_fp16_nhwc_max_generic_depthfirst_impl; }  // Returns the assembly kernel entry point.
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..afa2ccbd71
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_fp16_nhwc_max_generic_depthfirst_impl(  // Generic SME fp16 NHWC max pooling: per-channel max over n_valid_cells input cells.
+  const uint64_t,  // pooling window cell count — unused by max pooling (no rescale needed)
+  const uint64_t n_valid_cells,  // number of valid input pointers in inptrs
+  uint64_t n_channels,  // number of fp16 channels to reduce
+  const __fp16 *const *const inptrs,  // n_valid_cells pointers to per-cell channel data
+  __fp16 *outptr  // per-channel maxima are stored here
+)
+{
+  __asm__ __volatile__(  // Processes 4 SVE vectors of channels per pass, then a single-vector tail; cells consumed 4 at a time with a scalar remainder loop.
+    ".inst 0xd503477f  // SMSTART ZA\n"
+    "mov x9, #0x0\n"
+    "cnth x28\n"
+    "cnth x27, ALL, MUL #2\n"
+    "cnth x26, ALL, MUL #3\n"
+    "whilelt p4.h, x9, %x[n_channels]\n"
+    "whilelt p3.h, x28, %x[n_channels]\n"
+    "whilelt p2.h, x27, %x[n_channels]\n"
+    "whilelt p1.h, x26, %x[n_channels]\n"
+    "ptrue p0.b\n"
+    "b.none 7f\n"
+    "1:"  // 4-vectors of channels
+    "lsr x25, %x[n_valid_cells], #0x2\n"
+    "mov z4.h, #0xfc00\n"  // 0xfc00 is fp16 -infinity: identity value for the max reduction
+    "mov z3.h, #0xfc00\n"
+    "mov x24, %x[inptrs]\n"
+    "mov z2.h, #0xfc00\n"
+    "mov z1.h, #0xfc00\n"
+    "cbz x25, 4f\n"
+    "ldp x23, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "add x24, x24, #0x20\n"
+    "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+    "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "ld1h { z18.h }, p3/Z, [x23, x28, LSL #1]\n"
+    "ld1h { z29.h }, p3/Z, [x22, x28, LSL #1]\n"
+    "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+    "ld1h { z28.h }, p3/Z, [x20, x28, LSL #1]\n"
+    "ld1h { z17.h }, p2/Z, [x23, x27, LSL #1]\n"
+    "ld1h { z27.h }, p2/Z, [x22, x27, LSL #1]\n"
+    "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+    "ld1h { z26.h }, p2/Z, [x20, x27, LSL #1]\n"
+    "ld1h { z16.h }, p1/Z, [x23, x26, LSL #1]\n"
+    "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+    "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+    "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
+    "beq 3f\n"
+    "2:"  // 4-vectors of channels: 4 inputs loop
+    "movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
+    "fmax z23.h, p0/M, z23.h, z30.h\n"
+    "ldp x23, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "fmax z18.h, p0/M, z18.h, z29.h\n"
+    "fmax z22.h, p0/M, z22.h, z28.h\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "add x24, x24, #0x20\n"
+    "fmax z17.h, p0/M, z17.h, z27.h\n"
+    "fmax z21.h, p0/M, z21.h, z26.h\n"
+    "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+    "fmax z16.h, p0/M, z16.h, z25.h\n"
+    "fmax z20.h, p0/M, z20.h, z24.h\n"
+    "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+    "fmax z19.h, p0/M, z19.h, z23.h\n"
+    "fmax z18.h, p0/M, z18.h, z22.h\n"
+    "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+    "fmax z17.h, p0/M, z17.h, z21.h\n"
+    "fmax z16.h, p0/M, z16.h, z20.h\n"
+    "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "fmax z4.h, p0/M, z4.h, z19.h\n"
+    "fmax z3.h, p0/M, z3.h, z18.h\n"
+    "ld1h { z18.h }, p3/Z, [x23, x28, LSL #1]\n"
+    "fmax z2.h, p0/M, z2.h, z17.h\n"
+    "fmax z1.h, p0/M, z1.h, z16.h\n"
+    "ld1h { z29.h }, p3/Z, [x22, x28, LSL #1]\n"
+    "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+    "ld1h { z28.h }, p3/Z, [x20, x28, LSL #1]\n"
+    "ld1h { z17.h }, p2/Z, [x23, x27, LSL #1]\n"
+    "ld1h { z27.h }, p2/Z, [x22, x27, LSL #1]\n"
+    "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+    "ld1h { z26.h }, p2/Z, [x20, x27, LSL #1]\n"
+    "ld1h { z16.h }, p1/Z, [x23, x26, LSL #1]\n"
+    "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+    "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+    "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
+    "bgt 2b\n"
+    "3:"  // 4-vectors of channels: 4 inputs tail
+    "movprfx z19, z0\n fmax z19.h, p0/M, z19.h, z31.h\n"
+    "fmax z23.h, p0/M, z23.h, z30.h\n"
+    "fmax z18.h, p0/M, z18.h, z29.h\n"
+    "fmax z22.h, p0/M, z22.h, z28.h\n"
+    "fmax z17.h, p0/M, z17.h, z27.h\n"
+    "fmax z21.h, p0/M, z21.h, z26.h\n"
+    "fmax z16.h, p0/M, z16.h, z25.h\n"
+    "fmax z20.h, p0/M, z20.h, z24.h\n"
+    "fmax z19.h, p0/M, z19.h, z23.h\n"
+    "fmax z18.h, p0/M, z18.h, z22.h\n"
+    "fmax z17.h, p0/M, z17.h, z21.h\n"
+    "fmax z16.h, p0/M, z16.h, z20.h\n"
+    "fmax z4.h, p0/M, z4.h, z19.h\n"
+    "fmax z3.h, p0/M, z3.h, z18.h\n"
+    "fmax z2.h, p0/M, z2.h, z17.h\n"
+    "fmax z1.h, p0/M, z1.h, z16.h\n"
+    "4:"  // 4-vectors of channels: After loop
+    "ands x21, %x[n_valid_cells], #0x3\n"  // remaining cells not covered by the 4-at-a-time loop
+    "beq 6f\n"
+    "5:"  // 4-vectors of channels: Single input loop
+    "ldr x20, [x24], #0x8\n"
+    "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "subs x21, x21, #0x1\n"
+    "fmax z4.h, p0/M, z4.h, z16.h\n"
+    "ld1h { z16.h }, p3/Z, [x20, x28, LSL #1]\n"
+    "fmax z3.h, p0/M, z3.h, z16.h\n"
+    "ld1h { z16.h }, p2/Z, [x20, x27, LSL #1]\n"
+    "fmax z2.h, p0/M, z2.h, z16.h\n"
+    "ld1h { z16.h }, p1/Z, [x20, x26, LSL #1]\n"
+    "fmax z1.h, p0/M, z1.h, z16.h\n"
+    "bgt 5b\n"
+    "6:"  // 4-vectors of channels: Single input loop: End
+    "st1h { z4.h }, p4, [%x[outptr], x9, LSL #1]\n"
+    "inch x9, ALL, MUL #4\n"
+    "st1h { z3.h }, p3, [%x[outptr], x28, LSL #1]\n"
+    "inch x28, ALL, MUL #4\n"
+    "st1h { z2.h }, p2, [%x[outptr], x27, LSL #1]\n"
+    "inch x27, ALL, MUL #4\n"
+    "st1h { z1.h }, p1, [%x[outptr], x26, LSL #1]\n"
+    "inch x26, ALL, MUL #4\n"
+    "whilelt p1.h, x26, %x[n_channels]\n"
+    "b.any 1b\n"
+    "7:"  // Single vector of channels
+    "whilelt p4.h, x9, %x[n_channels]\n"
+    "b.none 14f\n"
+    "8:"  // Single vector of channels: Loop
+    "lsr x25, %x[n_valid_cells], #0x2\n"
+    "mov z4.h, #0xfc00\n"  // fp16 -infinity accumulator for the single-vector tail
+    "mov x24, %x[inptrs]\n"
+    "cbz x25, 11f\n"
+    "ldp x20, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "ld1h { z0.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "add x24, x24, #0x20\n"
+    "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+    "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "beq 10f\n"
+    "9:"  // Single vector of channels: Loop: 4 inputs loop
+    "movprfx z16, z0\n fmax z16.h, p0/M, z16.h, z31.h\n"
+    "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z30.h\n"
+    "ldp x23, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "fmax z16.h, p0/M, z16.h, z17.h\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "fmax z4.h, p0/M, z4.h, z16.h\n"
+    "add x24, x24, #0x20\n"
+    "ld1h { z0.h }, p4/Z, [x23, x9, LSL #1]\n"
+    "ld1h { z31.h }, p4/Z, [x22, x9, LSL #1]\n"
+    "ld1h { z23.h }, p4/Z, [x21, x9, LSL #1]\n"
+    "ld1h { z30.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "bgt 9b\n"
+    "10:"  // Single vector of channels: Loop: 4 inputs tail
+    "movprfx z16, z0\n fmax z16.h, p0/M, z16.h, z31.h\n"
+    "movprfx z17, z23\n fmax z17.h, p0/M, z17.h, z30.h\n"
+    "fmax z16.h, p0/M, z16.h, z17.h\n"
+    "fmax z4.h, p0/M, z4.h, z16.h\n"
+    "11:"  // Single vector of channels: Loop: After loop
+    "ands x21, %x[n_valid_cells], #0x3\n"
+    "beq 13f\n"
+    "12:"  // Single vector of channels: Loop: Single input loop
+    "ldr x20, [x24], #0x8\n"
+    "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+    "subs x21, x21, #0x1\n"
+    "fmax z4.h, p0/M, z4.h, z16.h\n"
+    "bgt 12b\n"
+    "13:"  // Single vector of channels: Loop: Single input loop: End
+    "st1h { z4.h }, p4, [%x[outptr], x9, LSL #1]\n"
+    "inch x9\n"
+    "whilelt p4.h, x9, %x[n_channels]\n"
+    "b.any 8b\n"
+    "14:"  // End
+    ".inst 0xd503467f  // SMSTOP\n"
+    :
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..23a0eee04e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>  // Strategy wrapper for the SME fp32 3x3/stride-1 average-pool kernel producing a 2x2 output tile.
+{
+  using Parent = DepthfirstStrategy<float, float>;
+
+  const static auto pooling_type = PoolingType::AVERAGE;
+  const static auto pool_rows = 3u, pool_cols = 3u;  // 3x3 pooling window
+  const static auto stride_rows = 1u, stride_cols = 1u;  // unit stride in both dimensions
+
+  sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)  // CPUInfo is accepted for interface uniformity but not used here.
+  : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}  // trailing 2, 2 = output tile rows and columns
+
+  Parent::KernelType get_kernel(void) const { return sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }  // Returns the assembly kernel entry point.
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..8c8532827a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(  // SME fp32 NHWC 3x3 stride-1 average pool over a 4x4 input patch, producing a 2x2 output tile.
+  const unsigned int n_channels,  // channels processed per output position
+  const float *const *const inptrs,  // 16 pointers into the 4x4 input patch (read via offsetof_inptrs)
+  float *const *const outptrs,  // 4 pointers for the 2x2 output tile (read via offsetof_outptrs)
+  const bool exclude_padding,  // true: average over valid cells only; false: always divide by 9
+  const unsigned int pad_left,
+  const unsigned int pad_top,
+  const unsigned int pad_right,
+  const unsigned int pad_bottom
+)
+{
+  struct KernelArgs  // Marshals arguments into a single struct the assembly addresses via offsetof.
+  {
+    const uint64_t n_channels;
+    const float *const *const inptrs;
+    float *const *const outptrs;
+    float rescale_vals[4];  // one averaging multiplier per 2x2 output position
+
+    KernelArgs(
+      unsigned int channels,
+      const float *const *input_ptrs,
+      float *const * output_ptrs,
+      bool exclude_padding, unsigned int pad_left, unsigned int pad_top, unsigned int pad_right, unsigned int pad_bottom
+    ) : n_channels(channels),
+        inptrs(input_ptrs),
+        outptrs(output_ptrs)
+    {
+      for (unsigned int i = 0; i < 2; i++)
+      {
+        const int start_i = 1*i - static_cast<int>(pad_top);  // first input row of the window for output row i (negative while in top padding)
+        const int end_i = std::min<int>(start_i + 3, 4 - pad_top - pad_bottom);  // one-past-last valid row within the 4-row patch
+        const int valid_rows = end_i - std::max<int>(0, start_i);
+
+        for (unsigned int j = 0; j < 2; j++)
+        {
+          const int start_j = 1*j - static_cast<int>(pad_left);  // first input column of the window for output column j
+          const int end_j = std::min<int>(start_j + 3, 4 - pad_left - pad_right);  // one-past-last valid column within the 4-column patch
+          const int valid_cols = end_j - std::max<int>(0, start_j);
+
+          rescale_vals[i*2 + j] = static_cast<float>(1.0f / static_cast<float>(
+            exclude_padding ? valid_rows * valid_cols : 9  // 1/(valid window area) or 1/9 for the full 3x3 window
+          ));
+        }
+      }
+    }
+  };
+
+  const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+                        pad_left, pad_top, pad_right, pad_bottom);
+
+  __asm__ __volatile__(  // Sums each 3x3 window via shared partial fadd trees, then multiplies by the per-position rescale value.
+    "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+    ".inst 0xd503477f  // SMSTART ZA\n"
+    "mov x3, #0x0\n"
+    "mov x20, #0x4\n"
+    "ldr x4, [%x[args], %[offsetof_inptrs]]\n"
+    "whilelt p0.s, XZR, x20\n"
+    "add x20, %x[args], %[offsetof_rescale]\n"
+    "ld1rqw { z4.s }, p0/Z, [x20]\n"  // z4 = the four per-output-position rescale multipliers
+    "ldr x5, [%x[args], %[offsetof_n_channels]]\n"
+    "whilelt p0.s, x3, x5\n"
+    "mov x6, #0x0\n"
+    "ldp x7, x8, [x21, #0x0]\n"
+    "ldp x17, x16, [x21, #0x10]\n"
+    "ldp x15, x14, [x4, #0x0]\n"
+    "ld1w { z3.s }, p0/Z, [x14, x3, LSL #2]\n"
+    "ldp x13, x12, [x4, #0x10]\n"
+    "ld1w { z2.s }, p0/Z, [x13, x3, LSL #2]\n"
+    "ldp x11, x10, [x4, #0x20]\n"
+    "ld1w { z1.s }, p0/Z, [x10, x3, LSL #2]\n"
+    "ldp x9, x28, [x4, #0x30]\n"
+    "ld1w { z0.s }, p0/Z, [x9, x3, LSL #2]\n"
+    "ldp x27, x26, [x4, #0x40]\n"
+    "ld1w { z31.s }, p0/Z, [x26, x3, LSL #2]\n"
+    "ldp x25, x24, [x4, #0x50]\n"
+    "ld1w { z30.s }, p0/Z, [x25, x3, LSL #2]\n"
+    "ldp x23, x22, [x4, #0x60]\n"
+    "ld1w { z29.s }, p0/Z, [x11, x3, LSL #2]\n"
+    "ldp x21, x20, [x4, #0x70]\n"
+    "ld1w { z28.s }, p0/Z, [x27, x3, LSL #2]\n"
+    "ld1w { z27.s }, p0/Z, [x28, x3, LSL #2]\n"
+    "ld1w { z22.s }, p0/Z, [x24, x3, LSL #2]\n"
+    "ld1w { z21.s }, p0/Z, [x22, x3, LSL #2]\n"
+    "ld1w { z20.s }, p0/Z, [x21, x3, LSL #2]\n"
+    "ld1w { z26.s }, p0/Z, [x15, x3, LSL #2]\n"
+    "ld1w { z25.s }, p0/Z, [x12, x3, LSL #2]\n"
+    "ld1w { z24.s }, p0/Z, [x23, x3, LSL #2]\n"
+    "ld1w { z23.s }, p0/Z, [x20, x3, LSL #2]\n"
+    "incw x3\n"
+    "whilelt p1.s, x3, x5\n"
+    "b.none 2f\n"
+    "1:"  // Vector: Loop
+    "fadd z17.s, z1.s, z0.s\n"
+    "fadd z16.s, z31.s, z30.s\n"
+    "ld1w { z1.s }, p1/Z, [x10, x3, LSL #2]\n"
+    "whilelt p0.s, x6, x5\n"
+    "fadd z19.s, z17.s, z16.s\n"
+    "fadd z18.s, z3.s, z2.s\n"
+    "ld1w { z0.s }, p1/Z, [x9, x3, LSL #2]\n"
+    "fadd z17.s, z29.s, z28.s\n"
+    "fadd z22.s, z27.s, z22.s\n"
+    "ld1w { z31.s }, p1/Z, [x26, x3, LSL #2]\n"
+    "fadd z16.s, z21.s, z20.s\n"
+    "fadd z21.s, z18.s, z19.s\n"
+    "ld1w { z30.s }, p1/Z, [x25, x3, LSL #2]\n"
+    "fadd z20.s, z16.s, z19.s\n"
+    "fadd z19.s, z26.s, z17.s\n"
+    "ld1w { z3.s }, p1/Z, [x14, x3, LSL #2]\n"
+    "fadd z18.s, z25.s, z22.s\n"
+    "fadd z17.s, z24.s, z17.s\n"
+    "ld1w { z2.s }, p1/Z, [x13, x3, LSL #2]\n"
+    "fadd z16.s, z23.s, z22.s\n"
+    "fadd z19.s, z21.s, z19.s\n"
+    "ld1w { z29.s }, p1/Z, [x11, x3, LSL #2]\n"
+    "fadd z18.s, z21.s, z18.s\n"
+    "fadd z17.s, z17.s, z20.s\n"
+    "ld1w { z28.s }, p1/Z, [x27, x3, LSL #2]\n"
+    "fadd z16.s, z16.s, z20.s\n"
+    "ld1w { z27.s }, p1/Z, [x28, x3, LSL #2]\n"
+    "fmul z19.s, z19.s, z4.s[0]\n"
+    "ld1w { z22.s }, p1/Z, [x24, x3, LSL #2]\n"
+    "fmul z18.s, z18.s, z4.s[1]\n"
+    "fmul z17.s, z17.s, z4.s[2]\n"
+    "ld1w { z21.s }, p1/Z, [x22, x3, LSL #2]\n"
+    "fmul z16.s, z16.s, z4.s[3]\n"
+    "st1w { z19.s }, p0, [x7, x6, LSL #2]\n"
+    "ld1w { z20.s }, p1/Z, [x21, x3, LSL #2]\n"
+    "st1w { z18.s }, p0, [x8, x6, LSL #2]\n"
+    "ld1w { z26.s }, p1/Z, [x15, x3, LSL #2]\n"
+    "st1w { z17.s }, p0, [x17, x6, LSL #2]\n"
+    "ld1w { z25.s }, p1/Z, [x12, x3, LSL #2]\n"
+    "st1w { z16.s }, p0, [x16, x6, LSL #2]\n"
+    "incw x6\n"
+    "ld1w { z24.s }, p1/Z, [x23, x3, LSL #2]\n"
+    "ld1w { z23.s }, p1/Z, [x20, x3, LSL #2]\n"
+    "incw x3\n"
+    "whilelt p1.s, x3, x5\n"
+    "b.any 1b\n"
+    "2:"  // Vector: Tail
+    "fadd z17.s, z1.s, z0.s\n"
+    "fadd z16.s, z31.s, z30.s\n"
+    "whilelt p0.s, x6, x5\n"
+    "fadd z19.s, z17.s, z16.s\n"
+    "fadd z18.s, z3.s, z2.s\n"
+    "fadd z17.s, z29.s, z28.s\n"
+    "fadd z22.s, z27.s, z22.s\n"
+    "fadd z16.s, z21.s, z20.s\n"
+    "fadd z21.s, z18.s, z19.s\n"
+    "fadd z20.s, z16.s, z19.s\n"
+    "fadd z19.s, z26.s, z17.s\n"
+    "fadd z18.s, z25.s, z22.s\n"
+    "fadd z17.s, z24.s, z17.s\n"
+    "fadd z16.s, z23.s, z22.s\n"
+    "fadd z19.s, z21.s, z19.s\n"
+    "fadd z18.s, z21.s, z18.s\n"
+    "fadd z17.s, z17.s, z20.s\n"
+    "fadd z16.s, z16.s, z20.s\n"
+    "fmul z19.s, z19.s, z4.s[0]\n"
+    "st1w { z19.s }, p0, [x7, x6, LSL #2]\n"
+    "fmul z18.s, z18.s, z4.s[1]\n"
+    "fmul z17.s, z17.s, z4.s[2]\n"
+    "st1w { z18.s }, p0, [x8, x6, LSL #2]\n"
+    "fmul z16.s, z16.s, z4.s[3]\n"
+    "st1w { z17.s }, p0, [x17, x6, LSL #2]\n"
+    "st1w { z16.s }, p0, [x16, x6, LSL #2]\n"
+    ".inst 0xd503467f  // SMSTOP\n"
+    :
+    : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..29bcfc5a3b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
+
+struct sme_fp32_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<float, float>  // Strategy wrapper exposing the SME fp32 NHWC generic average-pool kernel to the depthfirst driver.
+{
+  using Parent = IGenericDepthfirstStrategy<float, float>;
+  sme_fp32_nhwc_avg_generic_depthfirst(const CPUInfo *) {}  // CPUInfo is accepted for interface uniformity but not used here.
+  typename Parent::KernelType get_kernel(void) const override { return sme_fp32_nhwc_avg_generic_depthfirst_impl; }  // Returns the assembly kernel entry point.
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..86e7f84542
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_fp32_nhwc_avg_generic_depthfirst_impl(  // Generic SME fp32 NHWC average pooling: per-channel sum over n_valid_cells, scaled by 1/window_cells.
+  const uint64_t window_cells,  // full pooling window size; determines the rescale factor
+  const uint64_t n_valid_cells,  // number of valid input pointers in inptrs
+  uint64_t n_channels,  // number of fp32 channels to reduce
+  const float *const *const inptrs,  // n_valid_cells pointers to per-cell channel data
+  float *outptr  // per-channel averages are stored here
+)
+{
+  const auto rescale_value = static_cast<float>(1.0f / static_cast<float>(window_cells));  // precomputed so the kernel multiplies rather than divides
+
+  __asm__ __volatile__(  // Processes 4 SVE vectors of channels per pass, then a single-vector tail; cells consumed 4 at a time with a scalar remainder loop.
+    ".inst 0xd503477f  // SMSTART ZA\n"
+    "mov x9, #0x0\n"
+    "cntw x28\n"
+    "cntw x27, ALL, MUL #2\n"
+    "cntw x26, ALL, MUL #3\n"
+    "ptrue p0.b\n"
+    "whilelt p3.s, x9, %x[n_channels]\n"
+    "ld1rw { z6.s }, p0/Z, [%x[rescale_ptr]]\n"  // broadcast 1/window_cells into z6
+    "whilelt p2.s, x28, %x[n_channels]\n"
+    "whilelt p1.s, x27, %x[n_channels]\n"
+    "whilelt p0.s, x26, %x[n_channels]\n"
+    "b.none 7f\n"
+    "1:"  // 4-vectors of channels
+    "lsr x25, %x[n_valid_cells], #0x2\n"
+    "mov z5.b, #0x0\n"  // zero the accumulators: identity for the fadd reduction
+    "mov z4.b, #0x0\n"
+    "mov x24, %x[inptrs]\n"
+    "mov z3.b, #0x0\n"
+    "mov z2.b, #0x0\n"
+    "cbz x25, 4f\n"
+    "ldp x23, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "add x24, x24, #0x20\n"
+    "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+    "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+    "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "ld1w { z29.s }, p2/Z, [x23, x28, LSL #2]\n"
+    "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+    "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+    "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+    "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+    "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+    "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+    "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+    "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+    "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+    "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+    "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+    "beq 3f\n"
+    "2:"  // 4-vectors of channels: 4 inputs loop
+    "fadd z23.s, z1.s, z0.s\n"
+    "fadd z19.s, z31.s, z30.s\n"
+    "ldp x23, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "fadd z22.s, z29.s, z22.s\n"
+    "fadd z18.s, z28.s, z18.s\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "add x24, x24, #0x20\n"
+    "fadd z21.s, z27.s, z21.s\n"
+    "fadd z17.s, z26.s, z17.s\n"
+    "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+    "fadd z20.s, z25.s, z20.s\n"
+    "fadd z16.s, z24.s, z16.s\n"
+    "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+    "fadd z19.s, z23.s, z19.s\n"
+    "fadd z18.s, z22.s, z18.s\n"
+    "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+    "fadd z17.s, z21.s, z17.s\n"
+    "fadd z16.s, z20.s, z16.s\n"
+    "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "fadd z5.s, z5.s, z19.s\n"
+    "fadd z4.s, z4.s, z18.s\n"
+    "ld1w { z29.s }, p2/Z, [x23, x28, LSL #2]\n"
+    "fadd z3.s, z3.s, z17.s\n"
+    "fadd z2.s, z2.s, z16.s\n"
+    "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+    "ld1w { z28.s }, p2/Z, [x21, x28, LSL #2]\n"
+    "ld1w { z18.s }, p2/Z, [x20, x28, LSL #2]\n"
+    "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+    "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+    "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+    "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+    "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+    "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+    "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+    "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+    "bgt 2b\n"
+    "3:"  // 4-vectors of channels: 4 inputs tail
+    "fadd z23.s, z1.s, z0.s\n"
+    "fadd z19.s, z31.s, z30.s\n"
+    "fadd z22.s, z29.s, z22.s\n"
+    "fadd z18.s, z28.s, z18.s\n"
+    "fadd z21.s, z27.s, z21.s\n"
+    "fadd z17.s, z26.s, z17.s\n"
+    "fadd z20.s, z25.s, z20.s\n"
+    "fadd z16.s, z24.s, z16.s\n"
+    "fadd z19.s, z23.s, z19.s\n"
+    "fadd z18.s, z22.s, z18.s\n"
+    "fadd z17.s, z21.s, z17.s\n"
+    "fadd z16.s, z20.s, z16.s\n"
+    "fadd z5.s, z5.s, z19.s\n"
+    "fadd z4.s, z4.s, z18.s\n"
+    "fadd z3.s, z3.s, z17.s\n"
+    "fadd z2.s, z2.s, z16.s\n"
+    "4:"  // 4-vectors of channels: After loop
+    "ands x21, %x[n_valid_cells], #0x3\n"  // remaining cells not covered by the 4-at-a-time loop
+    "beq 6f\n"
+    "5:"  // 4-vectors of channels: Single input loop
+    "ldr x20, [x24], #0x8\n"
+    "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "subs x21, x21, #0x1\n"
+    "fadd z5.s, z5.s, z16.s\n"
+    "ld1w { z16.s }, p2/Z, [x20, x28, LSL #2]\n"
+    "fadd z4.s, z4.s, z16.s\n"
+    "ld1w { z16.s }, p1/Z, [x20, x27, LSL #2]\n"
+    "fadd z3.s, z3.s, z16.s\n"
+    "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+    "fadd z2.s, z2.s, z16.s\n"
+    "bgt 5b\n"
+    "6:"  // 4-vectors of channels: Single input loop: End
+    "fmul z5.s, z5.s, z6.s\n"  // scale the sums by 1/window_cells before storing
+    "fmul z4.s, z4.s, z6.s\n"
+    "st1w { z5.s }, p3, [%x[outptr], x9, LSL #2]\n"
+    "incw x9, ALL, MUL #4\n"
+    "fmul z3.s, z3.s, z6.s\n"
+    "fmul z2.s, z2.s, z6.s\n"
+    "st1w { z4.s }, p2, [%x[outptr], x28, LSL #2]\n"
+    "incw x28, ALL, MUL #4\n"
+    "st1w { z3.s }, p1, [%x[outptr], x27, LSL #2]\n"
+    "incw x27, ALL, MUL #4\n"
+    "st1w { z2.s }, p0, [%x[outptr], x26, LSL #2]\n"
+    "incw x26, ALL, MUL #4\n"
+    "whilelt p0.s, x26, %x[n_channels]\n"
+    "b.any 1b\n"
+    "7:"  // Single vector of channels
+    "whilelt p3.s, x9, %x[n_channels]\n"
+    "b.none 14f\n"
+    "8:"  // Single vector of channels: Loop
+    "lsr x25, %x[n_valid_cells], #0x2\n"
+    "mov z5.b, #0x0\n"  // zero accumulator for the single-vector tail
+    "mov x24, %x[inptrs]\n"
+    "cbz x25, 11f\n"
+    "ldp x20, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "ld1w { z1.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "add x24, x24, #0x20\n"
+    "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+    "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+    "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "beq 10f\n"
+    "9:"  // Single vector of channels: Loop: 4 inputs loop
+    "fadd z17.s, z1.s, z0.s\n"
+    "fadd z16.s, z31.s, z30.s\n"
+    "ldp x23, x22, [x24, #0x0]\n"
+    "subs x25, x25, #0x1\n"
+    "fadd z16.s, z17.s, z16.s\n"
+    "ldp x21, x20, [x24, #0x10]\n"
+    "fadd z5.s, z5.s, z16.s\n"
+    "add x24, x24, #0x20\n"
+    "ld1w { z1.s }, p3/Z, [x23, x9, LSL #2]\n"
+    "ld1w { z0.s }, p3/Z, [x22, x9, LSL #2]\n"
+    "ld1w { z31.s }, p3/Z, [x21, x9, LSL #2]\n"
+    "ld1w { z30.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "bgt 9b\n"
+    "10:"  // Single vector of channels: Loop: 4 inputs tail
+    "fadd z17.s, z1.s, z0.s\n"
+    "fadd z16.s, z31.s, z30.s\n"
+    "fadd z16.s, z17.s, z16.s\n"
+    "fadd z5.s, z5.s, z16.s\n"
+    "11:"  // Single vector of channels: Loop: After loop
+    "ands x21, %x[n_valid_cells], #0x3\n"
+    "beq 13f\n"
+    "12:"  // Single vector of channels: Loop: Single input loop
+    "ldr x20, [x24], #0x8\n"
+    "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+    "subs x21, x21, #0x1\n"
+    "fadd z5.s, z5.s, z16.s\n"
+    "bgt 12b\n"
+    "13:"  // Single vector of channels: Loop: Single input loop: End
+    "fmul z5.s, z5.s, z6.s\n"
+    "st1w { z5.s }, p3, [%x[outptr], x9, LSL #2]\n"
+    "incw x9\n"
+    "whilelt p3.s, x9, %x[n_channels]\n"
+    "b.any 8b\n"
+    "14:"  // End
+    ".inst 0xd503467f  // SMSTOP\n"
+    :
+    : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
+    : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+  );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..338348231f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
+{
+ using Parent = DepthfirstStrategy<float, float>;
+
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..3c7213a498
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const float *const *const inptrs,
+ float *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const float *const *const inptrs;
+ float *const *const outptrs;
+ KernelArgs(
+ unsigned int channels,
+ const float *const *input_ptrs,
+ float *const * output_ptrs,
+ bool, unsigned int, unsigned int, unsigned int, unsigned int
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ptrue p2.b\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.s, x15, x13\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ld1w { z30.s }, p0/Z, [x27, x15, LSL #2]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ld1w { z29.s }, p0/Z, [x25, x15, LSL #2]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ld1w { z28.s }, p0/Z, [x24, x15, LSL #2]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ld1w { z27.s }, p0/Z, [x21, x15, LSL #2]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1w { z26.s }, p0/Z, [x28, x15, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x26, x15, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x15, LSL #2]\n"
+ "ld1w { z19.s }, p0/Z, [x22, x15, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x15, LSL #2]\n"
+ "incw x15\n"
+ "whilelt p1.s, x15, x13\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "movprfx z22, z30\n fmax z22.s, p2/M, z22.s, z28.s\n"
+ "movprfx z21, z28\n fmax z21.s, p2/M, z21.s, z27.s\n"
+ "ld1w { z30.s }, p1/Z, [x27, x15, LSL #2]\n"
+ "whilelt p0.s, x14, x13\n"
+ "movprfx z18, z29\n fmax z18.s, p2/M, z18.s, z26.s\n"
+ "movprfx z17, z25\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "ld1w { z28.s }, p1/Z, [x24, x15, LSL #2]\n"
+ "movprfx z16, z29\n fmax z16.s, p2/M, z16.s, z19.s\n"
+ "movprfx z20, z24\n fmax z20.s, p2/M, z20.s, z23.s\n"
+ "ld1w { z27.s }, p1/Z, [x21, x15, LSL #2]\n"
+ "ld1w { z29.s }, p1/Z, [x25, x15, LSL #2]\n"
+ "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
+ "movprfx z18, z17\n fmax z18.s, p2/M, z18.s, z22.s\n"
+ "ld1w { z26.s }, p1/Z, [x28, x15, LSL #2]\n"
+ "movprfx z17, z16\n fmax z17.s, p2/M, z17.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "ld1w { z25.s }, p1/Z, [x26, x15, LSL #2]\n"
+ "st1w { z19.s }, p0, [x12, x14, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x23, x15, LSL #2]\n"
+ "st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
+ "ld1w { z19.s }, p1/Z, [x22, x15, LSL #2]\n"
+ "st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [x20, x15, LSL #2]\n"
+ "incw x15\n"
+ "whilelt p1.s, x15, x13\n"
+ "st1w { z16.s }, p0, [x9, x14, LSL #2]\n"
+ "incw x14\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "movprfx z22, z30\n fmax z22.s, p2/M, z22.s, z28.s\n"
+ "movprfx z21, z28\n fmax z21.s, p2/M, z21.s, z27.s\n"
+ "whilelt p0.s, x14, x13\n"
+ "movprfx z20, z29\n fmax z20.s, p2/M, z20.s, z26.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z24.s\n"
+ "movprfx z17, z29\n fmax z17.s, p2/M, z17.s, z19.s\n"
+ "movprfx z19, z24\n fmax z19.s, p2/M, z19.s, z23.s\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "fmax z18.s, p2/M, z18.s, z22.s\n"
+ "st1w { z16.s }, p0, [x12, x14, LSL #2]\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z19.s\n"
+ "st1w { z18.s }, p0, [x11, x14, LSL #2]\n"
+ "st1w { z17.s }, p0, [x10, x14, LSL #2]\n"
+ "st1w { z16.s }, p0, [x9, x14, LSL #2]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..9bc1f11601
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_fp32_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
+
+struct sme_fp32_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
+{
+ using Parent = IGenericDepthfirstStrategy<float, float>;
+ sme_fp32_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_fp32_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..0dabc2f292
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_fp32_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const float *const *const inptrs,
+ float *outptr
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntw x28\n"
+ "cntw x27, ALL, MUL #2\n"
+ "cntw x26, ALL, MUL #3\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
+ "whilelt p3.s, x28, %x[n_channels]\n"
+ "whilelt p2.s, x27, %x[n_channels]\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.s, #0xff800000\n"
+ "mov z3.s, #0xff800000\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.s, #0xff800000\n"
+ "mov z1.s, #0xff800000\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z18.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z29.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
+ "fmax z23.s, p0/M, z23.s, z30.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z18.s, p0/M, z18.s, z29.s\n"
+ "fmax z22.s, p0/M, z22.s, z28.s\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "fmax z17.s, p0/M, z17.s, z27.s\n"
+ "fmax z21.s, p0/M, z21.s, z26.s\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "fmax z16.s, p0/M, z16.s, z25.s\n"
+ "fmax z20.s, p0/M, z20.s, z24.s\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "fmax z18.s, p0/M, z18.s, z22.s\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "fmax z17.s, p0/M, z17.s, z21.s\n"
+ "fmax z16.s, p0/M, z16.s, z20.s\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "fmax z4.s, p0/M, z4.s, z19.s\n"
+ "fmax z3.s, p0/M, z3.s, z18.s\n"
+ "ld1w { z18.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "fmax z2.s, p0/M, z2.s, z17.s\n"
+ "fmax z1.s, p0/M, z1.s, z16.s\n"
+ "ld1w { z29.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z17.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z26.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n fmax z19.s, p0/M, z19.s, z31.s\n"
+ "fmax z23.s, p0/M, z23.s, z30.s\n"
+ "fmax z18.s, p0/M, z18.s, z29.s\n"
+ "fmax z22.s, p0/M, z22.s, z28.s\n"
+ "fmax z17.s, p0/M, z17.s, z27.s\n"
+ "fmax z21.s, p0/M, z21.s, z26.s\n"
+ "fmax z16.s, p0/M, z16.s, z25.s\n"
+ "fmax z20.s, p0/M, z20.s, z24.s\n"
+ "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "fmax z18.s, p0/M, z18.s, z22.s\n"
+ "fmax z17.s, p0/M, z17.s, z21.s\n"
+ "fmax z16.s, p0/M, z16.s, z20.s\n"
+ "fmax z4.s, p0/M, z4.s, z19.s\n"
+ "fmax z3.s, p0/M, z3.s, z18.s\n"
+ "fmax z2.s, p0/M, z2.s, z17.s\n"
+ "fmax z1.s, p0/M, z1.s, z16.s\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "ld1w { z16.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "fmax z3.s, p0/M, z3.s, z16.s\n"
+ "ld1w { z16.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z2.s, p0/M, z2.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "fmax z1.s, p0/M, z1.s, z16.s\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "st1w { z4.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "incw x9, ALL, MUL #4\n"
+ "st1w { z3.s }, p3, [%x[outptr], x28, LSL #2]\n"
+ "incw x28, ALL, MUL #4\n"
+ "st1w { z2.s }, p2, [%x[outptr], x27, LSL #2]\n"
+ "incw x27, ALL, MUL #4\n"
+ "st1w { z1.s }, p1, [%x[outptr], x26, LSL #2]\n"
+ "incw x26, ALL, MUL #4\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.s, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.s, #0xff800000\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1w { z0.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n fmax z16.s, p0/M, z16.s, z31.s\n"
+ "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z30.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z0.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z31.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z23.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z30.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n fmax z16.s, p0/M, z16.s, z31.s\n"
+ "movprfx z17, z23\n fmax z17.s, p0/M, z17.s, z30.s\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z4.s, p0/M, z4.s, z16.s\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "st1w { z4.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "incw x9\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..318510e697
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
+
+struct sme_s8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
+{
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
+ sme_s8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_s8_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c24e977dc6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+namespace {
+ struct RescaleParams
+ {
+ int32_t multiplier, shift;
+ };
+
+ constexpr RescaleParams rescale_params[8] = {
+ {0x40000000, -0}, // 1/2
+ {0x55555556, -1}, // 1/3
+ {0x40000000, -1}, // 1/4
+ {0x66666666, -2}, // 1/5
+ {0x55555556, -2}, // 1/6
+ {0x49249249, -2}, // 1/7
+ {0x40000000, -2}, // 1/8
+ {0x71c71c72, -3}, // 1/9
+ };
+}
+
+void sme_s8_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *outptr
+)
+{
+ if (n_valid_cells == 1 && window_cells == 1)
+ {
+ // In this case, simply copy from the input to the output
+ std::memcpy(outptr, *inptrs, n_channels);
+ return;
+ }
+
+ // Compute (or look up) the rescale values
+ int32_t shift_value = 0, rescale_value = 0;
+ if (2 <= window_cells && window_cells <= 9)
+ {
+ auto &params = rescale_params[window_cells - 2];
+ rescale_value = params.multiplier;
+ shift_value = params.shift;
+ }
+ else
+ {
+ auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
+
+ shift_value = 0;
+ while (f_rescale_value < 0.5f)
+ {
+ shift_value--;
+ f_rescale_value *= 2.0f;
+ }
+
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
+ {
+ shift_value++;
+ long_rescale_value >>= 1;
+ }
+ rescale_value = static_cast<int32_t>(long_rescale_value);
+ }
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p2.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ "mov z6.s, #0x0\n"
+ "mov z5.s, #0x0\n"
+ "mov z4.s, #0x0\n"
+ "mov z3.s, #0x0\n"
+ "mov z2.s, #0x0\n"
+ "mov z1.s, #0x0\n"
+ "mov z0.s, #0x0\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 2 inputs loop
+ ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 2 inputs tail
+ ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
+ ".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
+ ".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
+ ".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a213 // sshllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508a612 // sshllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ ".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
+ ".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
+ ".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
+ ".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
+ ".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
+ ".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
+ ".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
+ "mov z19.s, #0x7f\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "not z16.s, p0/M, z19.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "trn1 z23.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "smin z11.s, p0/M, z11.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z19.s\n"
+ "trn1 z22.h, z11.h, z10.h\n"
+ "smin z9.s, p0/M, z9.s, z19.s\n"
+ "smin z8.s, p0/M, z8.s, z19.s\n"
+ "trn1 z18.h, z9.h, z8.h\n"
+ "smin z7.s, p0/M, z7.s, z19.s\n"
+ "smin z6.s, p0/M, z6.s, z19.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "smin z5.s, p0/M, z5.s, z19.s\n"
+ "smin z4.s, p0/M, z4.s, z19.s\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "smin z3.s, p0/M, z3.s, z19.s\n"
+ "smin z2.s, p0/M, z2.s, z19.s\n"
+ "trn1 z20.h, z3.h, z2.h\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z19.h, z1.h, z0.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x25]\n"
+ "incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 2 inputs loop
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 2 inputs tail
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "ld1rw { z16.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b075ef // sqdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqdmulh z12.s, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..c9a80e6a5b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<int8_t, int8_t>
+{
+ using Parent = DepthfirstStrategy<int8_t, int8_t>;
+
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..96617566a8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const int8_t *const *const inptrs;
+ int8_t *const *const outptrs;
+ KernelArgs(
+ unsigned int channels,
+ const int8_t *const *input_ptrs,
+ int8_t *const * output_ptrs,
+ bool, unsigned int, unsigned int, unsigned int, unsigned int
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ptrue p2.b\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.b, x15, x13\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ld1b { z30.b }, p0/Z, [x27, x15]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ld1b { z29.b }, p0/Z, [x25, x15]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ld1b { z28.b }, p0/Z, [x24, x15]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x15]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1b { z26.b }, p0/Z, [x28, x15]\n"
+ "ld1b { z25.b }, p0/Z, [x26, x15]\n"
+ "ld1b { z24.b }, p0/Z, [x23, x15]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x15]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x15]\n"
+ "incw x15\n"
+ "whilelt p1.b, x15, x13\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "movprfx z22, z30\n smax z22.b, p2/M, z22.b, z28.b\n"
+ "movprfx z21, z28\n smax z21.b, p2/M, z21.b, z27.b\n"
+ "ld1b { z30.b }, p1/Z, [x27, x15]\n"
+ "whilelt p0.b, x14, x13\n"
+ "movprfx z18, z29\n smax z18.b, p2/M, z18.b, z26.b\n"
+ "movprfx z17, z25\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "ld1b { z28.b }, p1/Z, [x24, x15]\n"
+ "movprfx z16, z29\n smax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z20, z24\n smax z20.b, p2/M, z20.b, z23.b\n"
+ "ld1b { z27.b }, p1/Z, [x21, x15]\n"
+ "ld1b { z29.b }, p1/Z, [x25, x15]\n"
+ "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n smax z18.b, p2/M, z18.b, z22.b\n"
+ "ld1b { z26.b }, p1/Z, [x28, x15]\n"
+ "movprfx z17, z16\n smax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z25.b }, p1/Z, [x26, x15]\n"
+ "st1b { z19.b }, p0, [x12, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x23, x15]\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "ld1b { z19.b }, p1/Z, [x22, x15]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
+ "ld1b { z23.b }, p1/Z, [x20, x15]\n"
+ "incw x15\n"
+ "whilelt p1.b, x15, x13\n"
+ "st1b { z16.b }, p0, [x9, x14]\n"
+ "incw x14\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "movprfx z22, z30\n smax z22.b, p2/M, z22.b, z28.b\n"
+ "movprfx z21, z28\n smax z21.b, p2/M, z21.b, z27.b\n"
+ "whilelt p0.b, x14, x13\n"
+ "movprfx z20, z29\n smax z20.b, p2/M, z20.b, z26.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z24.b\n"
+ "movprfx z17, z29\n smax z17.b, p2/M, z17.b, z19.b\n"
+ "movprfx z19, z24\n smax z19.b, p2/M, z19.b, z23.b\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "smax z18.b, p2/M, z18.b, z22.b\n"
+ "st1b { z16.b }, p0, [x12, x14]\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z19.b\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
+ "st1b { z16.b }, p0, [x9, x14]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..3e0d76c277
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
+
+struct sme_s8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
+{
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
+ sme_s8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_s8_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..d2b45cd353
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_s8_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *outptr
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p2.b, x27, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x80\n"
+ "mov z3.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.b, #0x80\n"
+ "mov z1.b, #0x80\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
+ "smax z23.b, p0/M, z23.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z18.b, p0/M, z18.b, z29.b\n"
+ "smax z22.b, p0/M, z22.b, z28.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "smax z17.b, p0/M, z17.b, z27.b\n"
+ "smax z21.b, p0/M, z21.b, z26.b\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
+ "smax z23.b, p0/M, z23.b, z30.b\n"
+ "smax z18.b, p0/M, z18.b, z29.b\n"
+ "smax z22.b, p0/M, z22.b, z28.b\n"
+ "smax z17.b, p0/M, z17.b, z27.b\n"
+ "smax z21.b, p0/M, z21.b, z26.b\n"
+ "smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "smax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "st1b { z3.b }, p3, [%x[outptr], x28]\n"
+ "incb x28, ALL, MUL #4\n"
+ "st1b { z2.b }, p2, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "st1b { z1.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..c6263f5dbc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
+
+struct sme_s8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
+{
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
+ sme_s8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_s8q_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..91f2f7ab31
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pooling.hpp"
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+namespace {
+ struct RescaleParams
+ {
+ int32_t multiplier, shift;
+ };
+
+ constexpr RescaleParams rescale_params[8] = {
+ {0x40000000, -0}, // 1/2
+ {0x55555556, -1}, // 1/3
+ {0x40000000, -1}, // 1/4
+ {0x66666666, -2}, // 1/5
+ {0x55555556, -2}, // 1/6
+ {0x49249249, -2}, // 1/7
+ {0x40000000, -2}, // 1/8
+ {0x71c71c72, -3}, // 1/9
+ };
+}
+
+void sme_s8q_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *outptr,
+ const Requantize32 &qp
+)
+{
+ if (n_valid_cells == 1 && window_cells == 1)
+ {
+ // In this case, simply copy from the input to the output
+ std::memcpy(outptr, *inptrs, n_channels);
+ return;
+ }
+
+ // Compute (or look up) the rescale values
+ int32_t shift_value = 0, rescale_value = 0;
+ if (2 <= window_cells && window_cells <= 9)
+ {
+ auto &params = rescale_params[window_cells - 2];
+ rescale_value = params.multiplier;
+ shift_value = params.shift;
+ }
+ else
+ {
+ auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
+
+ shift_value = 0;
+ while (f_rescale_value < 0.5f)
+ {
+ shift_value--;
+ f_rescale_value *= 2.0f;
+ }
+
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
+ {
+ shift_value++;
+ long_rescale_value >>= 1;
+ }
+ rescale_value = static_cast<int32_t>(long_rescale_value);
+ }
+
+ // Combine together the rescale value for the requantization and the scaling
+ // factor for the average pool.
+ const int32_t shift = qp.per_layer_left_shift - qp.per_layer_right_shift + shift_value;
+ const int32_t left_shift = shift > 0 ? shift : 0;
+ const int32_t right_shift = shift <= 0 ? shift : 0;
+
+ int32_t combined_rescale_value = 0;
+ __asm__ __volatile__ (
+ "mov v16.s[0], %w[per_layer_mul]\n"
+ "mov v17.s[0], %w[rescale_value]\n"
+ "sqrdmulh s18, s16, s17\n"
+ "mov %w[combined_rescale_value], v18.s[0]\n"
+ : [combined_rescale_value] "=r" (combined_rescale_value)
+ : [per_layer_mul] "r" (qp.per_layer_mul), [rescale_value] "r" (rescale_value)
+ : "v16", "v17", "v18"
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p2.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ "mov z6.s, #0x0\n"
+ "mov z5.s, #0x0\n"
+ "mov z4.s, #0x0\n"
+ "mov z3.s, #0x0\n"
+ "mov z2.s, #0x0\n"
+ "mov z1.s, #0x0\n"
+ "mov z0.s, #0x0\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 2 inputs loop
+ ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 2 inputs tail
+ ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
+ ".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
+ ".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
+ ".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a213 // sshllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508a612 // sshllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
+ ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
+ ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
+ ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482824b // srshl z11.s, p0/M, z11.s, z18.s\n"
+ ".inst 0x4482824a // srshl z10.s, p0/M, z10.s, z18.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x44828249 // srshl z9.s, p0/M, z9.s, z18.s\n"
+ ".inst 0x44828248 // srshl z8.s, p0/M, z8.s, z18.s\n"
+ ".inst 0x44828247 // srshl z7.s, p0/M, z7.s, z18.s\n"
+ ".inst 0x44828246 // srshl z6.s, p0/M, z6.s, z18.s\n"
+ ".inst 0x44828245 // srshl z5.s, p0/M, z5.s, z18.s\n"
+ ".inst 0x44828244 // srshl z4.s, p0/M, z4.s, z18.s\n"
+ ".inst 0x44828243 // srshl z3.s, p0/M, z3.s, z18.s\n"
+ ".inst 0x44828242 // srshl z2.s, p0/M, z2.s, z18.s\n"
+ ".inst 0x44828241 // srshl z1.s, p0/M, z1.s, z18.s\n"
+ ".inst 0x44828240 // srshl z0.s, p0/M, z0.s, z18.s\n"
+ ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ ".inst 0x04b1756b // sqrdmulh z11.s, z11.s, z17.s\n"
+ ".inst 0x04b1754a // sqrdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x04b17529 // sqrdmulh z9.s, z9.s, z17.s\n"
+ ".inst 0x04b17508 // sqrdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x04b174e7 // sqrdmulh z7.s, z7.s, z17.s\n"
+ ".inst 0x04b174c6 // sqrdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
+ ".inst 0x04b17484 // sqrdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x04b17463 // sqrdmulh z3.s, z3.s, z17.s\n"
+ ".inst 0x04b17442 // sqrdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x04b17421 // sqrdmulh z1.s, z1.s, z17.s\n"
+ ".inst 0x04b17400 // sqrdmulh z0.s, z0.s, z17.s\n"
+ "mov z19.s, #0x7f\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "not z16.s, p0/M, z19.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "trn1 z23.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "smin z11.s, p0/M, z11.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z19.s\n"
+ "trn1 z22.h, z11.h, z10.h\n"
+ "smin z9.s, p0/M, z9.s, z19.s\n"
+ "smin z8.s, p0/M, z8.s, z19.s\n"
+ "trn1 z18.h, z9.h, z8.h\n"
+ "smin z7.s, p0/M, z7.s, z19.s\n"
+ "smin z6.s, p0/M, z6.s, z19.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "smin z5.s, p0/M, z5.s, z19.s\n"
+ "smin z4.s, p0/M, z4.s, z19.s\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "smin z3.s, p0/M, z3.s, z19.s\n"
+ "smin z2.s, p0/M, z2.s, z19.s\n"
+ "trn1 z20.h, z3.h, z2.h\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z19.h, z1.h, z0.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x25]\n"
+ "incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 2 inputs loop
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 2 inputs tail
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [right_shift] "r" (&right_shift)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..9667d37954
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_s8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
+
+struct sme_s8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
+{
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
+ sme_s8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_s8q_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..e9b586f4ce
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pooling.hpp"
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_s8q_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const int8_t *const *const inptrs,
+ int8_t *outptr,
+ const Requantize32 &qp
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p2.b, x27, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x80\n"
+ "mov z3.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.b, #0x80\n"
+ "mov z1.b, #0x80\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
+ "smax z23.b, p0/M, z23.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z18.b, p0/M, z18.b, z29.b\n"
+ "smax z22.b, p0/M, z22.b, z28.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "smax z17.b, p0/M, z17.b, z27.b\n"
+ "smax z21.b, p0/M, z21.b, z26.b\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n smax z19.b, p0/M, z19.b, z31.b\n"
+ "smax z23.b, p0/M, z23.b, z30.b\n"
+ "smax z18.b, p0/M, z18.b, z29.b\n"
+ "smax z22.b, p0/M, z22.b, z28.b\n"
+ "smax z17.b, p0/M, z17.b, z27.b\n"
+ "smax z21.b, p0/M, z21.b, z26.b\n"
+ "smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "smax z4.b, p0/M, z4.b, z19.b\n"
+ "smax z3.b, p0/M, z3.b, z18.b\n"
+ "smax z2.b, p0/M, z2.b, z17.b\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "smax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z1.b, p0/M, z1.b, z16.b\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ ".inst 0x4508a097 // sshllb z23.h, z4.b, #0x0\n"
+ ".inst 0x4508a496 // sshllt z22.h, z4.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z4.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a075 // sshllb z21.h, z3.b, #0x0\n"
+ ".inst 0x4508a472 // sshllt z18.h, z3.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a054 // sshllb z20.h, z2.b, #0x0\n"
+ ".inst 0x4508a451 // sshllt z17.h, z2.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a033 // sshllb z19.h, z1.b, #0x0\n"
+ ".inst 0x4508a430 // sshllt z16.h, z1.b, #0x0\n"
+ ".inst 0x4510a2e1 // sshllb z1.s, z23.h, #0x0\n"
+ ".inst 0x4510a6f7 // sshllt z23.s, z23.h, #0x0\n"
+ ".inst 0x4510a2c0 // sshllb z0.s, z22.h, #0x0\n"
+ ".inst 0x4510a6df // sshllt z31.s, z22.h, #0x0\n"
+ ".inst 0x4510a2be // sshllb z30.s, z21.h, #0x0\n"
+ ".inst 0x4510a6b6 // sshllt z22.s, z21.h, #0x0\n"
+ ".inst 0x4510a25d // sshllb z29.s, z18.h, #0x0\n"
+ ".inst 0x4510a652 // sshllt z18.s, z18.h, #0x0\n"
+ ".inst 0x4510a29c // sshllb z28.s, z20.h, #0x0\n"
+ ".inst 0x4510a695 // sshllt z21.s, z20.h, #0x0\n"
+ ".inst 0x4510a23b // sshllb z27.s, z17.h, #0x0\n"
+ ".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
+ ".inst 0x4510a27a // sshllb z26.s, z19.h, #0x0\n"
+ ".inst 0x4510a674 // sshllt z20.s, z19.h, #0x0\n"
+ ".inst 0x4510a219 // sshllb z25.s, z16.h, #0x0\n"
+ ".inst 0x4510a618 // sshllt z24.s, z16.h, #0x0\n"
+ ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
+ ".inst 0x44828097 // srshl z23.s, p0/M, z23.s, z4.s\n"
+ ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
+ ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
+ ".inst 0x4482809e // srshl z30.s, p0/M, z30.s, z4.s\n"
+ ".inst 0x44828096 // srshl z22.s, p0/M, z22.s, z4.s\n"
+ ".inst 0x4482809d // srshl z29.s, p0/M, z29.s, z4.s\n"
+ ".inst 0x44828092 // srshl z18.s, p0/M, z18.s, z4.s\n"
+ ".inst 0x4482809c // srshl z28.s, p0/M, z28.s, z4.s\n"
+ ".inst 0x44828095 // srshl z21.s, p0/M, z21.s, z4.s\n"
+ ".inst 0x4482809b // srshl z27.s, p0/M, z27.s, z4.s\n"
+ ".inst 0x44828091 // srshl z17.s, p0/M, z17.s, z4.s\n"
+ ".inst 0x4482809a // srshl z26.s, p0/M, z26.s, z4.s\n"
+ ".inst 0x44828094 // srshl z20.s, p0/M, z20.s, z4.s\n"
+ ".inst 0x44828099 // srshl z25.s, p0/M, z25.s, z4.s\n"
+ ".inst 0x44828098 // srshl z24.s, p0/M, z24.s, z4.s\n"
+ ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
+ ".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
+ ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
+ ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
+ ".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
+ ".inst 0x04a376d6 // sqrdmulh z22.s, z22.s, z3.s\n"
+ ".inst 0x04a377bd // sqrdmulh z29.s, z29.s, z3.s\n"
+ ".inst 0x04a37652 // sqrdmulh z18.s, z18.s, z3.s\n"
+ ".inst 0x04a3779c // sqrdmulh z28.s, z28.s, z3.s\n"
+ ".inst 0x04a376b5 // sqrdmulh z21.s, z21.s, z3.s\n"
+ ".inst 0x04a3777b // sqrdmulh z27.s, z27.s, z3.s\n"
+ ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
+ ".inst 0x04a3775a // sqrdmulh z26.s, z26.s, z3.s\n"
+ ".inst 0x04a37694 // sqrdmulh z20.s, z20.s, z3.s\n"
+ ".inst 0x04a37739 // sqrdmulh z25.s, z25.s, z3.s\n"
+ ".inst 0x04a37718 // sqrdmulh z24.s, z24.s, z3.s\n"
+ "mov z19.s, #0x7f\n"
+ ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
+ ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
+ ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
+ ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
+ ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
+ ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
+ ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
+ ".inst 0x44828052 // srshl z18.s, p0/M, z18.s, z2.s\n"
+ ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
+ ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
+ ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
+ ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
+ ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
+ ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
+ ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
+ ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
+ "not z16.s, p0/M, z19.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smax z31.s, p0/M, z31.s, z16.s\n"
+ "smax z30.s, p0/M, z30.s, z16.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z29.s, p0/M, z29.s, z16.s\n"
+ "smax z18.s, p0/M, z18.s, z16.s\n"
+ "smax z28.s, p0/M, z28.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z27.s, p0/M, z27.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z25.s, p0/M, z25.s, z16.s\n"
+ "smax z24.s, p0/M, z24.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z23.s, p0/M, z23.s, z19.s\n"
+ "trn1 z23.h, z1.h, z23.h\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "smin z31.s, p0/M, z31.s, z19.s\n"
+ "trn1 z16.h, z0.h, z31.h\n"
+ "smin z30.s, p0/M, z30.s, z19.s\n"
+ "smin z22.s, p0/M, z22.s, z19.s\n"
+ "trn1 z22.h, z30.h, z22.h\n"
+ "smin z29.s, p0/M, z29.s, z19.s\n"
+ "smin z18.s, p0/M, z18.s, z19.s\n"
+ "trn1 z18.h, z29.h, z18.h\n"
+ "smin z28.s, p0/M, z28.s, z19.s\n"
+ "smin z21.s, p0/M, z21.s, z19.s\n"
+ "trn1 z21.h, z28.h, z21.h\n"
+ "smin z27.s, p0/M, z27.s, z19.s\n"
+ "smin z17.s, p0/M, z17.s, z19.s\n"
+ "trn1 z17.h, z27.h, z17.h\n"
+ "smin z26.s, p0/M, z26.s, z19.s\n"
+ "smin z20.s, p0/M, z20.s, z19.s\n"
+ "trn1 z20.h, z26.h, z20.h\n"
+ "smin z25.s, p0/M, z25.s, z19.s\n"
+ "smin z24.s, p0/M, z24.s, z19.s\n"
+ "trn1 z19.h, z25.h, z24.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x28]\n"
+ "incb x28, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n smax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n smax z17.b, p0/M, z17.b, z30.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z4.b, p0/M, z4.b, z16.b\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ ".inst 0x4508a091 // sshllb z17.h, z4.b, #0x0\n"
+ ".inst 0x4508a490 // sshllt z16.h, z4.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a236 // sshllb z22.s, z17.h, #0x0\n"
+ ".inst 0x4510a635 // sshllt z21.s, z17.h, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a214 // sshllb z20.s, z16.h, #0x0\n"
+ ".inst 0x4510a613 // sshllt z19.s, z16.h, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x44828256 // srshl z22.s, p0/M, z22.s, z18.s\n"
+ ".inst 0x44828255 // srshl z21.s, p0/M, z21.s, z18.s\n"
+ ".inst 0x44828254 // srshl z20.s, p0/M, z20.s, z18.s\n"
+ ".inst 0x44828253 // srshl z19.s, p0/M, z19.s, z18.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
+ ".inst 0x04b17673 // sqrdmulh z19.s, z19.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..29a03ec509
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
+
+struct sme_u8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
+{
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
+ sme_u8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_u8_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..f0e7bbf5cc
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+namespace {
+ struct RescaleParams
+ {
+ int32_t multiplier, shift;
+ };
+
+ constexpr RescaleParams rescale_params[8] = {
+ {0x40000000, -0}, // 1/2
+ {0x55555556, -1}, // 1/3
+ {0x40000000, -1}, // 1/4
+ {0x66666666, -2}, // 1/5
+ {0x55555556, -2}, // 1/6
+ {0x49249249, -2}, // 1/7
+ {0x40000000, -2}, // 1/8
+ {0x71c71c72, -3}, // 1/9
+ };
+}
+
+void sme_u8_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *outptr
+)
+{
+ if (n_valid_cells == 1 && window_cells == 1)
+ {
+ // In this case, simply copy from the input to the output
+ std::memcpy(outptr, *inptrs, n_channels);
+ return;
+ }
+
+ // Compute (or look up) the rescale values
+ int32_t shift_value = 0, rescale_value = 0;
+ if (2 <= window_cells && window_cells <= 9)
+ {
+ auto &params = rescale_params[window_cells - 2];
+ rescale_value = params.multiplier;
+ shift_value = params.shift;
+ }
+ else
+ {
+ auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
+
+ shift_value = 0;
+ while (f_rescale_value < 0.5f)
+ {
+ shift_value--;
+ f_rescale_value *= 2.0f;
+ }
+
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
+ {
+ shift_value++;
+ long_rescale_value >>= 1;
+ }
+ rescale_value = static_cast<int32_t>(long_rescale_value);
+ }
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p2.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z8.s, #0x0\n"
+ "mov z7.s, #0x0\n"
+ "mov z6.s, #0x0\n"
+ "mov z5.s, #0x0\n"
+ "mov z4.s, #0x0\n"
+ "mov z3.s, #0x0\n"
+ "mov z2.s, #0x0\n"
+ "mov z1.s, #0x0\n"
+ "mov z0.s, #0x0\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 2 inputs loop
+ ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 2 inputs tail
+ ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
+ ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
+ ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
+ ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ ".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
+ ".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
+ ".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
+ ".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
+ ".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
+ ".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
+ ".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "mov z19.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "trn1 z23.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "smin z11.s, p0/M, z11.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z19.s\n"
+ "trn1 z22.h, z11.h, z10.h\n"
+ "smin z9.s, p0/M, z9.s, z19.s\n"
+ "smin z8.s, p0/M, z8.s, z19.s\n"
+ "trn1 z18.h, z9.h, z8.h\n"
+ "smin z7.s, p0/M, z7.s, z19.s\n"
+ "smin z6.s, p0/M, z6.s, z19.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "smin z5.s, p0/M, z5.s, z19.s\n"
+ "smin z4.s, p0/M, z4.s, z19.s\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "smin z3.s, p0/M, z3.s, z19.s\n"
+ "smin z2.s, p0/M, z2.s, z19.s\n"
+ "trn1 z20.h, z3.h, z2.h\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z19.h, z1.h, z0.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x25]\n"
+ "incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z15.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z13.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 2 inputs loop
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 2 inputs tail
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "ld1rw { z16.s }, p0/Z, [%x[rescale_ptr]]\n"
+ ".inst 0x04b075ef // sqdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqdmulh z12.s, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
new file mode 100644
index 0000000000..3df4e4efb8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+
+struct sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<uint8_t, uint8_t>
+{
+ using Parent = DepthfirstStrategy<uint8_t, uint8_t>;
+
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
+
+ sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
+
+ Parent::KernelType get_kernel(void) const { return sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
new file mode 100644
index 0000000000..9088cbde89
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *const *const outptrs,
+ const bool exclude_padding,
+ const unsigned int pad_left,
+ const unsigned int pad_top,
+ const unsigned int pad_right,
+ const unsigned int pad_bottom
+)
+{
+ struct KernelArgs
+ {
+ const uint64_t n_channels;
+ const uint8_t *const *const inptrs;
+ uint8_t *const *const outptrs;
+ KernelArgs(
+ unsigned int channels,
+ const uint8_t *const *input_ptrs,
+ uint8_t *const * output_ptrs,
+ bool, unsigned int, unsigned int, unsigned int, unsigned int
+ ) : n_channels(channels),
+ inptrs(input_ptrs),
+ outptrs(output_ptrs)
+ {
+ }
+ };
+
+ const KernelArgs args(n_channels, inptrs, outptrs, exclude_padding,
+ pad_left, pad_top, pad_right, pad_bottom);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x15, #0x0\n"
+ "ptrue p2.b\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "mov x14, #0x0\n"
+ "ldr x13, [%x[args], %[offsetof_n_channels]]\n"
+ "whilelt p0.b, x15, x13\n"
+ "ldp x12, x11, [x21, #0x0]\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ld1b { z30.b }, p0/Z, [x27, x15]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ld1b { z29.b }, p0/Z, [x25, x15]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ld1b { z28.b }, p0/Z, [x24, x15]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ld1b { z27.b }, p0/Z, [x21, x15]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1b { z26.b }, p0/Z, [x28, x15]\n"
+ "ld1b { z25.b }, p0/Z, [x26, x15]\n"
+ "ld1b { z24.b }, p0/Z, [x23, x15]\n"
+ "ld1b { z19.b }, p0/Z, [x22, x15]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x15]\n"
+ "incw x15\n"
+ "whilelt p1.b, x15, x13\n"
+ "b.none 2f\n"
+ "1:" // Vector: Loop
+ "movprfx z22, z30\n umax z22.b, p2/M, z22.b, z28.b\n"
+ "movprfx z21, z28\n umax z21.b, p2/M, z21.b, z27.b\n"
+ "ld1b { z30.b }, p1/Z, [x27, x15]\n"
+ "whilelt p0.b, x14, x13\n"
+ "movprfx z18, z29\n umax z18.b, p2/M, z18.b, z26.b\n"
+ "movprfx z17, z25\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "ld1b { z28.b }, p1/Z, [x24, x15]\n"
+ "movprfx z16, z29\n umax z16.b, p2/M, z16.b, z19.b\n"
+ "movprfx z20, z24\n umax z20.b, p2/M, z20.b, z23.b\n"
+ "ld1b { z27.b }, p1/Z, [x21, x15]\n"
+ "ld1b { z29.b }, p1/Z, [x25, x15]\n"
+ "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
+ "movprfx z18, z17\n umax z18.b, p2/M, z18.b, z22.b\n"
+ "ld1b { z26.b }, p1/Z, [x28, x15]\n"
+ "movprfx z17, z16\n umax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z25.b }, p1/Z, [x26, x15]\n"
+ "st1b { z19.b }, p0, [x12, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x23, x15]\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "ld1b { z19.b }, p1/Z, [x22, x15]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
+ "ld1b { z23.b }, p1/Z, [x20, x15]\n"
+ "incw x15\n"
+ "whilelt p1.b, x15, x13\n"
+ "st1b { z16.b }, p0, [x9, x14]\n"
+ "incw x14\n"
+ "b.any 1b\n"
+ "2:" // Vector: Tail
+ "movprfx z22, z30\n umax z22.b, p2/M, z22.b, z28.b\n"
+ "movprfx z21, z28\n umax z21.b, p2/M, z21.b, z27.b\n"
+ "whilelt p0.b, x14, x13\n"
+ "movprfx z20, z29\n umax z20.b, p2/M, z20.b, z26.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z24.b\n"
+ "movprfx z17, z29\n umax z17.b, p2/M, z17.b, z19.b\n"
+ "movprfx z19, z24\n umax z19.b, p2/M, z19.b, z23.b\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "umax z18.b, p2/M, z18.b, z22.b\n"
+ "st1b { z16.b }, p0, [x12, x14]\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z19.b\n"
+ "st1b { z18.b }, p0, [x11, x14]\n"
+ "st1b { z17.b }, p0, [x10, x14]\n"
+ "st1b { z16.b }, p0, [x9, x14]\n"
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..077c8ed2f7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
+
+struct sme_u8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
+{
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
+ sme_u8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_u8_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..06f13e8111
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_u8_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *outptr
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p2.b, x27, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x0\n"
+ "mov z3.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.b, #0x0\n"
+ "mov z1.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
+ "umax z23.b, p0/M, z23.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z18.b, p0/M, z18.b, z29.b\n"
+ "umax z22.b, p0/M, z22.b, z28.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "umax z17.b, p0/M, z17.b, z27.b\n"
+ "umax z21.b, p0/M, z21.b, z26.b\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "umax z4.b, p0/M, z4.b, z19.b\n"
+ "umax z3.b, p0/M, z3.b, z18.b\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "umax z2.b, p0/M, z2.b, z17.b\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
+ "umax z23.b, p0/M, z23.b, z30.b\n"
+ "umax z18.b, p0/M, z18.b, z29.b\n"
+ "umax z22.b, p0/M, z22.b, z28.b\n"
+ "umax z17.b, p0/M, z17.b, z27.b\n"
+ "umax z21.b, p0/M, z21.b, z26.b\n"
+ "umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "umax z4.b, p0/M, z4.b, z19.b\n"
+ "umax z3.b, p0/M, z3.b, z18.b\n"
+ "umax z2.b, p0/M, z2.b, z17.b\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "umax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "st1b { z3.b }, p3, [%x[outptr], x28]\n"
+ "incb x28, ALL, MUL #4\n"
+ "st1b { z2.b }, p2, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "st1b { z1.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z4.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z4.b, p0/M, z4.b, z16.b\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "st1b { z4.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp
new file mode 100644
index 0000000000..bd30a32828
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
+
+struct sme_u8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
+{
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
+ sme_u8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_u8q_nhwc_avg_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..52c52ccdb9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -0,0 +1,489 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pooling.hpp"
+#include <cstdint>
+#include <cstddef>
+#include <cstring>
+#include <cmath>
+
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+namespace {
+ struct RescaleParams
+ {
+ int32_t multiplier, shift;
+ };
+
+ constexpr RescaleParams rescale_params[8] = {
+ {0x40000000, -0}, // 1/2
+ {0x55555556, -1}, // 1/3
+ {0x40000000, -1}, // 1/4
+ {0x66666666, -2}, // 1/5
+ {0x55555556, -2}, // 1/6
+ {0x49249249, -2}, // 1/7
+ {0x40000000, -2}, // 1/8
+ {0x71c71c72, -3}, // 1/9
+ };
+}
+
+void sme_u8q_nhwc_avg_generic_depthfirst_impl(
+ const uint64_t window_cells,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *outptr,
+ const Requantize32 &qp
+)
+{
+ if (n_valid_cells == 1 && window_cells == 1)
+ {
+ // In this case, simply copy from the input to the output
+ std::memcpy(outptr, *inptrs, n_channels);
+ return;
+ }
+
+ // Compute (or look up) the rescale values
+ int32_t shift_value = 0, rescale_value = 0;
+ if (2 <= window_cells && window_cells <= 9)
+ {
+ auto &params = rescale_params[window_cells - 2];
+ rescale_value = params.multiplier;
+ shift_value = params.shift;
+ }
+ else
+ {
+ auto f_rescale_value = 1.0f / static_cast<float>(window_cells);
+
+ shift_value = 0;
+ while (f_rescale_value < 0.5f)
+ {
+ shift_value--;
+ f_rescale_value *= 2.0f;
+ }
+
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
+ {
+ shift_value++;
+ long_rescale_value >>= 1;
+ }
+ rescale_value = static_cast<int32_t>(long_rescale_value);
+ }
+
+
+ // Initialise the accumulators such that the offsets are subtracted for all
+ // valid inputs.
+ const int32_t accumulator_init = -qp.input_offset * n_valid_cells;
+
+ // Combine together the rescale value for the requantization and the scaling
+ // factor for the average pool.
+ const int32_t shift = qp.per_layer_left_shift - qp.per_layer_right_shift + shift_value;
+ const int32_t left_shift = shift > 0 ? shift : 0;
+ const int32_t right_shift = shift <= 0 ? shift : 0;
+
+ int32_t combined_rescale_value = 0;
+ __asm__ __volatile__ (
+ "mov v16.s[0], %w[per_layer_mul]\n"
+ "mov v17.s[0], %w[rescale_value]\n"
+ "sqrdmulh s18, s16, s17\n"
+ "mov %w[combined_rescale_value], v18.s[0]\n"
+ : [combined_rescale_value] "=r" (combined_rescale_value)
+ : [per_layer_mul] "r" (qp.per_layer_mul), [rescale_value] "r" (rescale_value)
+ : "v16", "v17", "v18"
+ );
+
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p2.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z14.d, z15.d\n"
+ "mov z13.d, z15.d\n"
+ "mov z12.d, z15.d\n"
+ "mov z11.d, z15.d\n"
+ "mov x22, %x[inptrs]\n"
+ "mov z10.d, z15.d\n"
+ "mov z9.d, z15.d\n"
+ "mov z8.d, z15.d\n"
+ "mov z7.d, z15.d\n"
+ "mov z6.d, z15.d\n"
+ "mov z5.d, z15.d\n"
+ "mov z4.d, z15.d\n"
+ "mov z3.d, z15.d\n"
+ "mov z2.d, z15.d\n"
+ "mov z1.d, z15.d\n"
+ "mov z0.d, z15.d\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 2 inputs loop
+ ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 2 inputs tail
+ ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
+ ".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
+ ".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
+ ".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
+ ".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
+ ".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
+ ".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ "ld1b { z16.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa13 // ushllb z19.h, z16.b, #0x0\n"
+ ".inst 0x4508ae12 // ushllt z18.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
+ ".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
+ ".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "ld1rw { z19.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482826f // srshl z15.s, p0/M, z15.s, z19.s\n"
+ ".inst 0x4482826e // srshl z14.s, p0/M, z14.s, z19.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x4482826d // srshl z13.s, p0/M, z13.s, z19.s\n"
+ ".inst 0x4482826c // srshl z12.s, p0/M, z12.s, z19.s\n"
+ "ld1rw { z18.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482826b // srshl z11.s, p0/M, z11.s, z19.s\n"
+ ".inst 0x4482826a // srshl z10.s, p0/M, z10.s, z19.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x44828269 // srshl z9.s, p0/M, z9.s, z19.s\n"
+ ".inst 0x44828268 // srshl z8.s, p0/M, z8.s, z19.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x44828267 // srshl z7.s, p0/M, z7.s, z19.s\n"
+ ".inst 0x44828266 // srshl z6.s, p0/M, z6.s, z19.s\n"
+ ".inst 0x44828265 // srshl z5.s, p0/M, z5.s, z19.s\n"
+ ".inst 0x44828264 // srshl z4.s, p0/M, z4.s, z19.s\n"
+ ".inst 0x44828263 // srshl z3.s, p0/M, z3.s, z19.s\n"
+ ".inst 0x44828262 // srshl z2.s, p0/M, z2.s, z19.s\n"
+ ".inst 0x44828261 // srshl z1.s, p0/M, z1.s, z19.s\n"
+ ".inst 0x44828260 // srshl z0.s, p0/M, z0.s, z19.s\n"
+ ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
+ ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
+ ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
+ ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
+ ".inst 0x04b2756b // sqrdmulh z11.s, z11.s, z18.s\n"
+ ".inst 0x04b2754a // sqrdmulh z10.s, z10.s, z18.s\n"
+ ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
+ ".inst 0x04b27508 // sqrdmulh z8.s, z8.s, z18.s\n"
+ ".inst 0x04b274e7 // sqrdmulh z7.s, z7.s, z18.s\n"
+ ".inst 0x04b274c6 // sqrdmulh z6.s, z6.s, z18.s\n"
+ ".inst 0x04b274a5 // sqrdmulh z5.s, z5.s, z18.s\n"
+ ".inst 0x04b27484 // sqrdmulh z4.s, z4.s, z18.s\n"
+ ".inst 0x04b27463 // sqrdmulh z3.s, z3.s, z18.s\n"
+ ".inst 0x04b27442 // sqrdmulh z2.s, z2.s, z18.s\n"
+ ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
+ ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
+ ".inst 0x4482822b // srshl z11.s, p0/M, z11.s, z17.s\n"
+ ".inst 0x4482822a // srshl z10.s, p0/M, z10.s, z17.s\n"
+ ".inst 0x44828229 // srshl z9.s, p0/M, z9.s, z17.s\n"
+ ".inst 0x44828228 // srshl z8.s, p0/M, z8.s, z17.s\n"
+ ".inst 0x44828227 // srshl z7.s, p0/M, z7.s, z17.s\n"
+ ".inst 0x44828226 // srshl z6.s, p0/M, z6.s, z17.s\n"
+ ".inst 0x44828225 // srshl z5.s, p0/M, z5.s, z17.s\n"
+ ".inst 0x44828224 // srshl z4.s, p0/M, z4.s, z17.s\n"
+ ".inst 0x44828223 // srshl z3.s, p0/M, z3.s, z17.s\n"
+ ".inst 0x44828222 // srshl z2.s, p0/M, z2.s, z17.s\n"
+ ".inst 0x44828221 // srshl z1.s, p0/M, z1.s, z17.s\n"
+ ".inst 0x44828220 // srshl z0.s, p0/M, z0.s, z17.s\n"
+ "add z15.s, z15.s, z16.s\n"
+ "add z14.s, z14.s, z16.s\n"
+ "add z13.s, z13.s, z16.s\n"
+ "add z12.s, z12.s, z16.s\n"
+ "add z11.s, z11.s, z16.s\n"
+ "add z10.s, z10.s, z16.s\n"
+ "add z9.s, z9.s, z16.s\n"
+ "add z8.s, z8.s, z16.s\n"
+ "add z7.s, z7.s, z16.s\n"
+ "add z6.s, z6.s, z16.s\n"
+ "add z5.s, z5.s, z16.s\n"
+ "add z4.s, z4.s, z16.s\n"
+ "add z3.s, z3.s, z16.s\n"
+ "add z2.s, z2.s, z16.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "mov z19.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z19.s\n"
+ "smin z14.s, p0/M, z14.s, z19.s\n"
+ "trn1 z23.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z19.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "smin z11.s, p0/M, z11.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z19.s\n"
+ "trn1 z22.h, z11.h, z10.h\n"
+ "smin z9.s, p0/M, z9.s, z19.s\n"
+ "smin z8.s, p0/M, z8.s, z19.s\n"
+ "trn1 z18.h, z9.h, z8.h\n"
+ "smin z7.s, p0/M, z7.s, z19.s\n"
+ "smin z6.s, p0/M, z6.s, z19.s\n"
+ "trn1 z21.h, z7.h, z6.h\n"
+ "smin z5.s, p0/M, z5.s, z19.s\n"
+ "smin z4.s, p0/M, z4.s, z19.s\n"
+ "trn1 z17.h, z5.h, z4.h\n"
+ "smin z3.s, p0/M, z3.s, z19.s\n"
+ "smin z2.s, p0/M, z2.s, z19.s\n"
+ "trn1 z20.h, z3.h, z2.h\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z19.h, z1.h, z0.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x25]\n"
+ "incb x25, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
+ "mov z14.d, z15.d\n"
+ "mov z13.d, z15.d\n"
+ "mov z12.d, z15.d\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 2 inputs loop
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 2 inputs tail
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x1\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
+ "add z15.s, z15.s, z16.s\n"
+ "add z14.s, z14.s, z16.s\n"
+ "add z13.s, z13.s, z16.s\n"
+ "add z12.s, z12.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
+ "trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
+ "trn1 z16.h, z13.h, z12.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [accumulator_init] "r" (&accumulator_init), [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [outptr] "r" (outptr), [quant_params] "r" (&qp), [right_shift] "r" (&right_shift)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp
new file mode 100644
index 0000000000..69d627c047
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+void sme_u8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
+
+struct sme_u8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
+{
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
+ sme_u8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sme_u8q_nhwc_max_generic_depthfirst_impl; }
+};
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
new file mode 100644
index 0000000000..c8e8e7d399
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sme_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -0,0 +1,418 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "pooling.hpp"
+#include <cstdint>
+#include <cstddef>
+
+#if defined(ARM_COMPUTE_ENABLE_SME)
+
+namespace arm_conv {
+namespace pooling {
+
+
+void sme_u8q_nhwc_max_generic_depthfirst_impl(
+ const uint64_t,
+ const uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const uint8_t *const *const inptrs,
+ uint8_t *outptr,
+ const Requantize32 &qp
+)
+{
+ __asm__ __volatile__(
+ ".inst 0xd503477f // SMSTART ZA\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p2.b, x27, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "ptrue p0.b\n"
+ "b.none 7f\n"
+ "1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov z3.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z2.b, #0x0\n"
+ "mov z1.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "beq 3f\n"
+ "2:" // 4-vectors of channels: 4 inputs loop
+ "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
+ "umax z23.b, p0/M, z23.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z18.b, p0/M, z18.b, z29.b\n"
+ "umax z22.b, p0/M, z22.b, z28.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "umax z17.b, p0/M, z17.b, z27.b\n"
+ "umax z21.b, p0/M, z21.b, z26.b\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "umax z5.b, p0/M, z5.b, z19.b\n"
+ "umax z3.b, p0/M, z3.b, z18.b\n"
+ "ld1b { z18.b }, p3/Z, [x23, x28]\n"
+ "umax z2.b, p0/M, z2.b, z17.b\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "ld1b { z29.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z17.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z16.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
+ "bgt 2b\n"
+ "3:" // 4-vectors of channels: 4 inputs tail
+ "movprfx z19, z0\n umax z19.b, p0/M, z19.b, z31.b\n"
+ "umax z23.b, p0/M, z23.b, z30.b\n"
+ "umax z18.b, p0/M, z18.b, z29.b\n"
+ "umax z22.b, p0/M, z22.b, z28.b\n"
+ "umax z17.b, p0/M, z17.b, z27.b\n"
+ "umax z21.b, p0/M, z21.b, z26.b\n"
+ "umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "umax z5.b, p0/M, z5.b, z19.b\n"
+ "umax z3.b, p0/M, z3.b, z18.b\n"
+ "umax z2.b, p0/M, z2.b, z17.b\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "4:" // 4-vectors of channels: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 6f\n"
+ "5:" // 4-vectors of channels: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "ld1b { z16.b }, p3/Z, [x20, x28]\n"
+ "umax z3.b, p0/M, z3.b, z16.b\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z2.b, p0/M, z2.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z1.b, p0/M, z1.b, z16.b\n"
+ "bgt 5b\n"
+ "6:" // 4-vectors of channels: Single input loop: End
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1rw { z4.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a8b7 // ushllb z23.h, z5.b, #0x0\n"
+ ".inst 0x4508acb9 // ushllt z25.h, z5.b, #0x0\n"
+ ".inst 0x4508a876 // ushllb z22.h, z3.b, #0x0\n"
+ ".inst 0x4508ac72 // ushllt z18.h, z3.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a855 // ushllb z21.h, z2.b, #0x0\n"
+ ".inst 0x4508ac51 // ushllt z17.h, z2.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a834 // ushllb z20.h, z1.b, #0x0\n"
+ ".inst 0x4508ac38 // ushllt z24.h, z1.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z19.s }, p0/Z, [x20]\n"
+ "neg z4.s, p0/M, z4.s\n"
+ ".inst 0x45974081 // saddwb z1.s, z4.s, z23.h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x45974497 // saddwt z23.s, z4.s, z23.h\n"
+ ".inst 0x45994080 // saddwb z0.s, z4.s, z25.h\n"
+ ".inst 0x4599449f // saddwt z31.s, z4.s, z25.h\n"
+ ".inst 0x4596409e // saddwb z30.s, z4.s, z22.h\n"
+ ".inst 0x45964496 // saddwt z22.s, z4.s, z22.h\n"
+ ".inst 0x4592409d // saddwb z29.s, z4.s, z18.h\n"
+ ".inst 0x45924492 // saddwt z18.s, z4.s, z18.h\n"
+ ".inst 0x4595409c // saddwb z28.s, z4.s, z21.h\n"
+ ".inst 0x45954495 // saddwt z21.s, z4.s, z21.h\n"
+ ".inst 0x4591409b // saddwb z27.s, z4.s, z17.h\n"
+ ".inst 0x45914491 // saddwt z17.s, z4.s, z17.h\n"
+ ".inst 0x4594409a // saddwb z26.s, z4.s, z20.h\n"
+ ".inst 0x45944494 // saddwt z20.s, z4.s, z20.h\n"
+ ".inst 0x45984099 // saddwb z25.s, z4.s, z24.h\n"
+ ".inst 0x45984498 // saddwt z24.s, z4.s, z24.h\n"
+ ".inst 0x44828061 // srshl z1.s, p0/M, z1.s, z3.s\n"
+ ".inst 0x44828077 // srshl z23.s, p0/M, z23.s, z3.s\n"
+ ".inst 0x44828060 // srshl z0.s, p0/M, z0.s, z3.s\n"
+ ".inst 0x4482807f // srshl z31.s, p0/M, z31.s, z3.s\n"
+ ".inst 0x4482807e // srshl z30.s, p0/M, z30.s, z3.s\n"
+ ".inst 0x44828076 // srshl z22.s, p0/M, z22.s, z3.s\n"
+ ".inst 0x4482807d // srshl z29.s, p0/M, z29.s, z3.s\n"
+ ".inst 0x44828072 // srshl z18.s, p0/M, z18.s, z3.s\n"
+ ".inst 0x4482807c // srshl z28.s, p0/M, z28.s, z3.s\n"
+ ".inst 0x44828075 // srshl z21.s, p0/M, z21.s, z3.s\n"
+ ".inst 0x4482807b // srshl z27.s, p0/M, z27.s, z3.s\n"
+ ".inst 0x44828071 // srshl z17.s, p0/M, z17.s, z3.s\n"
+ ".inst 0x4482807a // srshl z26.s, p0/M, z26.s, z3.s\n"
+ ".inst 0x44828074 // srshl z20.s, p0/M, z20.s, z3.s\n"
+ ".inst 0x44828079 // srshl z25.s, p0/M, z25.s, z3.s\n"
+ ".inst 0x44828078 // srshl z24.s, p0/M, z24.s, z3.s\n"
+ ".inst 0x04a27421 // sqrdmulh z1.s, z1.s, z2.s\n"
+ ".inst 0x04a276f7 // sqrdmulh z23.s, z23.s, z2.s\n"
+ ".inst 0x04a27400 // sqrdmulh z0.s, z0.s, z2.s\n"
+ ".inst 0x04a277ff // sqrdmulh z31.s, z31.s, z2.s\n"
+ ".inst 0x04a277de // sqrdmulh z30.s, z30.s, z2.s\n"
+ ".inst 0x04a276d6 // sqrdmulh z22.s, z22.s, z2.s\n"
+ ".inst 0x04a277bd // sqrdmulh z29.s, z29.s, z2.s\n"
+ ".inst 0x04a27652 // sqrdmulh z18.s, z18.s, z2.s\n"
+ ".inst 0x04a2779c // sqrdmulh z28.s, z28.s, z2.s\n"
+ ".inst 0x04a276b5 // sqrdmulh z21.s, z21.s, z2.s\n"
+ ".inst 0x04a2777b // sqrdmulh z27.s, z27.s, z2.s\n"
+ ".inst 0x04a27631 // sqrdmulh z17.s, z17.s, z2.s\n"
+ ".inst 0x04a2775a // sqrdmulh z26.s, z26.s, z2.s\n"
+ ".inst 0x04a27694 // sqrdmulh z20.s, z20.s, z2.s\n"
+ ".inst 0x04a27739 // sqrdmulh z25.s, z25.s, z2.s\n"
+ ".inst 0x04a27718 // sqrdmulh z24.s, z24.s, z2.s\n"
+ ".inst 0x44828261 // srshl z1.s, p0/M, z1.s, z19.s\n"
+ ".inst 0x44828277 // srshl z23.s, p0/M, z23.s, z19.s\n"
+ ".inst 0x44828260 // srshl z0.s, p0/M, z0.s, z19.s\n"
+ ".inst 0x4482827f // srshl z31.s, p0/M, z31.s, z19.s\n"
+ ".inst 0x4482827e // srshl z30.s, p0/M, z30.s, z19.s\n"
+ ".inst 0x44828276 // srshl z22.s, p0/M, z22.s, z19.s\n"
+ ".inst 0x4482827d // srshl z29.s, p0/M, z29.s, z19.s\n"
+ ".inst 0x44828272 // srshl z18.s, p0/M, z18.s, z19.s\n"
+ ".inst 0x4482827c // srshl z28.s, p0/M, z28.s, z19.s\n"
+ ".inst 0x44828275 // srshl z21.s, p0/M, z21.s, z19.s\n"
+ ".inst 0x4482827b // srshl z27.s, p0/M, z27.s, z19.s\n"
+ ".inst 0x44828271 // srshl z17.s, p0/M, z17.s, z19.s\n"
+ ".inst 0x4482827a // srshl z26.s, p0/M, z26.s, z19.s\n"
+ ".inst 0x44828274 // srshl z20.s, p0/M, z20.s, z19.s\n"
+ ".inst 0x44828279 // srshl z25.s, p0/M, z25.s, z19.s\n"
+ ".inst 0x44828278 // srshl z24.s, p0/M, z24.s, z19.s\n"
+ "add z1.s, z1.s, z16.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z29.s, z29.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z17.s, z17.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z25.s, z25.s, z16.s\n"
+ "add z24.s, z24.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "mov z19.s, #0xff\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smax z31.s, p0/M, z31.s, z16.s\n"
+ "smax z30.s, p0/M, z30.s, z16.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z29.s, p0/M, z29.s, z16.s\n"
+ "smax z18.s, p0/M, z18.s, z16.s\n"
+ "smax z28.s, p0/M, z28.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z27.s, p0/M, z27.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z25.s, p0/M, z25.s, z16.s\n"
+ "smax z24.s, p0/M, z24.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z19.s\n"
+ "smin z23.s, p0/M, z23.s, z19.s\n"
+ "smin z0.s, p0/M, z0.s, z19.s\n"
+ "trn1 z23.h, z1.h, z23.h\n"
+ "smin z31.s, p0/M, z31.s, z19.s\n"
+ "smin z30.s, p0/M, z30.s, z19.s\n"
+ "trn1 z16.h, z0.h, z31.h\n"
+ "smin z22.s, p0/M, z22.s, z19.s\n"
+ "smin z29.s, p0/M, z29.s, z19.s\n"
+ "trn1 z22.h, z30.h, z22.h\n"
+ "smin z18.s, p0/M, z18.s, z19.s\n"
+ "smin z28.s, p0/M, z28.s, z19.s\n"
+ "trn1 z18.h, z29.h, z18.h\n"
+ "smin z21.s, p0/M, z21.s, z19.s\n"
+ "smin z27.s, p0/M, z27.s, z19.s\n"
+ "trn1 z21.h, z28.h, z21.h\n"
+ "smin z17.s, p0/M, z17.s, z19.s\n"
+ "smin z26.s, p0/M, z26.s, z19.s\n"
+ "trn1 z17.h, z27.h, z17.h\n"
+ "smin z20.s, p0/M, z20.s, z19.s\n"
+ "smin z25.s, p0/M, z25.s, z19.s\n"
+ "trn1 z20.h, z26.h, z20.h\n"
+ "smin z24.s, p0/M, z24.s, z19.s\n"
+ "trn1 z19.h, z25.h, z24.h\n"
+ "trn1 z16.b, z23.b, z16.b\n"
+ "trn1 z18.b, z22.b, z18.b\n"
+ "trn1 z17.b, z21.b, z17.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
+ "trn1 z16.b, z20.b, z19.b\n"
+ "st1b { z18.b }, p3, [%x[outptr], x28]\n"
+ "incb x28, ALL, MUL #4\n"
+ "st1b { z17.b }, p2, [%x[outptr], x27]\n"
+ "incb x27, ALL, MUL #4\n"
+ "st1b { z16.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "b.any 1b\n"
+ "7:" // Single vector of channels
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.none 14f\n"
+ "8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x20, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z0.b }, p4/Z, [x20, x9]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "beq 10f\n"
+ "9:" // Single vector of channels: Loop: 4 inputs loop
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z0.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z31.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z23.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x9]\n"
+ "bgt 9b\n"
+ "10:" // Single vector of channels: Loop: 4 inputs tail
+ "movprfx z16, z0\n umax z16.b, p0/M, z16.b, z31.b\n"
+ "movprfx z17, z23\n umax z17.b, p0/M, z17.b, z30.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "11:" // Single vector of channels: Loop: After loop
+ "ands x21, %x[n_valid_cells], #0x3\n"
+ "beq 13f\n"
+ "12:" // Single vector of channels: Loop: Single input loop
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "bgt 12b\n"
+ "13:" // Single vector of channels: Loop: Single input loop: End
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a8b1 // ushllb z17.h, z5.b, #0x0\n"
+ ".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
+ "neg z18.s, p0/M, z18.s\n"
+ ".inst 0x45914257 // saddwb z23.s, z18.s, z17.h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z22.s }, p0/Z, [x20]\n"
+ ".inst 0x45914655 // saddwt z21.s, z18.s, z17.h\n"
+ ".inst 0x45904254 // saddwb z20.s, z18.s, z16.h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z19.s }, p0/Z, [x20]\n"
+ ".inst 0x45904652 // saddwt z18.s, z18.s, z16.h\n"
+ ".inst 0x448282d7 // srshl z23.s, p0/M, z23.s, z22.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x448282d5 // srshl z21.s, p0/M, z21.s, z22.s\n"
+ ".inst 0x448282d4 // srshl z20.s, p0/M, z20.s, z22.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x448282d2 // srshl z18.s, p0/M, z18.s, z22.s\n"
+ ".inst 0x04b376f7 // sqrdmulh z23.s, z23.s, z19.s\n"
+ ".inst 0x04b376b5 // sqrdmulh z21.s, z21.s, z19.s\n"
+ ".inst 0x04b37694 // sqrdmulh z20.s, z20.s, z19.s\n"
+ ".inst 0x04b37652 // sqrdmulh z18.s, z18.s, z19.s\n"
+ ".inst 0x44828237 // srshl z23.s, p0/M, z23.s, z17.s\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z23.s, p0/M, z23.s, z17.s\n"
+ "smax z21.s, p0/M, z21.s, z17.s\n"
+ "smax z20.s, p0/M, z20.s, z17.s\n"
+ "smax z18.s, p0/M, z18.s, z17.s\n"
+ "smin z23.s, p0/M, z23.s, z16.s\n"
+ "smin z21.s, p0/M, z21.s, z16.s\n"
+ "smin z20.s, p0/M, z20.s, z16.s\n"
+ "trn1 z17.h, z23.h, z21.h\n"
+ "smin z18.s, p0/M, z18.s, z16.s\n"
+ "trn1 z16.h, z20.h, z18.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
+ "b.any 8b\n"
+ "14:" // End
+ ".inst 0xd503467f // SMSTOP\n"
+ :
+ : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace pooling
+} // namespace arm_conv
+
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 8c7a497376..f8293233e6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst
+struct sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
- typedef void (*kern_type)(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
+ sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 3; }
- constexpr static unsigned int pool_cols(void) { return 3; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl;
-
- sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 3c1858633b..1ba78f3fba 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -82,126 +82,126 @@ void sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x3, [%x[args], %[offsetof_n_channels]]\n"
- "mov x4, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x5, #0x0\n"
- "ldr x6, [%x[args], %[offsetof_inptrs]]\n"
- "mov x19, #0x4\n"
- "add x7, %x[args], %[offsetof_rescale]\n"
- "ldp x8, x17, [x20, #0x0]\n"
- "ldp x16, x15, [x20, #0x10]\n"
- "whilelt p0.h, XZR, x19\n"
- "ldp x14, x13, [x6, #0x0]\n"
- "whilelt p1.h, x4, x3\n"
- "ldp x12, x11, [x6, #0x10]\n"
- "ldp x10, x9, [x6, #0x20]\n"
- "ldp x28, x27, [x6, #0x30]\n"
- "ldp x26, x25, [x6, #0x40]\n"
- "ldp x24, x23, [x6, #0x50]\n"
- "ldp x22, x21, [x6, #0x60]\n"
- "ldp x20, x19, [x6, #0x70]\n"
- "ld1rqh { z7.h }, p0/Z, [x7]\n"
- "ld1h { z8.h }, p1/Z, [x9, x4, LSL #1]\n"
- "ld1h { z6.h }, p1/Z, [x28, x4, LSL #1]\n"
- "ld1h { z5.h }, p1/Z, [x25, x4, LSL #1]\n"
- "ld1h { z4.h }, p1/Z, [x24, x4, LSL #1]\n"
- "ld1h { z3.h }, p1/Z, [x13, x4, LSL #1]\n"
- "ld1h { z2.h }, p1/Z, [x12, x4, LSL #1]\n"
- "ld1h { z1.h }, p1/Z, [x10, x4, LSL #1]\n"
- "ld1h { z0.h }, p1/Z, [x26, x4, LSL #1]\n"
- "ld1h { z31.h }, p1/Z, [x27, x4, LSL #1]\n"
- "ld1h { z30.h }, p1/Z, [x23, x4, LSL #1]\n"
- "ld1h { z29.h }, p1/Z, [x21, x4, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x20, x4, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x14, x4, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x11, x4, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x22, x4, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x19, x4, LSL #1]\n"
- "incw x4\n"
- "whilelt p1.h, x4, x3\n"
+ "ldr x2, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x3, #0x0\n"
+ "mov x20, #0x4\n"
+ "ldr x4, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x5, x6, [x21, #0x0]\n"
+ "whilelt p2.h, XZR, x20\n"
+ "whilelt p0.h, x3, x2\n"
+ "ldp x7, x8, [x21, #0x10]\n"
+ "ldp x17, x16, [x4, #0x0]\n"
+ "add x15, %x[args], %[offsetof_rescale]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x4, #0x10]\n"
+ "ldp x11, x10, [x4, #0x20]\n"
+ "ldp x9, x28, [x4, #0x30]\n"
+ "ldp x27, x26, [x4, #0x40]\n"
+ "ldp x25, x24, [x4, #0x50]\n"
+ "ldp x23, x22, [x4, #0x60]\n"
+ "ldp x21, x20, [x4, #0x70]\n"
+ "ld1h { z7.h }, p0/Z, [x10, x3, LSL #1]\n"
+ "ld1h { z6.h }, p0/Z, [x9, x3, LSL #1]\n"
+ "ld1h { z5.h }, p0/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z4.h }, p0/Z, [x25, x3, LSL #1]\n"
+ "ld1h { z3.h }, p0/Z, [x16, x3, LSL #1]\n"
+ "ld1h { z2.h }, p0/Z, [x13, x3, LSL #1]\n"
+ "ld1h { z1.h }, p0/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z31.h }, p0/Z, [x27, x3, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x24, x3, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x21, x3, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x17, x3, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x12, x3, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
+ "whilelt p1.h, x3, x2\n"
+ "ld1rqh { z0.h }, p2/Z, [x15]\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "fadd z17.h, z8.h, z6.h\n"
- "ld1h { z8.h }, p1/Z, [x9, x4, LSL #1]\n"
- "whilelt p0.h, x5, x3\n"
+ "fadd z17.h, z7.h, z6.h\n"
"fadd z16.h, z5.h, z4.h\n"
- "ld1h { z6.h }, p1/Z, [x28, x4, LSL #1]\n"
+ "ld1h { z7.h }, p1/Z, [x10, x3, LSL #1]\n"
+ "ld1h { z6.h }, p1/Z, [x9, x3, LSL #1]\n"
+ "fadd z19.h, z17.h, z16.h\n"
"fadd z18.h, z3.h, z2.h\n"
- "ld1h { z5.h }, p1/Z, [x25, x4, LSL #1]\n"
- "fadd z23.h, z1.h, z0.h\n"
- "ld1h { z4.h }, p1/Z, [x24, x4, LSL #1]\n"
- "fadd z22.h, z31.h, z30.h\n"
- "ld1h { z3.h }, p1/Z, [x13, x4, LSL #1]\n"
- "fadd z17.h, z17.h, z16.h\n"
- "ld1h { z2.h }, p1/Z, [x12, x4, LSL #1]\n"
- "fadd z16.h, z29.h, z28.h\n"
- "ld1h { z1.h }, p1/Z, [x10, x4, LSL #1]\n"
- "fadd z19.h, z27.h, z23.h\n"
- "ld1h { z0.h }, p1/Z, [x26, x4, LSL #1]\n"
- "fadd z21.h, z18.h, z17.h\n"
- "ld1h { z31.h }, p1/Z, [x27, x4, LSL #1]\n"
- "fadd z20.h, z16.h, z17.h\n"
- "ld1h { z30.h }, p1/Z, [x23, x4, LSL #1]\n"
- "fadd z18.h, z26.h, z22.h\n"
- "ld1h { z29.h }, p1/Z, [x21, x4, LSL #1]\n"
- "fadd z17.h, z25.h, z23.h\n"
- "ld1h { z28.h }, p1/Z, [x20, x4, LSL #1]\n"
- "fadd z16.h, z24.h, z22.h\n"
- "ld1h { z27.h }, p1/Z, [x14, x4, LSL #1]\n"
+ "ld1h { z5.h }, p1/Z, [x26, x3, LSL #1]\n"
+ "ld1h { z4.h }, p1/Z, [x25, x3, LSL #1]\n"
+ "fadd z17.h, z1.h, z31.h\n"
+ "fadd z22.h, z30.h, z29.h\n"
+ "ld1h { z3.h }, p1/Z, [x16, x3, LSL #1]\n"
+ "ld1h { z2.h }, p1/Z, [x13, x3, LSL #1]\n"
+ "fadd z16.h, z28.h, z27.h\n"
+ "fadd z21.h, z18.h, z19.h\n"
+ "ld1h { z1.h }, p1/Z, [x11, x3, LSL #1]\n"
+ "ld1h { z31.h }, p1/Z, [x27, x3, LSL #1]\n"
+ "fadd z20.h, z16.h, z19.h\n"
+ "fadd z19.h, z26.h, z17.h\n"
+ "ld1h { z30.h }, p1/Z, [x28, x3, LSL #1]\n"
+ "ld1h { z29.h }, p1/Z, [x24, x3, LSL #1]\n"
+ "fadd z18.h, z25.h, z22.h\n"
+ "fadd z17.h, z24.h, z17.h\n"
+ "ld1h { z28.h }, p1/Z, [x22, x3, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x21, x3, LSL #1]\n"
+ "fadd z16.h, z23.h, z22.h\n"
+ "ld1h { z26.h }, p1/Z, [x17, x3, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x12, x3, LSL #1]\n"
"fadd z19.h, z21.h, z19.h\n"
- "ld1h { z26.h }, p1/Z, [x11, x4, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x23, x3, LSL #1]\n"
+ "ld1h { z23.h }, p1/Z, [x20, x3, LSL #1]\n"
+ "incw x3\n"
"fadd z18.h, z21.h, z18.h\n"
- "ld1h { z25.h }, p1/Z, [x22, x4, LSL #1]\n"
"fadd z17.h, z17.h, z20.h\n"
- "ld1h { z24.h }, p1/Z, [x19, x4, LSL #1]\n"
- "incw x4\n"
- "fadd z16.h, z20.h, z16.h\n"
- "whilelt p1.h, x4, x3\n"
- "fmul z19.h, z19.h, z7.h[0]\n"
- "st1h { z19.h }, p0, [x8, x5, LSL #1]\n"
- "fmul z18.h, z18.h, z7.h[1]\n"
- "fmul z17.h, z17.h, z7.h[2]\n"
- "st1h { z18.h }, p0, [x17, x5, LSL #1]\n"
- "fmul z16.h, z16.h, z7.h[3]\n"
- "st1h { z17.h }, p0, [x16, x5, LSL #1]\n"
- "st1h { z16.h }, p0, [x15, x5, LSL #1]\n"
- "incw x5\n"
+ "fadd z16.h, z16.h, z20.h\n"
+ "whilelt p0.h, x14, x2\n"
+ "whilelt p1.h, x3, x2\n"
+ "fmul z19.h, z19.h, z0.h[0]\n"
+ "fmul z18.h, z18.h, z0.h[1]\n"
+ "st1h { z19.h }, p0, [x5, x14, LSL #1]\n"
+ "fmul z17.h, z17.h, z0.h[2]\n"
+ "fmul z16.h, z16.h, z0.h[3]\n"
+ "st1h { z18.h }, p0, [x6, x14, LSL #1]\n"
+ "st1h { z17.h }, p0, [x7, x14, LSL #1]\n"
+ "st1h { z16.h }, p0, [x8, x14, LSL #1]\n"
+ "incw x14\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "fadd z17.h, z8.h, z6.h\n"
- "whilelt p0.h, x5, x3\n"
+ "fadd z17.h, z7.h, z6.h\n"
"fadd z16.h, z5.h, z4.h\n"
+ "whilelt p0.h, x14, x2\n"
+ "fadd z20.h, z17.h, z16.h\n"
"fadd z18.h, z3.h, z2.h\n"
- "fadd z23.h, z1.h, z0.h\n"
- "fadd z17.h, z17.h, z16.h\n"
- "fadd z22.h, z31.h, z30.h\n"
- "fadd z16.h, z29.h, z28.h\n"
- "fadd z21.h, z18.h, z17.h\n"
- "fadd z19.h, z27.h, z23.h\n"
- "fadd z20.h, z16.h, z17.h\n"
- "fadd z18.h, z26.h, z22.h\n"
- "fadd z17.h, z25.h, z23.h\n"
- "fadd z16.h, z24.h, z22.h\n"
- "fadd z19.h, z21.h, z19.h\n"
+ "fadd z17.h, z1.h, z31.h\n"
+ "fadd z19.h, z30.h, z29.h\n"
+ "fadd z16.h, z28.h, z27.h\n"
+ "fadd z21.h, z18.h, z20.h\n"
+ "fadd z20.h, z16.h, z20.h\n"
+ "fadd z16.h, z26.h, z17.h\n"
+ "fadd z18.h, z25.h, z19.h\n"
+ "fadd z17.h, z24.h, z17.h\n"
+ "fadd z19.h, z23.h, z19.h\n"
+ "fadd z16.h, z21.h, z16.h\n"
+ "fmul z16.h, z16.h, z0.h[0]\n"
+ "st1h { z16.h }, p0, [x5, x14, LSL #1]\n"
"fadd z18.h, z21.h, z18.h\n"
"fadd z17.h, z17.h, z20.h\n"
- "fadd z16.h, z20.h, z16.h\n"
- "fmul z19.h, z19.h, z7.h[0]\n"
- "st1h { z19.h }, p0, [x8, x5, LSL #1]\n"
- "fmul z18.h, z18.h, z7.h[1]\n"
- "fmul z17.h, z17.h, z7.h[2]\n"
- "st1h { z18.h }, p0, [x17, x5, LSL #1]\n"
- "fmul z16.h, z16.h, z7.h[3]\n"
- "st1h { z17.h }, p0, [x16, x5, LSL #1]\n"
- "st1h { z16.h }, p0, [x15, x5, LSL #1]\n"
+ "fmul z18.h, z18.h, z0.h[1]\n"
+ "fmul z17.h, z17.h, z0.h[2]\n"
+ "fadd z16.h, z19.h, z20.h\n"
+ "fmul z16.h, z16.h, z0.h[3]\n"
+ "st1h { z18.h }, p0, [x6, x14, LSL #1]\n"
+ "st1h { z17.h }, p0, [x7, x14, LSL #1]\n"
+ "st1h { z16.h }, p0, [x8, x14, LSL #1]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "p0", "p1", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp
index 391d47cf41..49231484e6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
void sve_fp16_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-struct sve_fp16_nhwc_avg_generic_depthfirst
+struct sve_fp16_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_fp16_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
sve_fp16_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_fp16_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
index 84a6acf80d..2bef44ea5c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -41,88 +42,88 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<__fp16>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
+ "mov x9, #0x0\n"
+ "cnth x28\n"
+ "cnth x27, ALL, MUL #2\n"
+ "cnth x26, ALL, MUL #3\n"
"ptrue p0.b\n"
- "ld1rh { z8.h }, p0/Z, [%x[rescale_ptr]]\n"
- "mov x28, #0x0\n"
- "cnth x27\n"
- "cnth x26, ALL, MUL #2\n"
- "cnth x25, ALL, MUL #3\n"
- "whilelt p3.h, x28, %x[n_channels]\n"
- "whilelt p2.h, x27, %x[n_channels]\n"
- "whilelt p1.h, x26, %x[n_channels]\n"
- "whilelt p0.h, x25, %x[n_channels]\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
+ "ld1rh { z7.h }, p0/Z, [%x[rescale_ptr]]\n"
+ "whilelt p2.h, x28, %x[n_channels]\n"
+ "whilelt p1.h, x27, %x[n_channels]\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"mov z4.b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z30.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z28.h }, p2/Z, [x20, x27, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x21, x26, LSL #1]\n"
- "ld1h { z17.h }, p1/Z, [x20, x26, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x23, x25, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x22, x25, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x21, x25, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x20, x25, LSL #1]\n"
+ "mov z3.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z30.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd z23.h, z3.h, z2.h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd z19.h, z1.h, z0.h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd z22.h, z31.h, z30.h\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "fadd z23.h, z2.h, z1.h\n"
+ "fadd z19.h, z0.h, z31.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z22.h, z30.h, z22.h\n"
"fadd z18.h, z29.h, z28.h\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
"fadd z21.h, z27.h, z21.h\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
"fadd z17.h, z26.h, z17.h\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
"fadd z20.h, z25.h, z20.h\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
"fadd z16.h, z24.h, z16.h\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"fadd z19.h, z23.h, z19.h\n"
- "ld1h { z30.h }, p2/Z, [x22, x27, LSL #1]\n"
"fadd z18.h, z22.h, z18.h\n"
- "ld1h { z29.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z30.h }, p2/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z22.h }, p2/Z, [x22, x28, LSL #1]\n"
"fadd z17.h, z21.h, z17.h\n"
- "ld1h { z28.h }, p2/Z, [x20, x27, LSL #1]\n"
"fadd z16.h, z20.h, z16.h\n"
- "ld1h { z27.h }, p1/Z, [x23, x26, LSL #1]\n"
- "fadd z7.h, z7.h, z19.h\n"
- "ld1h { z21.h }, p1/Z, [x22, x26, LSL #1]\n"
- "fadd z6.h, z6.h, z18.h\n"
- "ld1h { z26.h }, p1/Z, [x21, x26, LSL #1]\n"
- "fadd z5.h, z5.h, z17.h\n"
- "ld1h { z17.h }, p1/Z, [x20, x26, LSL #1]\n"
- "fadd z4.h, z4.h, z16.h\n"
- "ld1h { z25.h }, p0/Z, [x23, x25, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x22, x25, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x21, x25, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x20, x25, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "fadd z6.h, z6.h, z19.h\n"
+ "fadd z5.h, z5.h, z18.h\n"
+ "ld1h { z27.h }, p1/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z21.h }, p1/Z, [x22, x27, LSL #1]\n"
+ "fadd z4.h, z4.h, z17.h\n"
+ "fadd z3.h, z3.h, z16.h\n"
+ "ld1h { z26.h }, p1/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z17.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z20.h }, p0/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd z23.h, z3.h, z2.h\n"
- "fadd z19.h, z1.h, z0.h\n"
- "fadd z22.h, z31.h, z30.h\n"
+ "fadd z23.h, z2.h, z1.h\n"
+ "fadd z19.h, z0.h, z31.h\n"
+ "fadd z22.h, z30.h, z22.h\n"
"fadd z18.h, z29.h, z28.h\n"
"fadd z21.h, z27.h, z21.h\n"
"fadd z17.h, z26.h, z17.h\n"
@@ -132,100 +133,99 @@ void sve_fp16_nhwc_avg_generic_depthfirst_impl(
"fadd z18.h, z22.h, z18.h\n"
"fadd z17.h, z21.h, z17.h\n"
"fadd z16.h, z20.h, z16.h\n"
- "fadd z7.h, z7.h, z19.h\n"
- "fadd z6.h, z6.h, z18.h\n"
- "fadd z5.h, z5.h, z17.h\n"
- "fadd z4.h, z4.h, z16.h\n"
+ "fadd z6.h, z6.h, z19.h\n"
+ "fadd z5.h, z5.h, z18.h\n"
+ "fadd z4.h, z4.h, z17.h\n"
+ "fadd z3.h, z3.h, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fadd z7.h, z7.h, z3.h\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x23, x26, LSL #1]\n"
- "fadd z6.h, z6.h, z31.h\n"
- "ld1h { z25.h }, p0/Z, [x23, x25, LSL #1]\n"
- "fadd z5.h, z5.h, z27.h\n"
- "fadd z4.h, z4.h, z25.h\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z6.h, z6.h, z16.h\n"
+ "ld1h { z17.h }, p2/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z16.h }, p1/Z, [x20, x27, LSL #1]\n"
+ "fadd z5.h, z5.h, z17.h\n"
+ "fadd z4.h, z4.h, z16.h\n"
+ "ld1h { z16.h }, p0/Z, [x20, x26, LSL #1]\n"
+ "fadd z3.h, z3.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul z7.h, z7.h, z8.h\n"
- "st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
- "fmul z6.h, z6.h, z8.h\n"
+ "fmul z6.h, z6.h, z7.h\n"
+ "fmul z5.h, z5.h, z7.h\n"
+ "st1h { z6.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "fmul z4.h, z4.h, z7.h\n"
+ "fmul z3.h, z3.h, z7.h\n"
+ "st1h { z5.h }, p2, [%x[outptr], x28, LSL #1]\n"
+ "st1h { z4.h }, p1, [%x[outptr], x27, LSL #1]\n"
+ "inch x9, ALL, MUL #4\n"
"inch x28, ALL, MUL #4\n"
- "fmul z5.h, z5.h, z8.h\n"
- "st1h { z6.h }, p2, [%x[outptr], x27, LSL #1]\n"
- "fmul z4.h, z4.h, z8.h\n"
- "inch x27, ALL, MUL #4\n"
- "st1h { z5.h }, p1, [%x[outptr], x26, LSL #1]\n"
+ "st1h { z3.h }, p0, [%x[outptr], x26, LSL #1]\n"
"inch x26, ALL, MUL #4\n"
- "st1h { z4.h }, p0, [%x[outptr], x25, LSL #1]\n"
- "inch x25, ALL, MUL #4\n"
- "whilelt p0.h, x25, %x[n_channels]\n"
+ "whilelt p0.h, x26, %x[n_channels]\n"
+ "inch x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.h, x28, %x[n_channels]\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z6.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.h, z3.h, z2.h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd z19.h, z1.h, z0.h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd z19.h, z23.h, z19.h\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "fadd z7.h, z7.h, z19.h\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "fadd z17.h, z2.h, z1.h\n"
+ "fadd z16.h, z0.h, z31.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z6.h, z6.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p3/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z1.h }, p3/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.h, z3.h, z2.h\n"
- "fadd z19.h, z1.h, z0.h\n"
- "fadd z19.h, z23.h, z19.h\n"
- "fadd z7.h, z7.h, z19.h\n"
+ "fadd z17.h, z2.h, z1.h\n"
+ "fadd z16.h, z0.h, z31.h\n"
+ "fadd z16.h, z17.h, z16.h\n"
+ "fadd z6.h, z6.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fadd z7.h, z7.h, z3.h\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p3/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z6.h, z6.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul z7.h, z7.h, z8.h\n"
- "st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
- "inch x28\n"
- "whilelt p3.h, x28, %x[n_channels]\n"
+ "fmul z6.h, z6.h, z7.h\n"
+ "st1h { z6.h }, p3, [%x[outptr], x9, LSL #1]\n"
+ "inch x9\n"
+ "whilelt p3.h, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 5fb297eb49..3691b6cb28 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
void sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst
+struct sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
+ using Parent = DepthfirstStrategy<__fp16, __fp16>;
- typedef void (*kern_type)(unsigned int, const __fp16 *const *const, __fp16 *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index f6e23215b8..31bbfd085e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -63,84 +63,84 @@ void sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x14, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.h, x14, x15\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "whilelt p1.h, x13, x14\n"
- "ldp x9, x28, [x20, #0x10]\n"
- "ldp x27, x26, [x19, #0x0]\n"
- "ldp x25, x24, [x19, #0x10]\n"
- "ldp x23, x22, [x19, #0x20]\n"
- "ldp x21, x20, [x19, #0x30]\n"
- "ldr x19, [x19, #0x40]\n"
- "ld1h { z31.h }, p1/Z, [x26, x13, LSL #1]\n"
- "ld1h { z30.h }, p1/Z, [x23, x13, LSL #1]\n"
- "ld1h { z29.h }, p1/Z, [x20, x13, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x24, x13, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x27, x13, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x22, x13, LSL #1]\n"
- "ld1h { z25.h }, p1/Z, [x25, x13, LSL #1]\n"
- "ld1h { z24.h }, p1/Z, [x21, x13, LSL #1]\n"
- "ld1h { z23.h }, p1/Z, [x19, x13, LSL #1]\n"
- "incw x13\n"
- "whilelt p1.h, x13, x14\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1h { z31.h }, p0/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z30.h }, p0/Z, [x24, x14, LSL #1]\n"
+ "ld1h { z29.h }, p0/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z28.h }, p0/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z27.h }, p0/Z, [x28, x14, LSL #1]\n"
+ "ld1h { z26.h }, p0/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z25.h }, p0/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z24.h }, p0/Z, [x22, x14, LSL #1]\n"
+ "ld1h { z23.h }, p0/Z, [x20, x14, LSL #1]\n"
+ "incw x14\n"
+ "whilelt p1.h, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
"movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z30.h\n"
- "ld1h { z31.h }, p1/Z, [x26, x13, LSL #1]\n"
- "whilelt p0.h, x12, x14\n"
"movprfx z21, z30\n fmax z21.h, p2/M, z21.h, z29.h\n"
- "ld1h { z30.h }, p1/Z, [x23, x13, LSL #1]\n"
- "movprfx z18, z28\n fmax z18.h, p2/M, z18.h, z27.h\n"
- "ld1h { z29.h }, p1/Z, [x20, x13, LSL #1]\n"
- "movprfx z17, z26\n fmax z17.h, p2/M, z17.h, z25.h\n"
- "ld1h { z27.h }, p1/Z, [x27, x13, LSL #1]\n"
- "movprfx z16, z24\n fmax z16.h, p2/M, z16.h, z28.h\n"
- "ld1h { z28.h }, p1/Z, [x24, x13, LSL #1]\n"
- "movprfx z20, z26\n fmax z20.h, p2/M, z20.h, z23.h\n"
- "ld1h { z26.h }, p1/Z, [x22, x13, LSL #1]\n"
- "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
- "ld1h { z25.h }, p1/Z, [x25, x13, LSL #1]\n"
- "movprfx z18, z22\n fmax z18.h, p2/M, z18.h, z17.h\n"
- "ld1h { z24.h }, p1/Z, [x21, x13, LSL #1]\n"
- "movprfx z17, z21\n fmax z17.h, p2/M, z17.h, z16.h\n"
- "ld1h { z23.h }, p1/Z, [x19, x13, LSL #1]\n"
- "incw x13\n"
- "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
- "st1h { z19.h }, p0, [x11, x12, LSL #1]\n"
- "whilelt p1.h, x13, x14\n"
- "st1h { z18.h }, p0, [x10, x12, LSL #1]\n"
- "st1h { z17.h }, p0, [x9, x12, LSL #1]\n"
- "st1h { z16.h }, p0, [x28, x12, LSL #1]\n"
- "incw x12\n"
+ "ld1h { z31.h }, p1/Z, [x27, x14, LSL #1]\n"
+ "ld1h { z30.h }, p1/Z, [x24, x14, LSL #1]\n"
+ "movprfx z20, z28\n fmax z20.h, p2/M, z20.h, z27.h\n"
+ "movprfx z19, z26\n fmax z19.h, p2/M, z19.h, z25.h\n"
+ "ld1h { z29.h }, p1/Z, [x21, x14, LSL #1]\n"
+ "ld1h { z27.h }, p1/Z, [x28, x14, LSL #1]\n"
+ "movprfx z17, z28\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z23.h\n"
+ "ld1h { z28.h }, p1/Z, [x25, x14, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x26, x14, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x23, x14, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x22, x14, LSL #1]\n"
+ "whilelt p0.h, x11, x15\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "ld1h { z23.h }, p1/Z, [x20, x14, LSL #1]\n"
+ "incw x14\n"
+ "whilelt p1.h, x14, x15\n"
+ "st1h { z16.h }, p0, [x13, x11, LSL #1]\n"
+ "movprfx z16, z19\n fmax z16.h, p2/M, z16.h, z22.h\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
+ "st1h { z16.h }, p0, [x12, x11, LSL #1]\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z18.h\n"
+ "st1h { z17.h }, p0, [x10, x11, LSL #1]\n"
+ "st1h { z16.h }, p0, [x9, x11, LSL #1]\n"
+ "incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
"movprfx z22, z31\n fmax z22.h, p2/M, z22.h, z30.h\n"
- "whilelt p0.h, x12, x14\n"
"movprfx z21, z30\n fmax z21.h, p2/M, z21.h, z29.h\n"
- "movprfx z18, z28\n fmax z18.h, p2/M, z18.h, z27.h\n"
- "movprfx z17, z26\n fmax z17.h, p2/M, z17.h, z25.h\n"
- "movprfx z16, z24\n fmax z16.h, p2/M, z16.h, z28.h\n"
- "movprfx z20, z26\n fmax z20.h, p2/M, z20.h, z23.h\n"
- "movprfx z19, z22\n fmax z19.h, p2/M, z19.h, z18.h\n"
- "st1h { z19.h }, p0, [x11, x12, LSL #1]\n"
- "movprfx z18, z22\n fmax z18.h, p2/M, z18.h, z17.h\n"
- "movprfx z17, z21\n fmax z17.h, p2/M, z17.h, z16.h\n"
- "st1h { z18.h }, p0, [x10, x12, LSL #1]\n"
- "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z20.h\n"
- "st1h { z17.h }, p0, [x9, x12, LSL #1]\n"
- "st1h { z16.h }, p0, [x28, x12, LSL #1]\n"
+ "movprfx z20, z28\n fmax z20.h, p2/M, z20.h, z27.h\n"
+ "movprfx z19, z26\n fmax z19.h, p2/M, z19.h, z25.h\n"
+ "movprfx z17, z28\n fmax z17.h, p2/M, z17.h, z24.h\n"
+ "movprfx z18, z25\n fmax z18.h, p2/M, z18.h, z23.h\n"
+ "whilelt p0.h, x11, x15\n"
+ "movprfx z16, z22\n fmax z16.h, p2/M, z16.h, z20.h\n"
+ "st1h { z16.h }, p0, [x13, x11, LSL #1]\n"
+ "movprfx z16, z19\n fmax z16.h, p2/M, z16.h, z22.h\n"
+ "fmax z17.h, p2/M, z17.h, z21.h\n"
+ "st1h { z16.h }, p0, [x12, x11, LSL #1]\n"
+ "movprfx z16, z21\n fmax z16.h, p2/M, z16.h, z18.h\n"
+ "st1h { z17.h }, p0, [x10, x11, LSL #1]\n"
+ "st1h { z16.h }, p0, [x9, x11, LSL #1]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp
index 1c17c27619..0ef0a793cc 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
void sve_fp16_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-struct sve_fp16_nhwc_max_generic_depthfirst
+struct sve_fp16_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<__fp16, __fp16>
{
- typedef __fp16 operand_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const __fp16 *const *const inptrs, __fp16 *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_fp16_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<__fp16, __fp16>;
sve_fp16_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_fp16_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
index 58ab915605..1a01412836 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp16_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace pooling {
@@ -39,185 +40,184 @@ void sve_fp16_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cnth x27\n"
- "cnth x26, ALL, MUL #2\n"
- "cnth x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cnth x28\n"
+ "cnth x27, ALL, MUL #2\n"
+ "cnth x26, ALL, MUL #3\n"
+ "whilelt p4.h, x9, %x[n_channels]\n"
"whilelt p3.h, x28, %x[n_channels]\n"
"whilelt p2.h, x27, %x[n_channels]\n"
"whilelt p1.h, x26, %x[n_channels]\n"
- "whilelt p0.h, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.h, #0xfc00\n"
"mov z7.h, #0xfc00\n"
- "mov x19, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.h, #0xfc00\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.h, #0xfc00\n"
- "mov z4.h, #0xfc00\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z30.h }, p2/Z, [x22, x27, LSL #1]\n"
- "ld1h { z22.h }, p2/Z, [x21, x27, LSL #1]\n"
- "ld1h { z29.h }, p2/Z, [x20, x27, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x23, x26, LSL #1]\n"
- "ld1h { z27.h }, p1/Z, [x22, x26, LSL #1]\n"
- "ld1h { z21.h }, p1/Z, [x21, x26, LSL #1]\n"
- "ld1h { z26.h }, p1/Z, [x20, x26, LSL #1]\n"
- "ld1h { z16.h }, p0/Z, [x23, x25, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x22, x25, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x21, x25, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x20, x25, LSL #1]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "ld1h { z0.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z29.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "ld1h { z26.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n fmax z19.h, p4/M, z19.h, z2.h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n fmax z23.h, p4/M, z23.h, z0.h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n fmax z18.h, p4/M, z18.h, z30.h\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fmax z22.h, p4/M, z22.h, z29.h\n"
- "movprfx z17, z28\n fmax z17.h, p4/M, z17.h, z27.h\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "fmax z21.h, p4/M, z21.h, z26.h\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "fmax z16.h, p4/M, z16.h, z25.h\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
- "fmax z20.h, p4/M, z20.h, z24.h\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "fmax z19.h, p4/M, z19.h, z23.h\n"
- "ld1h { z30.h }, p2/Z, [x22, x27, LSL #1]\n"
- "fmax z18.h, p4/M, z18.h, z22.h\n"
- "ld1h { z22.h }, p2/Z, [x21, x27, LSL #1]\n"
- "fmax z17.h, p4/M, z17.h, z21.h\n"
- "ld1h { z29.h }, p2/Z, [x20, x27, LSL #1]\n"
- "fmax z16.h, p4/M, z16.h, z20.h\n"
- "ld1h { z28.h }, p1/Z, [x23, x26, LSL #1]\n"
- "fmax z7.h, p4/M, z7.h, z19.h\n"
- "ld1h { z27.h }, p1/Z, [x22, x26, LSL #1]\n"
- "fmax z6.h, p4/M, z6.h, z18.h\n"
- "ld1h { z21.h }, p1/Z, [x21, x26, LSL #1]\n"
- "fmax z5.h, p4/M, z5.h, z17.h\n"
- "ld1h { z26.h }, p1/Z, [x20, x26, LSL #1]\n"
- "fmax z4.h, p4/M, z4.h, z16.h\n"
- "ld1h { z16.h }, p0/Z, [x23, x25, LSL #1]\n"
- "ld1h { z25.h }, p0/Z, [x22, x25, LSL #1]\n"
- "ld1h { z20.h }, p0/Z, [x21, x25, LSL #1]\n"
- "ld1h { z24.h }, p0/Z, [x20, x25, LSL #1]\n"
+ "movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
+ "movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n fmax z18.h, p0/M, z18.h, z31.h\n"
+ "fmax z22.h, p0/M, z22.h, z30.h\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "movprfx z17, z29\n fmax z17.h, p0/M, z17.h, z28.h\n"
+ "fmax z21.h, p0/M, z21.h, z27.h\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "movprfx z16, z26\n fmax z16.h, p0/M, z16.h, z25.h\n"
+ "fmax z20.h, p0/M, z20.h, z24.h\n"
+ "ld1h { z0.h }, p3/Z, [x23, x28, LSL #1]\n"
+ "ld1h { z31.h }, p3/Z, [x22, x28, LSL #1]\n"
+ "fmax z19.h, p0/M, z19.h, z23.h\n"
+ "fmax z18.h, p0/M, z18.h, z22.h\n"
+ "ld1h { z22.h }, p3/Z, [x21, x28, LSL #1]\n"
+ "ld1h { z30.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "fmax z17.h, p0/M, z17.h, z21.h\n"
+ "fmax z16.h, p0/M, z16.h, z20.h\n"
+ "ld1h { z29.h }, p2/Z, [x23, x27, LSL #1]\n"
+ "ld1h { z28.h }, p2/Z, [x22, x27, LSL #1]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z8.h, p0/M, z8.h, z19.h\n"
+ "ld1h { z21.h }, p2/Z, [x21, x27, LSL #1]\n"
+ "ld1h { z27.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "fmax z7.h, p0/M, z7.h, z18.h\n"
+ "fmax z6.h, p0/M, z6.h, z17.h\n"
+ "ld1h { z26.h }, p1/Z, [x23, x26, LSL #1]\n"
+ "ld1h { z25.h }, p1/Z, [x22, x26, LSL #1]\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z20.h }, p1/Z, [x21, x26, LSL #1]\n"
+ "ld1h { z24.h }, p1/Z, [x20, x26, LSL #1]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n fmax z19.h, p4/M, z19.h, z2.h\n"
- "movprfx z23, z1\n fmax z23.h, p4/M, z23.h, z0.h\n"
- "movprfx z18, z31\n fmax z18.h, p4/M, z18.h, z30.h\n"
- "fmax z22.h, p4/M, z22.h, z29.h\n"
- "movprfx z17, z28\n fmax z17.h, p4/M, z17.h, z27.h\n"
- "fmax z21.h, p4/M, z21.h, z26.h\n"
- "fmax z16.h, p4/M, z16.h, z25.h\n"
- "fmax z20.h, p4/M, z20.h, z24.h\n"
- "fmax z19.h, p4/M, z19.h, z23.h\n"
- "fmax z18.h, p4/M, z18.h, z22.h\n"
- "fmax z17.h, p4/M, z17.h, z21.h\n"
- "fmax z16.h, p4/M, z16.h, z20.h\n"
- "fmax z7.h, p4/M, z7.h, z19.h\n"
- "fmax z6.h, p4/M, z6.h, z18.h\n"
- "fmax z5.h, p4/M, z5.h, z17.h\n"
- "fmax z4.h, p4/M, z4.h, z16.h\n"
+ "movprfx z19, z4\n fmax z19.h, p0/M, z19.h, z3.h\n"
+ "movprfx z23, z2\n fmax z23.h, p0/M, z23.h, z1.h\n"
+ "movprfx z18, z0\n fmax z18.h, p0/M, z18.h, z31.h\n"
+ "fmax z22.h, p0/M, z22.h, z30.h\n"
+ "movprfx z17, z29\n fmax z17.h, p0/M, z17.h, z28.h\n"
+ "fmax z21.h, p0/M, z21.h, z27.h\n"
+ "movprfx z16, z26\n fmax z16.h, p0/M, z16.h, z25.h\n"
+ "fmax z20.h, p0/M, z20.h, z24.h\n"
+ "fmax z19.h, p0/M, z19.h, z23.h\n"
+ "fmax z18.h, p0/M, z18.h, z22.h\n"
+ "fmax z17.h, p0/M, z17.h, z21.h\n"
+ "fmax z16.h, p0/M, z16.h, z20.h\n"
+ "fmax z8.h, p0/M, z8.h, z19.h\n"
+ "fmax z7.h, p0/M, z7.h, z18.h\n"
+ "fmax z6.h, p0/M, z6.h, z17.h\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fmax z7.h, p4/M, z7.h, z3.h\n"
- "ld1h { z31.h }, p2/Z, [x23, x27, LSL #1]\n"
- "ld1h { z28.h }, p1/Z, [x23, x26, LSL #1]\n"
- "fmax z6.h, p4/M, z6.h, z31.h\n"
- "ld1h { z16.h }, p0/Z, [x23, x25, LSL #1]\n"
- "fmax z5.h, p4/M, z5.h, z28.h\n"
- "fmax z4.h, p4/M, z4.h, z16.h\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
+ "ld1h { z17.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "ld1h { z16.h }, p2/Z, [x20, x27, LSL #1]\n"
+ "fmax z7.h, p0/M, z7.h, z17.h\n"
+ "fmax z6.h, p0/M, z6.h, z16.h\n"
+ "ld1h { z16.h }, p1/Z, [x20, x26, LSL #1]\n"
+ "fmax z5.h, p0/M, z5.h, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
+ "st1h { z8.h }, p4, [%x[outptr], x9, LSL #1]\n"
+ "inch x9, ALL, MUL #4\n"
"st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
"inch x28, ALL, MUL #4\n"
"st1h { z6.h }, p2, [%x[outptr], x27, LSL #1]\n"
"inch x27, ALL, MUL #4\n"
"st1h { z5.h }, p1, [%x[outptr], x26, LSL #1]\n"
"inch x26, ALL, MUL #4\n"
- "st1h { z4.h }, p0, [%x[outptr], x25, LSL #1]\n"
- "inch x25, ALL, MUL #4\n"
- "whilelt p0.h, x25, %x[n_channels]\n"
+ "whilelt p1.h, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.h, x28, %x[n_channels]\n"
+ "whilelt p4.h, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.h, #0xfc00\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.h, #0xfc00\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n fmax z19.h, p4/M, z19.h, z2.h\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n fmax z23.h, p4/M, z23.h, z0.h\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax z19.h, p4/M, z19.h, z23.h\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "ld1h { z2.h }, p3/Z, [x22, x28, LSL #1]\n"
- "fmax z7.h, p4/M, z7.h, z19.h\n"
- "ld1h { z1.h }, p3/Z, [x21, x28, LSL #1]\n"
- "ld1h { z0.h }, p3/Z, [x20, x28, LSL #1]\n"
+ "movprfx z16, z4\n fmax z16.h, p0/M, z16.h, z3.h\n"
+ "movprfx z17, z2\n fmax z17.h, p0/M, z17.h, z1.h\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "subs x25, x25, #0x1\n"
+ "ld1h { z4.h }, p4/Z, [x23, x9, LSL #1]\n"
+ "ld1h { z3.h }, p4/Z, [x22, x9, LSL #1]\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
+ "add x24, x24, #0x20\n"
+ "ld1h { z2.h }, p4/Z, [x21, x9, LSL #1]\n"
+ "ld1h { z1.h }, p4/Z, [x20, x9, LSL #1]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n fmax z19.h, p4/M, z19.h, z2.h\n"
- "movprfx z23, z1\n fmax z23.h, p4/M, z23.h, z0.h\n"
- "fmax z19.h, p4/M, z19.h, z23.h\n"
- "fmax z7.h, p4/M, z7.h, z19.h\n"
+ "movprfx z16, z4\n fmax z16.h, p0/M, z16.h, z3.h\n"
+ "movprfx z17, z2\n fmax z17.h, p0/M, z17.h, z1.h\n"
+ "fmax z16.h, p0/M, z16.h, z17.h\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1h { z3.h }, p3/Z, [x23, x28, LSL #1]\n"
- "fmax z7.h, p4/M, z7.h, z3.h\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1h { z16.h }, p4/Z, [x20, x9, LSL #1]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z8.h, p0/M, z8.h, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1h { z7.h }, p3, [%x[outptr], x28, LSL #1]\n"
- "inch x28\n"
- "whilelt p3.h, x28, %x[n_channels]\n"
+ "st1h { z8.h }, p4, [%x[outptr], x9, LSL #1]\n"
+ "inch x9\n"
+ "whilelt p4.h, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
index 9cbdb8a58d..d5578d617f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst
+struct sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
+ using Parent = DepthfirstStrategy<float, float>;
- typedef void (*kern_type)(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::AVERAGE;
+ const static auto pool_rows = 3u, pool_cols = 3u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
+ sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 3; }
- constexpr static unsigned int pool_cols(void) { return 3; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl;
-
- sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
index 50f5da4c3d..c5ea5adea0 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -82,126 +82,126 @@ void sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x3, [%x[args], %[offsetof_n_channels]]\n"
- "mov x4, #0x0\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x5, #0x0\n"
- "ldr x6, [%x[args], %[offsetof_inptrs]]\n"
- "mov x19, #0x4\n"
- "add x7, %x[args], %[offsetof_rescale]\n"
- "ldp x8, x17, [x20, #0x0]\n"
- "ldp x16, x15, [x20, #0x10]\n"
- "whilelt p0.s, XZR, x19\n"
- "ldp x14, x13, [x6, #0x0]\n"
- "whilelt p1.s, x4, x3\n"
- "ldp x12, x11, [x6, #0x10]\n"
- "ldp x10, x9, [x6, #0x20]\n"
- "ldp x28, x27, [x6, #0x30]\n"
- "ldp x26, x25, [x6, #0x40]\n"
- "ldp x24, x23, [x6, #0x50]\n"
- "ldp x22, x21, [x6, #0x60]\n"
- "ldp x20, x19, [x6, #0x70]\n"
- "ld1rqw { z7.s }, p0/Z, [x7]\n"
- "ld1w { z8.s }, p1/Z, [x9, x4, LSL #2]\n"
- "ld1w { z6.s }, p1/Z, [x28, x4, LSL #2]\n"
- "ld1w { z5.s }, p1/Z, [x25, x4, LSL #2]\n"
- "ld1w { z4.s }, p1/Z, [x24, x4, LSL #2]\n"
- "ld1w { z3.s }, p1/Z, [x13, x4, LSL #2]\n"
- "ld1w { z2.s }, p1/Z, [x12, x4, LSL #2]\n"
- "ld1w { z1.s }, p1/Z, [x10, x4, LSL #2]\n"
- "ld1w { z0.s }, p1/Z, [x26, x4, LSL #2]\n"
- "ld1w { z31.s }, p1/Z, [x27, x4, LSL #2]\n"
- "ld1w { z30.s }, p1/Z, [x23, x4, LSL #2]\n"
- "ld1w { z29.s }, p1/Z, [x21, x4, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x20, x4, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x14, x4, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x11, x4, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x22, x4, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x19, x4, LSL #2]\n"
- "incw x4\n"
- "whilelt p1.s, x4, x3\n"
+ "ldr x2, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x3, #0x0\n"
+ "mov x20, #0x4\n"
+ "ldr x4, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x5, x6, [x21, #0x0]\n"
+ "whilelt p2.s, XZR, x20\n"
+ "whilelt p0.s, x3, x2\n"
+ "ldp x7, x8, [x21, #0x10]\n"
+ "ldp x17, x16, [x4, #0x0]\n"
+ "add x15, %x[args], %[offsetof_rescale]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x4, #0x10]\n"
+ "ldp x11, x10, [x4, #0x20]\n"
+ "ldp x9, x28, [x4, #0x30]\n"
+ "ldp x27, x26, [x4, #0x40]\n"
+ "ldp x25, x24, [x4, #0x50]\n"
+ "ldp x23, x22, [x4, #0x60]\n"
+ "ldp x21, x20, [x4, #0x70]\n"
+ "ld1w { z7.s }, p0/Z, [x10, x3, LSL #2]\n"
+ "ld1w { z6.s }, p0/Z, [x9, x3, LSL #2]\n"
+ "ld1w { z5.s }, p0/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z4.s }, p0/Z, [x25, x3, LSL #2]\n"
+ "ld1w { z3.s }, p0/Z, [x16, x3, LSL #2]\n"
+ "ld1w { z2.s }, p0/Z, [x13, x3, LSL #2]\n"
+ "ld1w { z1.s }, p0/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z31.s }, p0/Z, [x27, x3, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x24, x3, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x21, x3, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x17, x3, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x12, x3, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x3, LSL #2]\n"
+ "incw x3\n"
+ "whilelt p1.s, x3, x2\n"
+ "ld1rqw { z0.s }, p2/Z, [x15]\n"
"b.none 2f\n"
"1:" // Vector: Loop
- "fadd z17.s, z8.s, z6.s\n"
- "ld1w { z8.s }, p1/Z, [x9, x4, LSL #2]\n"
- "whilelt p0.s, x5, x3\n"
+ "fadd z17.s, z7.s, z6.s\n"
"fadd z16.s, z5.s, z4.s\n"
- "ld1w { z6.s }, p1/Z, [x28, x4, LSL #2]\n"
+ "ld1w { z7.s }, p1/Z, [x10, x3, LSL #2]\n"
+ "ld1w { z6.s }, p1/Z, [x9, x3, LSL #2]\n"
+ "fadd z19.s, z17.s, z16.s\n"
"fadd z18.s, z3.s, z2.s\n"
- "ld1w { z5.s }, p1/Z, [x25, x4, LSL #2]\n"
- "fadd z23.s, z1.s, z0.s\n"
- "ld1w { z4.s }, p1/Z, [x24, x4, LSL #2]\n"
- "fadd z22.s, z31.s, z30.s\n"
- "ld1w { z3.s }, p1/Z, [x13, x4, LSL #2]\n"
- "fadd z17.s, z17.s, z16.s\n"
- "ld1w { z2.s }, p1/Z, [x12, x4, LSL #2]\n"
- "fadd z16.s, z29.s, z28.s\n"
- "ld1w { z1.s }, p1/Z, [x10, x4, LSL #2]\n"
- "fadd z19.s, z27.s, z23.s\n"
- "ld1w { z0.s }, p1/Z, [x26, x4, LSL #2]\n"
- "fadd z21.s, z18.s, z17.s\n"
- "ld1w { z31.s }, p1/Z, [x27, x4, LSL #2]\n"
- "fadd z20.s, z16.s, z17.s\n"
- "ld1w { z30.s }, p1/Z, [x23, x4, LSL #2]\n"
- "fadd z18.s, z26.s, z22.s\n"
- "ld1w { z29.s }, p1/Z, [x21, x4, LSL #2]\n"
- "fadd z17.s, z25.s, z23.s\n"
- "ld1w { z28.s }, p1/Z, [x20, x4, LSL #2]\n"
- "fadd z16.s, z24.s, z22.s\n"
- "ld1w { z27.s }, p1/Z, [x14, x4, LSL #2]\n"
+ "ld1w { z5.s }, p1/Z, [x26, x3, LSL #2]\n"
+ "ld1w { z4.s }, p1/Z, [x25, x3, LSL #2]\n"
+ "fadd z17.s, z1.s, z31.s\n"
+ "fadd z22.s, z30.s, z29.s\n"
+ "ld1w { z3.s }, p1/Z, [x16, x3, LSL #2]\n"
+ "ld1w { z2.s }, p1/Z, [x13, x3, LSL #2]\n"
+ "fadd z16.s, z28.s, z27.s\n"
+ "fadd z21.s, z18.s, z19.s\n"
+ "ld1w { z1.s }, p1/Z, [x11, x3, LSL #2]\n"
+ "ld1w { z31.s }, p1/Z, [x27, x3, LSL #2]\n"
+ "fadd z20.s, z16.s, z19.s\n"
+ "fadd z19.s, z26.s, z17.s\n"
+ "ld1w { z30.s }, p1/Z, [x28, x3, LSL #2]\n"
+ "ld1w { z29.s }, p1/Z, [x24, x3, LSL #2]\n"
+ "fadd z18.s, z25.s, z22.s\n"
+ "fadd z17.s, z24.s, z17.s\n"
+ "ld1w { z28.s }, p1/Z, [x22, x3, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x21, x3, LSL #2]\n"
+ "fadd z16.s, z23.s, z22.s\n"
+ "ld1w { z26.s }, p1/Z, [x17, x3, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x12, x3, LSL #2]\n"
"fadd z19.s, z21.s, z19.s\n"
- "ld1w { z26.s }, p1/Z, [x11, x4, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x23, x3, LSL #2]\n"
+ "ld1w { z23.s }, p1/Z, [x20, x3, LSL #2]\n"
+ "incw x3\n"
"fadd z18.s, z21.s, z18.s\n"
- "ld1w { z25.s }, p1/Z, [x22, x4, LSL #2]\n"
"fadd z17.s, z17.s, z20.s\n"
- "ld1w { z24.s }, p1/Z, [x19, x4, LSL #2]\n"
- "incw x4\n"
- "fadd z16.s, z20.s, z16.s\n"
- "whilelt p1.s, x4, x3\n"
- "fmul z19.s, z19.s, z7.s[0]\n"
- "st1w { z19.s }, p0, [x8, x5, LSL #2]\n"
- "fmul z18.s, z18.s, z7.s[1]\n"
- "fmul z17.s, z17.s, z7.s[2]\n"
- "st1w { z18.s }, p0, [x17, x5, LSL #2]\n"
- "fmul z16.s, z16.s, z7.s[3]\n"
- "st1w { z17.s }, p0, [x16, x5, LSL #2]\n"
- "st1w { z16.s }, p0, [x15, x5, LSL #2]\n"
- "incw x5\n"
+ "fadd z16.s, z16.s, z20.s\n"
+ "whilelt p0.s, x14, x2\n"
+ "whilelt p1.s, x3, x2\n"
+ "fmul z19.s, z19.s, z0.s[0]\n"
+ "fmul z18.s, z18.s, z0.s[1]\n"
+ "st1w { z19.s }, p0, [x5, x14, LSL #2]\n"
+ "fmul z17.s, z17.s, z0.s[2]\n"
+ "fmul z16.s, z16.s, z0.s[3]\n"
+ "st1w { z18.s }, p0, [x6, x14, LSL #2]\n"
+ "st1w { z17.s }, p0, [x7, x14, LSL #2]\n"
+ "st1w { z16.s }, p0, [x8, x14, LSL #2]\n"
+ "incw x14\n"
"b.any 1b\n"
"2:" // Vector: Tail
- "fadd z17.s, z8.s, z6.s\n"
- "whilelt p0.s, x5, x3\n"
+ "fadd z17.s, z7.s, z6.s\n"
"fadd z16.s, z5.s, z4.s\n"
+ "whilelt p0.s, x14, x2\n"
+ "fadd z20.s, z17.s, z16.s\n"
"fadd z18.s, z3.s, z2.s\n"
- "fadd z23.s, z1.s, z0.s\n"
- "fadd z17.s, z17.s, z16.s\n"
- "fadd z22.s, z31.s, z30.s\n"
- "fadd z16.s, z29.s, z28.s\n"
- "fadd z21.s, z18.s, z17.s\n"
- "fadd z19.s, z27.s, z23.s\n"
- "fadd z20.s, z16.s, z17.s\n"
- "fadd z18.s, z26.s, z22.s\n"
- "fadd z17.s, z25.s, z23.s\n"
- "fadd z16.s, z24.s, z22.s\n"
- "fadd z19.s, z21.s, z19.s\n"
+ "fadd z17.s, z1.s, z31.s\n"
+ "fadd z19.s, z30.s, z29.s\n"
+ "fadd z16.s, z28.s, z27.s\n"
+ "fadd z21.s, z18.s, z20.s\n"
+ "fadd z20.s, z16.s, z20.s\n"
+ "fadd z16.s, z26.s, z17.s\n"
+ "fadd z18.s, z25.s, z19.s\n"
+ "fadd z17.s, z24.s, z17.s\n"
+ "fadd z19.s, z23.s, z19.s\n"
+ "fadd z16.s, z21.s, z16.s\n"
+ "fmul z16.s, z16.s, z0.s[0]\n"
+ "st1w { z16.s }, p0, [x5, x14, LSL #2]\n"
"fadd z18.s, z21.s, z18.s\n"
"fadd z17.s, z17.s, z20.s\n"
- "fadd z16.s, z20.s, z16.s\n"
- "fmul z19.s, z19.s, z7.s[0]\n"
- "st1w { z19.s }, p0, [x8, x5, LSL #2]\n"
- "fmul z18.s, z18.s, z7.s[1]\n"
- "fmul z17.s, z17.s, z7.s[2]\n"
- "st1w { z18.s }, p0, [x17, x5, LSL #2]\n"
- "fmul z16.s, z16.s, z7.s[3]\n"
- "st1w { z17.s }, p0, [x16, x5, LSL #2]\n"
- "st1w { z16.s }, p0, [x15, x5, LSL #2]\n"
+ "fmul z18.s, z18.s, z0.s[1]\n"
+ "fmul z17.s, z17.s, z0.s[2]\n"
+ "fadd z16.s, z19.s, z20.s\n"
+ "fmul z16.s, z16.s, z0.s[3]\n"
+ "st1w { z18.s }, p0, [x6, x14, LSL #2]\n"
+ "st1w { z17.s }, p0, [x7, x14, LSL #2]\n"
+ "st1w { z16.s }, p0, [x8, x14, LSL #2]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs)), [offsetof_rescale] "I" (offsetof(KernelArgs, rescale_vals))
- : "cc", "memory", "p0", "p1", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp
index 0daa046a02..a9e6b034e7 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_fp32_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-struct sve_fp32_nhwc_avg_generic_depthfirst
+struct sve_fp32_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_fp32_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<float, float>;
sve_fp32_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_fp32_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
index c2f5745adc..7c94894892 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -41,88 +42,88 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
const auto rescale_value = static_cast<float>(1.0f / static_cast<float>(window_cells));
__asm__ __volatile__(
+ "mov x9, #0x0\n"
+ "cntw x28\n"
+ "cntw x27, ALL, MUL #2\n"
+ "cntw x26, ALL, MUL #3\n"
"ptrue p0.b\n"
- "ld1rw { z8.s }, p0/Z, [%x[rescale_ptr]]\n"
- "mov x28, #0x0\n"
- "cntw x27\n"
- "cntw x26, ALL, MUL #2\n"
- "cntw x25, ALL, MUL #3\n"
- "whilelt p3.s, x28, %x[n_channels]\n"
- "whilelt p2.s, x27, %x[n_channels]\n"
- "whilelt p1.s, x26, %x[n_channels]\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
+ "ld1rw { z7.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "whilelt p2.s, x28, %x[n_channels]\n"
+ "whilelt p1.s, x27, %x[n_channels]\n"
+ "whilelt p0.s, x26, %x[n_channels]\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z6.b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
"mov z4.b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z30.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z28.s }, p2/Z, [x20, x27, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x21, x26, LSL #2]\n"
- "ld1w { z17.s }, p1/Z, [x20, x26, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x23, x25, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x22, x25, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x21, x25, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x20, x25, LSL #2]\n"
+ "mov z3.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z30.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "fadd z23.s, z3.s, z2.s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd z19.s, z1.s, z0.s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd z22.s, z31.s, z30.s\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "fadd z23.s, z2.s, z1.s\n"
+ "fadd z19.s, z0.s, z31.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z22.s, z30.s, z22.s\n"
"fadd z18.s, z29.s, z28.s\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
"fadd z21.s, z27.s, z21.s\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
"fadd z17.s, z26.s, z17.s\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
"fadd z20.s, z25.s, z20.s\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
"fadd z16.s, z24.s, z16.s\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"fadd z19.s, z23.s, z19.s\n"
- "ld1w { z30.s }, p2/Z, [x22, x27, LSL #2]\n"
"fadd z18.s, z22.s, z18.s\n"
- "ld1w { z29.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z30.s }, p2/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z22.s }, p2/Z, [x22, x28, LSL #2]\n"
"fadd z17.s, z21.s, z17.s\n"
- "ld1w { z28.s }, p2/Z, [x20, x27, LSL #2]\n"
"fadd z16.s, z20.s, z16.s\n"
- "ld1w { z27.s }, p1/Z, [x23, x26, LSL #2]\n"
- "fadd z7.s, z7.s, z19.s\n"
- "ld1w { z21.s }, p1/Z, [x22, x26, LSL #2]\n"
- "fadd z6.s, z6.s, z18.s\n"
- "ld1w { z26.s }, p1/Z, [x21, x26, LSL #2]\n"
- "fadd z5.s, z5.s, z17.s\n"
- "ld1w { z17.s }, p1/Z, [x20, x26, LSL #2]\n"
- "fadd z4.s, z4.s, z16.s\n"
- "ld1w { z25.s }, p0/Z, [x23, x25, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x22, x25, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x21, x25, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x20, x25, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "fadd z6.s, z6.s, z19.s\n"
+ "fadd z5.s, z5.s, z18.s\n"
+ "ld1w { z27.s }, p1/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z21.s }, p1/Z, [x22, x27, LSL #2]\n"
+ "fadd z4.s, z4.s, z17.s\n"
+ "fadd z3.s, z3.s, z16.s\n"
+ "ld1w { z26.s }, p1/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z17.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z20.s }, p0/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "fadd z23.s, z3.s, z2.s\n"
- "fadd z19.s, z1.s, z0.s\n"
- "fadd z22.s, z31.s, z30.s\n"
+ "fadd z23.s, z2.s, z1.s\n"
+ "fadd z19.s, z0.s, z31.s\n"
+ "fadd z22.s, z30.s, z22.s\n"
"fadd z18.s, z29.s, z28.s\n"
"fadd z21.s, z27.s, z21.s\n"
"fadd z17.s, z26.s, z17.s\n"
@@ -132,100 +133,99 @@ void sve_fp32_nhwc_avg_generic_depthfirst_impl(
"fadd z18.s, z22.s, z18.s\n"
"fadd z17.s, z21.s, z17.s\n"
"fadd z16.s, z20.s, z16.s\n"
- "fadd z7.s, z7.s, z19.s\n"
- "fadd z6.s, z6.s, z18.s\n"
- "fadd z5.s, z5.s, z17.s\n"
- "fadd z4.s, z4.s, z16.s\n"
+ "fadd z6.s, z6.s, z19.s\n"
+ "fadd z5.s, z5.s, z18.s\n"
+ "fadd z4.s, z4.s, z17.s\n"
+ "fadd z3.s, z3.s, z16.s\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fadd z7.s, z7.s, z3.s\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x23, x26, LSL #2]\n"
- "fadd z6.s, z6.s, z31.s\n"
- "ld1w { z25.s }, p0/Z, [x23, x25, LSL #2]\n"
- "fadd z5.s, z5.s, z27.s\n"
- "fadd z4.s, z4.s, z25.s\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z6.s, z6.s, z16.s\n"
+ "ld1w { z17.s }, p2/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z16.s }, p1/Z, [x20, x27, LSL #2]\n"
+ "fadd z5.s, z5.s, z17.s\n"
+ "fadd z4.s, z4.s, z16.s\n"
+ "ld1w { z16.s }, p0/Z, [x20, x26, LSL #2]\n"
+ "fadd z3.s, z3.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "fmul z7.s, z7.s, z8.s\n"
- "st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
- "fmul z6.s, z6.s, z8.s\n"
+ "fmul z6.s, z6.s, z7.s\n"
+ "fmul z5.s, z5.s, z7.s\n"
+ "st1w { z6.s }, p3, [%x[outptr], x9, LSL #2]\n"
+ "fmul z4.s, z4.s, z7.s\n"
+ "fmul z3.s, z3.s, z7.s\n"
+ "st1w { z5.s }, p2, [%x[outptr], x28, LSL #2]\n"
+ "st1w { z4.s }, p1, [%x[outptr], x27, LSL #2]\n"
+ "incw x9, ALL, MUL #4\n"
"incw x28, ALL, MUL #4\n"
- "fmul z5.s, z5.s, z8.s\n"
- "st1w { z6.s }, p2, [%x[outptr], x27, LSL #2]\n"
- "fmul z4.s, z4.s, z8.s\n"
- "incw x27, ALL, MUL #4\n"
- "st1w { z5.s }, p1, [%x[outptr], x26, LSL #2]\n"
+ "st1w { z3.s }, p0, [%x[outptr], x26, LSL #2]\n"
"incw x26, ALL, MUL #4\n"
- "st1w { z4.s }, p0, [%x[outptr], x25, LSL #2]\n"
- "incw x25, ALL, MUL #4\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
+ "whilelt p0.s, x26, %x[n_channels]\n"
+ "incw x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.s, x28, %x[n_channels]\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z6.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "fadd z23.s, z3.s, z2.s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "fadd z19.s, z1.s, z0.s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fadd z19.s, z23.s, z19.s\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "fadd z7.s, z7.s, z19.s\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "fadd z17.s, z2.s, z1.s\n"
+ "fadd z16.s, z0.s, z31.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "subs x25, x25, #0x1\n"
+ "fadd z6.s, z6.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p3/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z1.s }, p3/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "fadd z23.s, z3.s, z2.s\n"
- "fadd z19.s, z1.s, z0.s\n"
- "fadd z19.s, z23.s, z19.s\n"
- "fadd z7.s, z7.s, z19.s\n"
+ "fadd z17.s, z2.s, z1.s\n"
+ "fadd z16.s, z0.s, z31.s\n"
+ "fadd z16.s, z17.s, z16.s\n"
+ "fadd z6.s, z6.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fadd z7.s, z7.s, z3.s\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p3/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fadd z6.s, z6.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "fmul z7.s, z7.s, z8.s\n"
- "st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
- "incw x28\n"
- "whilelt p3.s, x28, %x[n_channels]\n"
+ "fmul z6.s, z6.s, z7.s\n"
+ "st1w { z6.s }, p3, [%x[outptr], x9, LSL #2]\n"
+ "incw x9\n"
+ "whilelt p3.s, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 086f49e957..b97e3623c4 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst
+struct sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
+ using Parent = DepthfirstStrategy<float, float>;
- typedef void (*kern_type)(unsigned int, const float *const *const, float *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index 250cc24226..d9cebd1363 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -63,84 +63,84 @@ void sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x14, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.s, x14, x15\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "whilelt p1.s, x13, x14\n"
- "ldp x9, x28, [x20, #0x10]\n"
- "ldp x27, x26, [x19, #0x0]\n"
- "ldp x25, x24, [x19, #0x10]\n"
- "ldp x23, x22, [x19, #0x20]\n"
- "ldp x21, x20, [x19, #0x30]\n"
- "ldr x19, [x19, #0x40]\n"
- "ld1w { z31.s }, p1/Z, [x26, x13, LSL #2]\n"
- "ld1w { z30.s }, p1/Z, [x23, x13, LSL #2]\n"
- "ld1w { z29.s }, p1/Z, [x20, x13, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x24, x13, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x27, x13, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x22, x13, LSL #2]\n"
- "ld1w { z25.s }, p1/Z, [x25, x13, LSL #2]\n"
- "ld1w { z24.s }, p1/Z, [x21, x13, LSL #2]\n"
- "ld1w { z23.s }, p1/Z, [x19, x13, LSL #2]\n"
- "incw x13\n"
- "whilelt p1.s, x13, x14\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1w { z31.s }, p0/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z30.s }, p0/Z, [x24, x14, LSL #2]\n"
+ "ld1w { z29.s }, p0/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z28.s }, p0/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z27.s }, p0/Z, [x28, x14, LSL #2]\n"
+ "ld1w { z26.s }, p0/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z25.s }, p0/Z, [x23, x14, LSL #2]\n"
+ "ld1w { z24.s }, p0/Z, [x22, x14, LSL #2]\n"
+ "ld1w { z23.s }, p0/Z, [x20, x14, LSL #2]\n"
+ "incw x14\n"
+ "whilelt p1.s, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
"movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z30.s\n"
- "ld1w { z31.s }, p1/Z, [x26, x13, LSL #2]\n"
- "whilelt p0.s, x12, x14\n"
"movprfx z21, z30\n fmax z21.s, p2/M, z21.s, z29.s\n"
- "ld1w { z30.s }, p1/Z, [x23, x13, LSL #2]\n"
- "movprfx z18, z28\n fmax z18.s, p2/M, z18.s, z27.s\n"
- "ld1w { z29.s }, p1/Z, [x20, x13, LSL #2]\n"
- "movprfx z17, z26\n fmax z17.s, p2/M, z17.s, z25.s\n"
- "ld1w { z27.s }, p1/Z, [x27, x13, LSL #2]\n"
- "movprfx z16, z24\n fmax z16.s, p2/M, z16.s, z28.s\n"
- "ld1w { z28.s }, p1/Z, [x24, x13, LSL #2]\n"
- "movprfx z20, z26\n fmax z20.s, p2/M, z20.s, z23.s\n"
- "ld1w { z26.s }, p1/Z, [x22, x13, LSL #2]\n"
- "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
- "ld1w { z25.s }, p1/Z, [x25, x13, LSL #2]\n"
- "movprfx z18, z22\n fmax z18.s, p2/M, z18.s, z17.s\n"
- "ld1w { z24.s }, p1/Z, [x21, x13, LSL #2]\n"
- "movprfx z17, z21\n fmax z17.s, p2/M, z17.s, z16.s\n"
- "ld1w { z23.s }, p1/Z, [x19, x13, LSL #2]\n"
- "incw x13\n"
- "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
- "st1w { z19.s }, p0, [x11, x12, LSL #2]\n"
- "whilelt p1.s, x13, x14\n"
- "st1w { z18.s }, p0, [x10, x12, LSL #2]\n"
- "st1w { z17.s }, p0, [x9, x12, LSL #2]\n"
- "st1w { z16.s }, p0, [x28, x12, LSL #2]\n"
- "incw x12\n"
+ "ld1w { z31.s }, p1/Z, [x27, x14, LSL #2]\n"
+ "ld1w { z30.s }, p1/Z, [x24, x14, LSL #2]\n"
+ "movprfx z20, z28\n fmax z20.s, p2/M, z20.s, z27.s\n"
+ "movprfx z19, z26\n fmax z19.s, p2/M, z19.s, z25.s\n"
+ "ld1w { z29.s }, p1/Z, [x21, x14, LSL #2]\n"
+ "ld1w { z27.s }, p1/Z, [x28, x14, LSL #2]\n"
+ "movprfx z17, z28\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z23.s\n"
+ "ld1w { z28.s }, p1/Z, [x25, x14, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x26, x14, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x23, x14, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x22, x14, LSL #2]\n"
+ "whilelt p0.s, x11, x15\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "ld1w { z23.s }, p1/Z, [x20, x14, LSL #2]\n"
+ "incw x14\n"
+ "whilelt p1.s, x14, x15\n"
+ "st1w { z16.s }, p0, [x13, x11, LSL #2]\n"
+ "movprfx z16, z19\n fmax z16.s, p2/M, z16.s, z22.s\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
+ "st1w { z16.s }, p0, [x12, x11, LSL #2]\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z18.s\n"
+ "st1w { z17.s }, p0, [x10, x11, LSL #2]\n"
+ "st1w { z16.s }, p0, [x9, x11, LSL #2]\n"
+ "incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
"movprfx z22, z31\n fmax z22.s, p2/M, z22.s, z30.s\n"
- "whilelt p0.s, x12, x14\n"
"movprfx z21, z30\n fmax z21.s, p2/M, z21.s, z29.s\n"
- "movprfx z18, z28\n fmax z18.s, p2/M, z18.s, z27.s\n"
- "movprfx z17, z26\n fmax z17.s, p2/M, z17.s, z25.s\n"
- "movprfx z16, z24\n fmax z16.s, p2/M, z16.s, z28.s\n"
- "movprfx z20, z26\n fmax z20.s, p2/M, z20.s, z23.s\n"
- "movprfx z19, z22\n fmax z19.s, p2/M, z19.s, z18.s\n"
- "st1w { z19.s }, p0, [x11, x12, LSL #2]\n"
- "movprfx z18, z22\n fmax z18.s, p2/M, z18.s, z17.s\n"
- "movprfx z17, z21\n fmax z17.s, p2/M, z17.s, z16.s\n"
- "st1w { z18.s }, p0, [x10, x12, LSL #2]\n"
- "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z20.s\n"
- "st1w { z17.s }, p0, [x9, x12, LSL #2]\n"
- "st1w { z16.s }, p0, [x28, x12, LSL #2]\n"
+ "movprfx z20, z28\n fmax z20.s, p2/M, z20.s, z27.s\n"
+ "movprfx z19, z26\n fmax z19.s, p2/M, z19.s, z25.s\n"
+ "movprfx z17, z28\n fmax z17.s, p2/M, z17.s, z24.s\n"
+ "movprfx z18, z25\n fmax z18.s, p2/M, z18.s, z23.s\n"
+ "whilelt p0.s, x11, x15\n"
+ "movprfx z16, z22\n fmax z16.s, p2/M, z16.s, z20.s\n"
+ "st1w { z16.s }, p0, [x13, x11, LSL #2]\n"
+ "movprfx z16, z19\n fmax z16.s, p2/M, z16.s, z22.s\n"
+ "fmax z17.s, p2/M, z17.s, z21.s\n"
+ "st1w { z16.s }, p0, [x12, x11, LSL #2]\n"
+ "movprfx z16, z21\n fmax z16.s, p2/M, z16.s, z18.s\n"
+ "st1w { z17.s }, p0, [x10, x11, LSL #2]\n"
+ "st1w { z16.s }, p0, [x9, x11, LSL #2]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp
index 17e3e5f0ba..5f6535072b 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_fp32_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-struct sve_fp32_nhwc_max_generic_depthfirst
+struct sve_fp32_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<float, float>
{
- typedef float operand_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const float *const *const inptrs, float *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_fp32_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<float, float>;
sve_fp32_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_fp32_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
index 8166379ce4..87fc75adda 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_fp32_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -39,185 +40,184 @@ void sve_fp32_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntw x27\n"
- "cntw x26, ALL, MUL #2\n"
- "cntw x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntw x28\n"
+ "cntw x27, ALL, MUL #2\n"
+ "cntw x26, ALL, MUL #3\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
"whilelt p3.s, x28, %x[n_channels]\n"
"whilelt p2.s, x27, %x[n_channels]\n"
"whilelt p1.s, x26, %x[n_channels]\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.s, #0xff800000\n"
"mov z7.s, #0xff800000\n"
- "mov x19, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.s, #0xff800000\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.s, #0xff800000\n"
- "mov z4.s, #0xff800000\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z30.s }, p2/Z, [x22, x27, LSL #2]\n"
- "ld1w { z22.s }, p2/Z, [x21, x27, LSL #2]\n"
- "ld1w { z29.s }, p2/Z, [x20, x27, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x23, x26, LSL #2]\n"
- "ld1w { z27.s }, p1/Z, [x22, x26, LSL #2]\n"
- "ld1w { z21.s }, p1/Z, [x21, x26, LSL #2]\n"
- "ld1w { z26.s }, p1/Z, [x20, x26, LSL #2]\n"
- "ld1w { z16.s }, p0/Z, [x23, x25, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x22, x25, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x21, x25, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x20, x25, LSL #2]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "ld1w { z0.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z29.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "ld1w { z26.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n fmax z19.s, p4/M, z19.s, z2.s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n fmax z23.s, p4/M, z23.s, z0.s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n fmax z18.s, p4/M, z18.s, z30.s\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fmax z22.s, p4/M, z22.s, z29.s\n"
- "movprfx z17, z28\n fmax z17.s, p4/M, z17.s, z27.s\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "fmax z21.s, p4/M, z21.s, z26.s\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "fmax z16.s, p4/M, z16.s, z25.s\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
- "fmax z20.s, p4/M, z20.s, z24.s\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "fmax z19.s, p4/M, z19.s, z23.s\n"
- "ld1w { z30.s }, p2/Z, [x22, x27, LSL #2]\n"
- "fmax z18.s, p4/M, z18.s, z22.s\n"
- "ld1w { z22.s }, p2/Z, [x21, x27, LSL #2]\n"
- "fmax z17.s, p4/M, z17.s, z21.s\n"
- "ld1w { z29.s }, p2/Z, [x20, x27, LSL #2]\n"
- "fmax z16.s, p4/M, z16.s, z20.s\n"
- "ld1w { z28.s }, p1/Z, [x23, x26, LSL #2]\n"
- "fmax z7.s, p4/M, z7.s, z19.s\n"
- "ld1w { z27.s }, p1/Z, [x22, x26, LSL #2]\n"
- "fmax z6.s, p4/M, z6.s, z18.s\n"
- "ld1w { z21.s }, p1/Z, [x21, x26, LSL #2]\n"
- "fmax z5.s, p4/M, z5.s, z17.s\n"
- "ld1w { z26.s }, p1/Z, [x20, x26, LSL #2]\n"
- "fmax z4.s, p4/M, z4.s, z16.s\n"
- "ld1w { z16.s }, p0/Z, [x23, x25, LSL #2]\n"
- "ld1w { z25.s }, p0/Z, [x22, x25, LSL #2]\n"
- "ld1w { z20.s }, p0/Z, [x21, x25, LSL #2]\n"
- "ld1w { z24.s }, p0/Z, [x20, x25, LSL #2]\n"
+ "movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
+ "movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n fmax z18.s, p0/M, z18.s, z31.s\n"
+ "fmax z22.s, p0/M, z22.s, z30.s\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "movprfx z17, z29\n fmax z17.s, p0/M, z17.s, z28.s\n"
+ "fmax z21.s, p0/M, z21.s, z27.s\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "movprfx z16, z26\n fmax z16.s, p0/M, z16.s, z25.s\n"
+ "fmax z20.s, p0/M, z20.s, z24.s\n"
+ "ld1w { z0.s }, p3/Z, [x23, x28, LSL #2]\n"
+ "ld1w { z31.s }, p3/Z, [x22, x28, LSL #2]\n"
+ "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "fmax z18.s, p0/M, z18.s, z22.s\n"
+ "ld1w { z22.s }, p3/Z, [x21, x28, LSL #2]\n"
+ "ld1w { z30.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "fmax z17.s, p0/M, z17.s, z21.s\n"
+ "fmax z16.s, p0/M, z16.s, z20.s\n"
+ "ld1w { z29.s }, p2/Z, [x23, x27, LSL #2]\n"
+ "ld1w { z28.s }, p2/Z, [x22, x27, LSL #2]\n"
+ "subs x25, x25, #0x1\n"
+ "fmax z8.s, p0/M, z8.s, z19.s\n"
+ "ld1w { z21.s }, p2/Z, [x21, x27, LSL #2]\n"
+ "ld1w { z27.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z7.s, p0/M, z7.s, z18.s\n"
+ "fmax z6.s, p0/M, z6.s, z17.s\n"
+ "ld1w { z26.s }, p1/Z, [x23, x26, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [x22, x26, LSL #2]\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z20.s }, p1/Z, [x21, x26, LSL #2]\n"
+ "ld1w { z24.s }, p1/Z, [x20, x26, LSL #2]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n fmax z19.s, p4/M, z19.s, z2.s\n"
- "movprfx z23, z1\n fmax z23.s, p4/M, z23.s, z0.s\n"
- "movprfx z18, z31\n fmax z18.s, p4/M, z18.s, z30.s\n"
- "fmax z22.s, p4/M, z22.s, z29.s\n"
- "movprfx z17, z28\n fmax z17.s, p4/M, z17.s, z27.s\n"
- "fmax z21.s, p4/M, z21.s, z26.s\n"
- "fmax z16.s, p4/M, z16.s, z25.s\n"
- "fmax z20.s, p4/M, z20.s, z24.s\n"
- "fmax z19.s, p4/M, z19.s, z23.s\n"
- "fmax z18.s, p4/M, z18.s, z22.s\n"
- "fmax z17.s, p4/M, z17.s, z21.s\n"
- "fmax z16.s, p4/M, z16.s, z20.s\n"
- "fmax z7.s, p4/M, z7.s, z19.s\n"
- "fmax z6.s, p4/M, z6.s, z18.s\n"
- "fmax z5.s, p4/M, z5.s, z17.s\n"
- "fmax z4.s, p4/M, z4.s, z16.s\n"
+ "movprfx z19, z4\n fmax z19.s, p0/M, z19.s, z3.s\n"
+ "movprfx z23, z2\n fmax z23.s, p0/M, z23.s, z1.s\n"
+ "movprfx z18, z0\n fmax z18.s, p0/M, z18.s, z31.s\n"
+ "fmax z22.s, p0/M, z22.s, z30.s\n"
+ "movprfx z17, z29\n fmax z17.s, p0/M, z17.s, z28.s\n"
+ "fmax z21.s, p0/M, z21.s, z27.s\n"
+ "movprfx z16, z26\n fmax z16.s, p0/M, z16.s, z25.s\n"
+ "fmax z20.s, p0/M, z20.s, z24.s\n"
+ "fmax z19.s, p0/M, z19.s, z23.s\n"
+ "fmax z18.s, p0/M, z18.s, z22.s\n"
+ "fmax z17.s, p0/M, z17.s, z21.s\n"
+ "fmax z16.s, p0/M, z16.s, z20.s\n"
+ "fmax z8.s, p0/M, z8.s, z19.s\n"
+ "fmax z7.s, p0/M, z7.s, z18.s\n"
+ "fmax z6.s, p0/M, z6.s, z17.s\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fmax z7.s, p4/M, z7.s, z3.s\n"
- "ld1w { z31.s }, p2/Z, [x23, x27, LSL #2]\n"
- "ld1w { z28.s }, p1/Z, [x23, x26, LSL #2]\n"
- "fmax z6.s, p4/M, z6.s, z31.s\n"
- "ld1w { z16.s }, p0/Z, [x23, x25, LSL #2]\n"
- "fmax z5.s, p4/M, z5.s, z28.s\n"
- "fmax z4.s, p4/M, z4.s, z16.s\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
+ "ld1w { z17.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "ld1w { z16.s }, p2/Z, [x20, x27, LSL #2]\n"
+ "fmax z7.s, p0/M, z7.s, z17.s\n"
+ "fmax z6.s, p0/M, z6.s, z16.s\n"
+ "ld1w { z16.s }, p1/Z, [x20, x26, LSL #2]\n"
+ "fmax z5.s, p0/M, z5.s, z16.s\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
+ "st1w { z8.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "incw x9, ALL, MUL #4\n"
"st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
"incw x28, ALL, MUL #4\n"
"st1w { z6.s }, p2, [%x[outptr], x27, LSL #2]\n"
"incw x27, ALL, MUL #4\n"
"st1w { z5.s }, p1, [%x[outptr], x26, LSL #2]\n"
"incw x26, ALL, MUL #4\n"
- "st1w { z4.s }, p0, [%x[outptr], x25, LSL #2]\n"
- "incw x25, ALL, MUL #4\n"
- "whilelt p0.s, x25, %x[n_channels]\n"
+ "whilelt p1.s, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.s, x28, %x[n_channels]\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.s, #0xff800000\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.s, #0xff800000\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n fmax z19.s, p4/M, z19.s, z2.s\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n fmax z23.s, p4/M, z23.s, z0.s\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "fmax z19.s, p4/M, z19.s, z23.s\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "ld1w { z2.s }, p3/Z, [x22, x28, LSL #2]\n"
- "fmax z7.s, p4/M, z7.s, z19.s\n"
- "ld1w { z1.s }, p3/Z, [x21, x28, LSL #2]\n"
- "ld1w { z0.s }, p3/Z, [x20, x28, LSL #2]\n"
+ "movprfx z16, z4\n fmax z16.s, p0/M, z16.s, z3.s\n"
+ "movprfx z17, z2\n fmax z17.s, p0/M, z17.s, z1.s\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "subs x25, x25, #0x1\n"
+ "ld1w { z4.s }, p4/Z, [x23, x9, LSL #2]\n"
+ "ld1w { z3.s }, p4/Z, [x22, x9, LSL #2]\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
+ "add x24, x24, #0x20\n"
+ "ld1w { z2.s }, p4/Z, [x21, x9, LSL #2]\n"
+ "ld1w { z1.s }, p4/Z, [x20, x9, LSL #2]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n fmax z19.s, p4/M, z19.s, z2.s\n"
- "movprfx z23, z1\n fmax z23.s, p4/M, z23.s, z0.s\n"
- "fmax z19.s, p4/M, z19.s, z23.s\n"
- "fmax z7.s, p4/M, z7.s, z19.s\n"
+ "movprfx z16, z4\n fmax z16.s, p0/M, z16.s, z3.s\n"
+ "movprfx z17, z2\n fmax z17.s, p0/M, z17.s, z1.s\n"
+ "fmax z16.s, p0/M, z16.s, z17.s\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1w { z3.s }, p3/Z, [x23, x28, LSL #2]\n"
- "fmax z7.s, p4/M, z7.s, z3.s\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1w { z16.s }, p4/Z, [x20, x9, LSL #2]\n"
+ "subs x21, x21, #0x1\n"
+ "fmax z8.s, p0/M, z8.s, z16.s\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1w { z7.s }, p3, [%x[outptr], x28, LSL #2]\n"
- "incw x28\n"
- "whilelt p3.s, x28, %x[n_channels]\n"
+ "st1w { z8.s }, p4, [%x[outptr], x9, LSL #2]\n"
+ "incw x9\n"
+ "whilelt p4.s, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp
index 2ae38b5b2f..dd2ff4fd2e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-struct sve_s8_nhwc_avg_generic_depthfirst
+struct sve_s8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_s8_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
sve_s8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_s8_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
index 2ea5b90561..7925905e64 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,11 +23,12 @@
*/
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -84,30 +85,31 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x26, #0x0\n"
- "cntb x25\n"
- "cntb x24, ALL, MUL #2\n"
- "cntb x23, ALL, MUL #3\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"whilelt p3.b, x26, %x[n_channels]\n"
"whilelt p2.b, x25, %x[n_channels]\n"
"whilelt p1.b, x24, %x[n_channels]\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -122,43 +124,43 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
"mov z2.s, #0x0\n"
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
- "subs x22, x22, #0x1\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
@@ -198,219 +200,218 @@ void sve_s8_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508a3f1 // sshllb z17.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- ".inst 0x4508a7f0 // sshllt z16.h, z31.b, #0x0\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
- ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
- ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
- ".inst 0x4508a3b0 // sshllb z16.h, z29.b, #0x0\n"
- ".inst 0x4590416b // saddwb z11.s, z11.s, z16.h\n"
- ".inst 0x4590454a // saddwt z10.s, z10.s, z16.h\n"
- ".inst 0x4508a7b0 // sshllt z16.h, z29.b, #0x0\n"
- ".inst 0x45904129 // saddwb z9.s, z9.s, z16.h\n"
- ".inst 0x45904508 // saddwt z8.s, z8.s, z16.h\n"
- ".inst 0x4508a370 // sshllb z16.h, z27.b, #0x0\n"
- ".inst 0x459040e7 // saddwb z7.s, z7.s, z16.h\n"
- ".inst 0x459044c6 // saddwt z6.s, z6.s, z16.h\n"
- ".inst 0x4508a770 // sshllt z16.h, z27.b, #0x0\n"
- ".inst 0x459040a5 // saddwb z5.s, z5.s, z16.h\n"
- ".inst 0x45904484 // saddwt z4.s, z4.s, z16.h\n"
- ".inst 0x4508a330 // sshllb z16.h, z25.b, #0x0\n"
- ".inst 0x45904063 // saddwb z3.s, z3.s, z16.h\n"
- ".inst 0x45904442 // saddwt z2.s, z2.s, z16.h\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z20.s, #0x7f\n"
- "ld1rw { z17.s }, p4/Z, [%x[rescale_ptr]]\n"
- "ld1rw { z16.s }, p4/Z, [%x[shift_ptr]]\n"
- "not z19.s, p4/M, z20.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- ".inst 0x4482920b // srshl z11.s, p4/M, z11.s, z16.s\n"
- ".inst 0x4482920a // srshl z10.s, p4/M, z10.s, z16.s\n"
- ".inst 0x44829209 // srshl z9.s, p4/M, z9.s, z16.s\n"
- ".inst 0x44829208 // srshl z8.s, p4/M, z8.s, z16.s\n"
- ".inst 0x44829207 // srshl z7.s, p4/M, z7.s, z16.s\n"
- ".inst 0x44829206 // srshl z6.s, p4/M, z6.s, z16.s\n"
- ".inst 0x44829205 // srshl z5.s, p4/M, z5.s, z16.s\n"
- ".inst 0x44829204 // srshl z4.s, p4/M, z4.s, z16.s\n"
- ".inst 0x44829203 // srshl z3.s, p4/M, z3.s, z16.s\n"
- ".inst 0x44829202 // srshl z2.s, p4/M, z2.s, z16.s\n"
- ".inst 0x44829201 // srshl z1.s, p4/M, z1.s, z16.s\n"
- ".inst 0x44829200 // srshl z0.s, p4/M, z0.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z19.s\n"
- "smax z14.s, p4/M, z14.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z19.s\n"
- "smax z12.s, p4/M, z12.s, z19.s\n"
- "smin z15.s, p4/M, z15.s, z20.s\n"
- "smin z14.s, p4/M, z14.s, z20.s\n"
- "smin z13.s, p4/M, z13.s, z20.s\n"
- "smin z12.s, p4/M, z12.s, z20.s\n"
- "smax z11.s, p4/M, z11.s, z19.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smax z10.s, p4/M, z10.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smin z11.s, p4/M, z11.s, z20.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z10.s, p4/M, z10.s, z20.s\n"
- "incb x26, ALL, MUL #4\n"
- "smax z9.s, p4/M, z9.s, z19.s\n"
- "smax z8.s, p4/M, z8.s, z19.s\n"
- "smax z7.s, p4/M, z7.s, z19.s\n"
- "smax z6.s, p4/M, z6.s, z19.s\n"
- "trn1 z18.h, z11.h, z10.h\n"
- "smin z9.s, p4/M, z9.s, z20.s\n"
- "smin z8.s, p4/M, z8.s, z20.s\n"
- "smin z7.s, p4/M, z7.s, z20.s\n"
- "smin z6.s, p4/M, z6.s, z20.s\n"
- "smax z5.s, p4/M, z5.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "trn1 z17.h, z11.h, z10.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "smax z4.s, p4/M, z4.s, z19.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
"trn1 z17.h, z7.h, z6.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z5.s, p4/M, z5.s, z20.s\n"
- "incb x25, ALL, MUL #4\n"
- "smin z4.s, p4/M, z4.s, z20.s\n"
- "smax z3.s, p4/M, z3.s, z19.s\n"
- "smax z2.s, p4/M, z2.s, z19.s\n"
- "smax z1.s, p4/M, z1.s, z19.s\n"
- "smax z0.s, p4/M, z0.s, z19.s\n"
+ "st1b { z16.b }, p3, [%x[outptr], x26]\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
"trn1 z16.h, z5.h, z4.h\n"
- "smin z3.s, p4/M, z3.s, z20.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "smin z2.s, p4/M, z2.s, z20.s\n"
- "incb x24, ALL, MUL #4\n"
- "smin z1.s, p4/M, z1.s, z20.s\n"
- "smin z0.s, p4/M, z0.s, z20.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
"trn1 z17.h, z3.h, z2.h\n"
+ "st1b { z16.b }, p2, [%x[outptr], x25]\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z16.h, z1.h, z0.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x23]\n"
- "incb x23, ALL, MUL #4\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "incb x27, ALL, MUL #4\n"
+ "incb x26, ALL, MUL #4\n"
+ "incb x25, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- "subs x22, x22, #0x1\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508a3f1 // sshllb z17.h, z31.b, #0x0\n"
- ".inst 0x4508a7f0 // sshllt z16.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z20.s, #0x7f\n"
- "ld1rw { z17.s }, p4/Z, [%x[rescale_ptr]]\n"
- "ld1rw { z16.s }, p4/Z, [%x[shift_ptr]]\n"
- "not z19.s, p4/M, z20.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z19.s\n"
- "smax z14.s, p4/M, z14.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z19.s\n"
- "smax z12.s, p4/M, z12.s, z19.s\n"
- "smin z15.s, p4/M, z15.s, z20.s\n"
- "smin z14.s, p4/M, z14.s, z20.s\n"
- "smin z13.s, p4/M, z13.s, z20.s\n"
- "smin z12.s, p4/M, z12.s, z20.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "incb x26\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 071e79c93d..ac842ac623 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst
+struct sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
+ using Parent = DepthfirstStrategy<int8_t, int8_t>;
- typedef void (*kern_type)(unsigned int, const int8_t *const *const, int8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index bdf3f53292..5681cc1f3d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -63,84 +63,84 @@ void sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x14, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, x15\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "whilelt p1.b, x13, x14\n"
- "ldp x9, x28, [x20, #0x10]\n"
- "ldp x27, x26, [x19, #0x0]\n"
- "ldp x25, x24, [x19, #0x10]\n"
- "ldp x23, x22, [x19, #0x20]\n"
- "ldp x21, x20, [x19, #0x30]\n"
- "ldr x19, [x19, #0x40]\n"
- "ld1b { z31.b }, p1/Z, [x26, x13]\n"
- "ld1b { z30.b }, p1/Z, [x23, x13]\n"
- "ld1b { z29.b }, p1/Z, [x20, x13]\n"
- "ld1b { z28.b }, p1/Z, [x24, x13]\n"
- "ld1b { z27.b }, p1/Z, [x27, x13]\n"
- "ld1b { z26.b }, p1/Z, [x22, x13]\n"
- "ld1b { z25.b }, p1/Z, [x25, x13]\n"
- "ld1b { z24.b }, p1/Z, [x21, x13]\n"
- "ld1b { z23.b }, p1/Z, [x19, x13]\n"
- "incw x13\n"
- "whilelt p1.b, x13, x14\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p0/Z, [x24, x14]\n"
+ "ld1b { z29.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x14]\n"
+ "incw x14\n"
+ "whilelt p1.b, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
"movprfx z22, z31\n smax z22.b, p2/M, z22.b, z30.b\n"
- "ld1b { z31.b }, p1/Z, [x26, x13]\n"
- "whilelt p0.b, x12, x14\n"
"movprfx z21, z30\n smax z21.b, p2/M, z21.b, z29.b\n"
- "ld1b { z30.b }, p1/Z, [x23, x13]\n"
- "movprfx z18, z28\n smax z18.b, p2/M, z18.b, z27.b\n"
- "ld1b { z29.b }, p1/Z, [x20, x13]\n"
- "movprfx z17, z26\n smax z17.b, p2/M, z17.b, z25.b\n"
- "ld1b { z27.b }, p1/Z, [x27, x13]\n"
- "movprfx z16, z24\n smax z16.b, p2/M, z16.b, z28.b\n"
- "ld1b { z28.b }, p1/Z, [x24, x13]\n"
- "movprfx z20, z26\n smax z20.b, p2/M, z20.b, z23.b\n"
- "ld1b { z26.b }, p1/Z, [x22, x13]\n"
- "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
- "ld1b { z25.b }, p1/Z, [x25, x13]\n"
- "movprfx z18, z22\n smax z18.b, p2/M, z18.b, z17.b\n"
- "ld1b { z24.b }, p1/Z, [x21, x13]\n"
- "movprfx z17, z21\n smax z17.b, p2/M, z17.b, z16.b\n"
- "ld1b { z23.b }, p1/Z, [x19, x13]\n"
- "incw x13\n"
- "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z19.b }, p0, [x11, x12]\n"
- "whilelt p1.b, x13, x14\n"
- "st1b { z18.b }, p0, [x10, x12]\n"
- "st1b { z17.b }, p0, [x9, x12]\n"
- "st1b { z16.b }, p0, [x28, x12]\n"
- "incw x12\n"
+ "ld1b { z31.b }, p1/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p1/Z, [x24, x14]\n"
+ "movprfx z20, z28\n smax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n smax z19.b, p2/M, z19.b, z25.b\n"
+ "ld1b { z29.b }, p1/Z, [x21, x14]\n"
+ "ld1b { z27.b }, p1/Z, [x28, x14]\n"
+ "movprfx z17, z28\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z23.b\n"
+ "ld1b { z28.b }, p1/Z, [x25, x14]\n"
+ "ld1b { z26.b }, p1/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p1/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x22, x14]\n"
+ "whilelt p0.b, x11, x15\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z23.b }, p1/Z, [x20, x14]\n"
+ "incw x14\n"
+ "whilelt p1.b, x14, x15\n"
+ "st1b { z16.b }, p0, [x13, x11]\n"
+ "movprfx z16, z19\n smax z16.b, p2/M, z16.b, z22.b\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
+ "st1b { z16.b }, p0, [x12, x11]\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z18.b\n"
+ "st1b { z17.b }, p0, [x10, x11]\n"
+ "st1b { z16.b }, p0, [x9, x11]\n"
+ "incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
"movprfx z22, z31\n smax z22.b, p2/M, z22.b, z30.b\n"
- "whilelt p0.b, x12, x14\n"
"movprfx z21, z30\n smax z21.b, p2/M, z21.b, z29.b\n"
- "movprfx z18, z28\n smax z18.b, p2/M, z18.b, z27.b\n"
- "movprfx z17, z26\n smax z17.b, p2/M, z17.b, z25.b\n"
- "movprfx z16, z24\n smax z16.b, p2/M, z16.b, z28.b\n"
- "movprfx z20, z26\n smax z20.b, p2/M, z20.b, z23.b\n"
- "movprfx z19, z22\n smax z19.b, p2/M, z19.b, z18.b\n"
- "st1b { z19.b }, p0, [x11, x12]\n"
- "movprfx z18, z22\n smax z18.b, p2/M, z18.b, z17.b\n"
- "movprfx z17, z21\n smax z17.b, p2/M, z17.b, z16.b\n"
- "st1b { z18.b }, p0, [x10, x12]\n"
- "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z17.b }, p0, [x9, x12]\n"
- "st1b { z16.b }, p0, [x28, x12]\n"
+ "movprfx z20, z28\n smax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n smax z19.b, p2/M, z19.b, z25.b\n"
+ "movprfx z17, z28\n smax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n smax z18.b, p2/M, z18.b, z23.b\n"
+ "whilelt p0.b, x11, x15\n"
+ "movprfx z16, z22\n smax z16.b, p2/M, z16.b, z20.b\n"
+ "st1b { z16.b }, p0, [x13, x11]\n"
+ "movprfx z16, z19\n smax z16.b, p2/M, z16.b, z22.b\n"
+ "smax z17.b, p2/M, z17.b, z21.b\n"
+ "st1b { z16.b }, p0, [x12, x11]\n"
+ "movprfx z16, z21\n smax z16.b, p2/M, z16.b, z18.b\n"
+ "st1b { z17.b }, p0, [x10, x11]\n"
+ "st1b { z16.b }, p0, [x9, x11]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp
index 428902ad61..2ee5bc0527 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-struct sve_s8_nhwc_max_generic_depthfirst
+struct sve_s8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_s8_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t>;
sve_s8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_s8_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
index 3e88c8729c..da9e1408f9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -39,185 +40,184 @@ void sve_s8_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntb x27\n"
- "cntb x26, ALL, MUL #2\n"
- "cntb x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"whilelt p3.b, x28, %x[n_channels]\n"
"whilelt p2.b, x27, %x[n_channels]\n"
"whilelt p1.b, x26, %x[n_channels]\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x80\n"
"mov z7.b, #0x80\n"
- "mov x19, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x80\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x80\n"
- "mov z4.b, #0x80\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "smax z21.b, p4/M, z21.b, z26.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "smax z16.b, p4/M, z16.b, z25.b\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "smax z20.b, p4/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "smax z18.b, p4/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "smax z17.b, p4/M, z17.b, z21.b\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "smax z16.b, p4/M, z16.b, z20.b\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "smax z7.b, p4/M, z7.b, z19.b\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "smax z6.b, p4/M, z6.b, z18.b\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "smax z5.b, p4/M, z5.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "smax z4.b, p4/M, z4.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
+ "smax z22.b, p0/M, z22.b, z30.b\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
+ "smax z21.b, p0/M, z21.b, z27.b\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z19.b\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z18.b\n"
+ "smax z6.b, p0/M, z6.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
- "smax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
- "smax z21.b, p4/M, z21.b, z26.b\n"
- "smax z16.b, p4/M, z16.b, z25.b\n"
- "smax z20.b, p4/M, z20.b, z24.b\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "smax z18.b, p4/M, z18.b, z22.b\n"
- "smax z17.b, p4/M, z17.b, z21.b\n"
- "smax z16.b, p4/M, z16.b, z20.b\n"
- "smax z7.b, p4/M, z7.b, z19.b\n"
- "smax z6.b, p4/M, z6.b, z18.b\n"
- "smax z5.b, p4/M, z5.b, z17.b\n"
- "smax z4.b, p4/M, z4.b, z16.b\n"
+ "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
+ "smax z22.b, p0/M, z22.b, z30.b\n"
+ "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
+ "smax z21.b, p0/M, z21.b, z27.b\n"
+ "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "smax z8.b, p0/M, z8.b, z19.b\n"
+ "smax z7.b, p0/M, z7.b, z18.b\n"
+ "smax z6.b, p0/M, z6.b, z17.b\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z7.b, p4/M, z7.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "smax z6.b, p4/M, z6.b, z31.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "smax z5.b, p4/M, z5.b, z28.b\n"
- "smax z4.b, p4/M, z4.b, z16.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z17.b\n"
+ "smax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
+ "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
"st1b { z7.b }, p3, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
"st1b { z6.b }, p2, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
"st1b { z5.b }, p1, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
- "st1b { z4.b }, p0, [%x[outptr], x25]\n"
- "incb x25, ALL, MUL #4\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.b, #0x80\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x80\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "smax z7.b, p4/M, z7.b, z19.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "smax z7.b, p4/M, z7.b, z19.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z7.b, p4/M, z7.b, z3.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1b { z7.b }, p3, [%x[outptr], x28]\n"
- "incb x28\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp
index 1242eaf530..6f34faa121 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-struct sve_s8q_nhwc_avg_generic_depthfirst
+struct sve_s8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_s8q_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
sve_s8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_s8q_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
index 928eb412b5..19a3b112ad 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,12 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -86,12 +87,13 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
// Combine together the rescale value for the requantization and the scaling
@@ -112,21 +114,21 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
);
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x26, #0x0\n"
- "cntb x25\n"
- "cntb x24, ALL, MUL #2\n"
- "cntb x23, ALL, MUL #3\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"whilelt p3.b, x26, %x[n_channels]\n"
"whilelt p2.b, x25, %x[n_channels]\n"
"whilelt p1.b, x24, %x[n_channels]\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -141,43 +143,43 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
"mov z2.s, #0x0\n"
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
".inst 0x455c03b5 // saddlb z21.h, z29.b, z28.b\n"
- "subs x22, x22, #0x1\n"
".inst 0x455c07b4 // saddlt z20.h, z29.b, z28.b\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0373 // saddlb z19.h, z27.b, z26.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
".inst 0x455a0772 // saddlt z18.h, z27.b, z26.b\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580331 // saddlb z17.h, z25.b, z24.b\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
".inst 0x45580730 // saddlt z16.h, z25.b, z24.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
@@ -217,241 +219,240 @@ void sve_s8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508a3f1 // sshllb z17.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- ".inst 0x4508a7f0 // sshllt z16.h, z31.b, #0x0\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
- ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
- ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
- ".inst 0x4508a3b0 // sshllb z16.h, z29.b, #0x0\n"
- ".inst 0x4590416b // saddwb z11.s, z11.s, z16.h\n"
- ".inst 0x4590454a // saddwt z10.s, z10.s, z16.h\n"
- ".inst 0x4508a7b0 // sshllt z16.h, z29.b, #0x0\n"
- ".inst 0x45904129 // saddwb z9.s, z9.s, z16.h\n"
- ".inst 0x45904508 // saddwt z8.s, z8.s, z16.h\n"
- ".inst 0x4508a370 // sshllb z16.h, z27.b, #0x0\n"
- ".inst 0x459040e7 // saddwb z7.s, z7.s, z16.h\n"
- ".inst 0x459044c6 // saddwt z6.s, z6.s, z16.h\n"
- ".inst 0x4508a770 // sshllt z16.h, z27.b, #0x0\n"
- ".inst 0x459040a5 // saddwb z5.s, z5.s, z16.h\n"
- ".inst 0x45904484 // saddwt z4.s, z4.s, z16.h\n"
- ".inst 0x4508a330 // sshllb z16.h, z25.b, #0x0\n"
- ".inst 0x45904063 // saddwb z3.s, z3.s, z16.h\n"
- ".inst 0x45904442 // saddwt z2.s, z2.s, z16.h\n"
- ".inst 0x4508a730 // sshllt z16.h, z25.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a217 // sshllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508a616 // sshllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508a215 // sshllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508a614 // sshllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508a233 // sshllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508a632 // sshllt z18.h, z17.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595416b // saddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x4595454a // saddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944129 // saddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944508 // saddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459340e7 // saddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x459344c6 // saddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459240a5 // saddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924484 // saddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914063 // saddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914442 // saddwt z2.s, z2.s, z17.h\n"
".inst 0x45904021 // saddwb z1.s, z1.s, z16.h\n"
".inst 0x45904400 // saddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z20.s, #0x7f\n"
- "ld1rw { z18.s }, p4/Z, [%x[combined_rescale_value]]\n"
- "ld1rw { z17.s }, p4/Z, [%x[left_shift]]\n"
- "not z19.s, p4/M, z20.s\n"
- "ld1rw { z16.s }, p4/Z, [%x[right_shift]]\n"
- ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
- ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
- ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
- ".inst 0x4482922b // srshl z11.s, p4/M, z11.s, z17.s\n"
- ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
- ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
- ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
- ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
- ".inst 0x04b2756b // sqrdmulh z11.s, z11.s, z18.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- ".inst 0x4482920b // srshl z11.s, p4/M, z11.s, z16.s\n"
- ".inst 0x4482922a // srshl z10.s, p4/M, z10.s, z17.s\n"
- ".inst 0x44829229 // srshl z9.s, p4/M, z9.s, z17.s\n"
- ".inst 0x44829228 // srshl z8.s, p4/M, z8.s, z17.s\n"
- ".inst 0x44829227 // srshl z7.s, p4/M, z7.s, z17.s\n"
- ".inst 0x04b2754a // sqrdmulh z10.s, z10.s, z18.s\n"
- ".inst 0x04b27529 // sqrdmulh z9.s, z9.s, z18.s\n"
- ".inst 0x04b27508 // sqrdmulh z8.s, z8.s, z18.s\n"
- ".inst 0x04b274e7 // sqrdmulh z7.s, z7.s, z18.s\n"
- ".inst 0x4482920a // srshl z10.s, p4/M, z10.s, z16.s\n"
- ".inst 0x44829209 // srshl z9.s, p4/M, z9.s, z16.s\n"
- ".inst 0x44829208 // srshl z8.s, p4/M, z8.s, z16.s\n"
- ".inst 0x44829207 // srshl z7.s, p4/M, z7.s, z16.s\n"
- ".inst 0x44829226 // srshl z6.s, p4/M, z6.s, z17.s\n"
- ".inst 0x44829225 // srshl z5.s, p4/M, z5.s, z17.s\n"
- ".inst 0x44829224 // srshl z4.s, p4/M, z4.s, z17.s\n"
- ".inst 0x44829223 // srshl z3.s, p4/M, z3.s, z17.s\n"
- ".inst 0x04b274c6 // sqrdmulh z6.s, z6.s, z18.s\n"
- ".inst 0x04b274a5 // sqrdmulh z5.s, z5.s, z18.s\n"
- ".inst 0x04b27484 // sqrdmulh z4.s, z4.s, z18.s\n"
- ".inst 0x04b27463 // sqrdmulh z3.s, z3.s, z18.s\n"
- ".inst 0x44829206 // srshl z6.s, p4/M, z6.s, z16.s\n"
- ".inst 0x44829205 // srshl z5.s, p4/M, z5.s, z16.s\n"
- ".inst 0x44829204 // srshl z4.s, p4/M, z4.s, z16.s\n"
- ".inst 0x44829203 // srshl z3.s, p4/M, z3.s, z16.s\n"
- ".inst 0x44829222 // srshl z2.s, p4/M, z2.s, z17.s\n"
- ".inst 0x44829221 // srshl z1.s, p4/M, z1.s, z17.s\n"
- ".inst 0x44829220 // srshl z0.s, p4/M, z0.s, z17.s\n"
- "smax z15.s, p4/M, z15.s, z19.s\n"
- ".inst 0x04b27442 // sqrdmulh z2.s, z2.s, z18.s\n"
- ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
- ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
- "smin z15.s, p4/M, z15.s, z20.s\n"
- ".inst 0x44829202 // srshl z2.s, p4/M, z2.s, z16.s\n"
- ".inst 0x44829201 // srshl z1.s, p4/M, z1.s, z16.s\n"
- ".inst 0x44829200 // srshl z0.s, p4/M, z0.s, z16.s\n"
- "smax z14.s, p4/M, z14.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z19.s\n"
- "smax z12.s, p4/M, z12.s, z19.s\n"
- "smax z11.s, p4/M, z11.s, z19.s\n"
- "smin z14.s, p4/M, z14.s, z20.s\n"
- "smin z13.s, p4/M, z13.s, z20.s\n"
- "smin z12.s, p4/M, z12.s, z20.s\n"
- "smin z11.s, p4/M, z11.s, z20.s\n"
+ "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
+ ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
+ ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
+ ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x4482824b // srshl z11.s, p0/M, z11.s, z18.s\n"
+ ".inst 0x4482824a // srshl z10.s, p0/M, z10.s, z18.s\n"
+ ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x44828249 // srshl z9.s, p0/M, z9.s, z18.s\n"
+ ".inst 0x44828248 // srshl z8.s, p0/M, z8.s, z18.s\n"
+ ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ ".inst 0x04b1756b // sqrdmulh z11.s, z11.s, z17.s\n"
+ ".inst 0x44828247 // srshl z7.s, p0/M, z7.s, z18.s\n"
+ ".inst 0x44828246 // srshl z6.s, p0/M, z6.s, z18.s\n"
+ ".inst 0x04b1754a // sqrdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x04b17529 // sqrdmulh z9.s, z9.s, z17.s\n"
+ ".inst 0x44828245 // srshl z5.s, p0/M, z5.s, z18.s\n"
+ ".inst 0x44828244 // srshl z4.s, p0/M, z4.s, z18.s\n"
+ ".inst 0x04b17508 // sqrdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x04b174e7 // sqrdmulh z7.s, z7.s, z17.s\n"
+ ".inst 0x44828243 // srshl z3.s, p0/M, z3.s, z18.s\n"
+ ".inst 0x44828242 // srshl z2.s, p0/M, z2.s, z18.s\n"
+ ".inst 0x04b174c6 // sqrdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x04b174a5 // sqrdmulh z5.s, z5.s, z17.s\n"
+ ".inst 0x44828241 // srshl z1.s, p0/M, z1.s, z18.s\n"
+ ".inst 0x44828240 // srshl z0.s, p0/M, z0.s, z18.s\n"
+ ".inst 0x04b17484 // sqrdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x04b17463 // sqrdmulh z3.s, z3.s, z17.s\n"
+ ".inst 0x04b17442 // sqrdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x04b17421 // sqrdmulh z1.s, z1.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x04b17400 // sqrdmulh z0.s, z0.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smax z10.s, p4/M, z10.s, z19.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smax z9.s, p4/M, z9.s, z19.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z10.s, p4/M, z10.s, z20.s\n"
- "incb x26, ALL, MUL #4\n"
- "smin z9.s, p4/M, z9.s, z20.s\n"
- "smax z8.s, p4/M, z8.s, z19.s\n"
- "smax z7.s, p4/M, z7.s, z19.s\n"
- "smax z6.s, p4/M, z6.s, z19.s\n"
- "trn1 z18.h, z11.h, z10.h\n"
- "smin z8.s, p4/M, z8.s, z20.s\n"
- "smin z7.s, p4/M, z7.s, z20.s\n"
- "smin z6.s, p4/M, z6.s, z20.s\n"
- "smax z5.s, p4/M, z5.s, z19.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "trn1 z17.h, z11.h, z10.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "smax z4.s, p4/M, z4.s, z19.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
"trn1 z17.h, z7.h, z6.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z5.s, p4/M, z5.s, z20.s\n"
- "incb x25, ALL, MUL #4\n"
- "smin z4.s, p4/M, z4.s, z20.s\n"
- "smax z3.s, p4/M, z3.s, z19.s\n"
- "smax z2.s, p4/M, z2.s, z19.s\n"
- "smax z1.s, p4/M, z1.s, z19.s\n"
- "smax z0.s, p4/M, z0.s, z19.s\n"
+ "st1b { z16.b }, p3, [%x[outptr], x26]\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
"trn1 z16.h, z5.h, z4.h\n"
- "smin z3.s, p4/M, z3.s, z20.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "smin z2.s, p4/M, z2.s, z20.s\n"
- "incb x24, ALL, MUL #4\n"
- "smin z1.s, p4/M, z1.s, z20.s\n"
- "smin z0.s, p4/M, z0.s, z20.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
"trn1 z17.h, z3.h, z2.h\n"
+ "st1b { z16.b }, p2, [%x[outptr], x25]\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z16.h, z1.h, z0.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x23]\n"
- "incb x23, ALL, MUL #4\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "incb x27, ALL, MUL #4\n"
+ "incb x26, ALL, MUL #4\n"
+ "incb x25, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- "subs x22, x22, #0x1\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e03f7 // saddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e07f6 // saddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459741ef // saddwb z15.s, z15.s, z23.h\n"
- ".inst 0x459745ce // saddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459641ad // saddwb z13.s, z13.s, z22.h\n"
- ".inst 0x4596458c // saddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e03f1 // saddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e07f0 // saddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508a3f1 // sshllb z17.h, z31.b, #0x0\n"
- ".inst 0x4508a7f0 // sshllt z16.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508a211 // sshllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508a610 // sshllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
".inst 0x459141ef // saddwb z15.s, z15.s, z17.h\n"
".inst 0x459145ce // saddwt z14.s, z14.s, z17.h\n"
".inst 0x459041ad // saddwb z13.s, z13.s, z16.h\n"
".inst 0x4590458c // saddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z20.s, #0x7f\n"
- "ld1rw { z18.s }, p4/Z, [%x[combined_rescale_value]]\n"
- "ld1rw { z17.s }, p4/Z, [%x[left_shift]]\n"
- "not z19.s, p4/M, z20.s\n"
- "ld1rw { z16.s }, p4/Z, [%x[right_shift]]\n"
- ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
- ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
- ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
- ".inst 0x04b275ef // sqrdmulh z15.s, z15.s, z18.s\n"
- ".inst 0x04b275ce // sqrdmulh z14.s, z14.s, z18.s\n"
- ".inst 0x04b275ad // sqrdmulh z13.s, z13.s, z18.s\n"
- ".inst 0x04b2758c // sqrdmulh z12.s, z12.s, z18.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z19.s\n"
- "smax z14.s, p4/M, z14.s, z19.s\n"
- "smax z13.s, p4/M, z13.s, z19.s\n"
- "smax z12.s, p4/M, z12.s, z19.s\n"
- "smin z15.s, p4/M, z15.s, z20.s\n"
- "smin z14.s, p4/M, z14.s, z20.s\n"
- "smin z13.s, p4/M, z13.s, z20.s\n"
- "smin z12.s, p4/M, z12.s, z20.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b175ef // sqrdmulh z15.s, z15.s, z17.s\n"
+ ".inst 0x04b175ce // sqrdmulh z14.s, z14.s, z17.s\n"
+ ".inst 0x04b175ad // sqrdmulh z13.s, z13.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x04b1758c // sqrdmulh z12.s, z12.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "incb x26\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [right_shift] "r" (&right_shift)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp
index 84aa0d3d6b..fc06ed09f6 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_s8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-struct sve_s8q_nhwc_max_generic_depthfirst
+struct sve_s8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>
{
- typedef int8_t operand_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const int8_t *const *const inptrs, int8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_s8q_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<int8_t, int8_t, Requantize32>;
sve_s8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_s8q_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
index 3717f8cb30..4fc1532d5a 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_s8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,9 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -41,346 +42,345 @@ void sve_s8q_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntb x27\n"
- "cntb x26, ALL, MUL #2\n"
- "cntb x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"whilelt p3.b, x28, %x[n_channels]\n"
"whilelt p2.b, x27, %x[n_channels]\n"
"whilelt p1.b, x26, %x[n_channels]\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
- "mov x19, %x[inptrs]\n"
"mov z7.b, #0x80\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x80\n"
"mov z5.b, #0x80\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "smax z21.b, p4/M, z21.b, z26.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "smax z16.b, p4/M, z16.b, z25.b\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "smax z20.b, p4/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "smax z18.b, p4/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "smax z17.b, p4/M, z17.b, z21.b\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "smax z16.b, p4/M, z16.b, z20.b\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "smax z8.b, p4/M, z8.b, z19.b\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "smax z7.b, p4/M, z7.b, z18.b\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "smax z6.b, p4/M, z6.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "smax z5.b, p4/M, z5.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
+ "smax z22.b, p0/M, z22.b, z30.b\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
+ "smax z21.b, p0/M, z21.b, z27.b\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z19.b\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z18.b\n"
+ "smax z6.b, p0/M, z6.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "movprfx z18, z31\n smax z18.b, p4/M, z18.b, z30.b\n"
- "smax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n smax z17.b, p4/M, z17.b, z27.b\n"
- "smax z21.b, p4/M, z21.b, z26.b\n"
- "smax z16.b, p4/M, z16.b, z25.b\n"
- "smax z20.b, p4/M, z20.b, z24.b\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "smax z18.b, p4/M, z18.b, z22.b\n"
- "smax z17.b, p4/M, z17.b, z21.b\n"
- "smax z16.b, p4/M, z16.b, z20.b\n"
- "smax z8.b, p4/M, z8.b, z19.b\n"
- "smax z7.b, p4/M, z7.b, z18.b\n"
- "smax z6.b, p4/M, z6.b, z17.b\n"
- "smax z5.b, p4/M, z5.b, z16.b\n"
+ "movprfx z19, z4\n smax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n smax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z18, z0\n smax z18.b, p0/M, z18.b, z31.b\n"
+ "smax z22.b, p0/M, z22.b, z30.b\n"
+ "movprfx z17, z29\n smax z17.b, p0/M, z17.b, z28.b\n"
+ "smax z21.b, p0/M, z21.b, z27.b\n"
+ "movprfx z16, z26\n smax z16.b, p0/M, z16.b, z25.b\n"
+ "smax z20.b, p0/M, z20.b, z24.b\n"
+ "smax z19.b, p0/M, z19.b, z23.b\n"
+ "smax z18.b, p0/M, z18.b, z22.b\n"
+ "smax z17.b, p0/M, z17.b, z21.b\n"
+ "smax z16.b, p0/M, z16.b, z20.b\n"
+ "smax z8.b, p0/M, z8.b, z19.b\n"
+ "smax z7.b, p0/M, z7.b, z18.b\n"
+ "smax z6.b, p0/M, z6.b, z17.b\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z8.b, p4/M, z8.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "smax z7.b, p4/M, z7.b, z31.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "smax z6.b, p4/M, z6.b, z28.b\n"
- "smax z5.b, p4/M, z5.b, z16.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "smax z7.b, p0/M, z7.b, z17.b\n"
+ "smax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "smax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z4.s, #0x7f\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z3.s }, p4/Z, [x19]\n"
".inst 0x4508a111 // sshllb z17.h, z8.b, #0x0\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508a510 // sshllt z16.h, z8.b, #0x0\n"
- "ld1rw { z2.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- ".inst 0x4508a0f2 // sshllb z18.h, z7.b, #0x0\n"
- "ld1rw { z1.s }, p4/Z, [x19]\n"
- ".inst 0x4508a4f7 // sshllt z23.h, z7.b, #0x0\n"
- ".inst 0x4508a0d6 // sshllb z22.h, z6.b, #0x0\n"
- ".inst 0x4508a4d5 // sshllt z21.h, z6.b, #0x0\n"
- ".inst 0x4508a0b4 // sshllb z20.h, z5.b, #0x0\n"
- ".inst 0x4508a4b3 // sshllt z19.h, z5.b, #0x0\n"
- ".inst 0x4510a220 // sshllb z0.s, z17.h, #0x0\n"
+ ".inst 0x4508a517 // sshllt z23.h, z8.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z4.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a0f6 // sshllb z22.h, z7.b, #0x0\n"
+ ".inst 0x4508a4f5 // sshllt z21.h, z7.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a0d4 // sshllb z20.h, z6.b, #0x0\n"
+ ".inst 0x4508a4d3 // sshllt z19.h, z6.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a0b2 // sshllb z18.h, z5.b, #0x0\n"
+ ".inst 0x4508a4b0 // sshllt z16.h, z5.b, #0x0\n"
+ ".inst 0x4510a221 // sshllb z1.s, z17.h, #0x0\n"
".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
- ".inst 0x4510a21f // sshllb z31.s, z16.h, #0x0\n"
- ".inst 0x4510a610 // sshllt z16.s, z16.h, #0x0\n"
- ".inst 0x4510a25e // sshllb z30.s, z18.h, #0x0\n"
- ".inst 0x4510a652 // sshllt z18.s, z18.h, #0x0\n"
- ".inst 0x4510a2fd // sshllb z29.s, z23.h, #0x0\n"
- ".inst 0x4510a6fc // sshllt z28.s, z23.h, #0x0\n"
- ".inst 0x4510a2db // sshllb z27.s, z22.h, #0x0\n"
- ".inst 0x4510a6da // sshllt z26.s, z22.h, #0x0\n"
- ".inst 0x4510a2b9 // sshllb z25.s, z21.h, #0x0\n"
- ".inst 0x4510a6b8 // sshllt z24.s, z21.h, #0x0\n"
- ".inst 0x4510a297 // sshllb z23.s, z20.h, #0x0\n"
- ".inst 0x4510a696 // sshllt z22.s, z20.h, #0x0\n"
- ".inst 0x4510a275 // sshllb z21.s, z19.h, #0x0\n"
- ".inst 0x4510a674 // sshllt z20.s, z19.h, #0x0\n"
- ".inst 0x44829040 // srshl z0.s, p4/M, z0.s, z2.s\n"
- ".inst 0x44829051 // srshl z17.s, p4/M, z17.s, z2.s\n"
- ".inst 0x4482905f // srshl z31.s, p4/M, z31.s, z2.s\n"
- ".inst 0x44829050 // srshl z16.s, p4/M, z16.s, z2.s\n"
- ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
+ ".inst 0x44828081 // srshl z1.s, p0/M, z1.s, z4.s\n"
+ ".inst 0x44828091 // srshl z17.s, p0/M, z17.s, z4.s\n"
+ ".inst 0x4510a2e0 // sshllb z0.s, z23.h, #0x0\n"
+ ".inst 0x4510a6ff // sshllt z31.s, z23.h, #0x0\n"
+ ".inst 0x44828080 // srshl z0.s, p0/M, z0.s, z4.s\n"
+ ".inst 0x4482809f // srshl z31.s, p0/M, z31.s, z4.s\n"
+ ".inst 0x4510a2de // sshllb z30.s, z22.h, #0x0\n"
+ ".inst 0x4510a6dd // sshllt z29.s, z22.h, #0x0\n"
+ ".inst 0x4482809e // srshl z30.s, p0/M, z30.s, z4.s\n"
+ ".inst 0x4482809d // srshl z29.s, p0/M, z29.s, z4.s\n"
+ ".inst 0x4510a2bc // sshllb z28.s, z21.h, #0x0\n"
+ ".inst 0x4510a6bb // sshllt z27.s, z21.h, #0x0\n"
+ ".inst 0x4482809c // srshl z28.s, p0/M, z28.s, z4.s\n"
+ ".inst 0x4482809b // srshl z27.s, p0/M, z27.s, z4.s\n"
+ ".inst 0x4510a29a // sshllb z26.s, z20.h, #0x0\n"
+ ".inst 0x4510a699 // sshllt z25.s, z20.h, #0x0\n"
+ ".inst 0x4482809a // srshl z26.s, p0/M, z26.s, z4.s\n"
+ ".inst 0x44828099 // srshl z25.s, p0/M, z25.s, z4.s\n"
+ ".inst 0x4510a278 // sshllb z24.s, z19.h, #0x0\n"
+ ".inst 0x4510a677 // sshllt z23.s, z19.h, #0x0\n"
+ ".inst 0x44828098 // srshl z24.s, p0/M, z24.s, z4.s\n"
+ ".inst 0x44828097 // srshl z23.s, p0/M, z23.s, z4.s\n"
+ ".inst 0x4510a256 // sshllb z22.s, z18.h, #0x0\n"
+ ".inst 0x4510a655 // sshllt z21.s, z18.h, #0x0\n"
+ ".inst 0x44828096 // srshl z22.s, p0/M, z22.s, z4.s\n"
+ ".inst 0x44828095 // srshl z21.s, p0/M, z21.s, z4.s\n"
+ ".inst 0x4510a214 // sshllb z20.s, z16.h, #0x0\n"
+ ".inst 0x4510a613 // sshllt z19.s, z16.h, #0x0\n"
+ ".inst 0x44828094 // srshl z20.s, p0/M, z20.s, z4.s\n"
+ ".inst 0x44828093 // srshl z19.s, p0/M, z19.s, z4.s\n"
+ ".inst 0x04a37421 // sqrdmulh z1.s, z1.s, z3.s\n"
".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
+ ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
+ ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
+ ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a37610 // sqrdmulh z16.s, z16.s, z3.s\n"
- ".inst 0x44829020 // srshl z0.s, p4/M, z0.s, z1.s\n"
- ".inst 0x44829031 // srshl z17.s, p4/M, z17.s, z1.s\n"
- ".inst 0x4482903f // srshl z31.s, p4/M, z31.s, z1.s\n"
- ".inst 0x44829030 // srshl z16.s, p4/M, z16.s, z1.s\n"
- ".inst 0x4482905e // srshl z30.s, p4/M, z30.s, z2.s\n"
- ".inst 0x44829052 // srshl z18.s, p4/M, z18.s, z2.s\n"
- ".inst 0x4482905d // srshl z29.s, p4/M, z29.s, z2.s\n"
- ".inst 0x4482905c // srshl z28.s, p4/M, z28.s, z2.s\n"
+ ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
+ ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
- ".inst 0x04a37652 // sqrdmulh z18.s, z18.s, z3.s\n"
".inst 0x04a377bd // sqrdmulh z29.s, z29.s, z3.s\n"
+ ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
+ ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
".inst 0x04a3779c // sqrdmulh z28.s, z28.s, z3.s\n"
- ".inst 0x4482903e // srshl z30.s, p4/M, z30.s, z1.s\n"
- ".inst 0x44829032 // srshl z18.s, p4/M, z18.s, z1.s\n"
- ".inst 0x4482903d // srshl z29.s, p4/M, z29.s, z1.s\n"
- ".inst 0x4482903c // srshl z28.s, p4/M, z28.s, z1.s\n"
- ".inst 0x4482905b // srshl z27.s, p4/M, z27.s, z2.s\n"
- ".inst 0x4482905a // srshl z26.s, p4/M, z26.s, z2.s\n"
- ".inst 0x44829059 // srshl z25.s, p4/M, z25.s, z2.s\n"
- ".inst 0x44829058 // srshl z24.s, p4/M, z24.s, z2.s\n"
".inst 0x04a3777b // sqrdmulh z27.s, z27.s, z3.s\n"
+ ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
+ ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
".inst 0x04a3775a // sqrdmulh z26.s, z26.s, z3.s\n"
".inst 0x04a37739 // sqrdmulh z25.s, z25.s, z3.s\n"
+ ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
+ ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
".inst 0x04a37718 // sqrdmulh z24.s, z24.s, z3.s\n"
- ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
- ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
- ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
- ".inst 0x44829057 // srshl z23.s, p4/M, z23.s, z2.s\n"
- ".inst 0x44829056 // srshl z22.s, p4/M, z22.s, z2.s\n"
- ".inst 0x44829055 // srshl z21.s, p4/M, z21.s, z2.s\n"
- ".inst 0x44829054 // srshl z20.s, p4/M, z20.s, z2.s\n"
".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
+ ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
+ ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
".inst 0x04a376d6 // sqrdmulh z22.s, z22.s, z3.s\n"
".inst 0x04a376b5 // sqrdmulh z21.s, z21.s, z3.s\n"
+ ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
+ ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
".inst 0x04a37694 // sqrdmulh z20.s, z20.s, z3.s\n"
- ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
- ".inst 0x44829036 // srshl z22.s, p4/M, z22.s, z1.s\n"
- ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
- ".inst 0x44829034 // srshl z20.s, p4/M, z20.s, z1.s\n"
- "not z19.s, p4/M, z4.s\n"
- "smax z0.s, p4/M, z0.s, z19.s\n"
- "smax z17.s, p4/M, z17.s, z19.s\n"
- "smax z31.s, p4/M, z31.s, z19.s\n"
- "smax z16.s, p4/M, z16.s, z19.s\n"
- "smin z0.s, p4/M, z0.s, z4.s\n"
- "smin z17.s, p4/M, z17.s, z4.s\n"
- "smin z31.s, p4/M, z31.s, z4.s\n"
- "smin z16.s, p4/M, z16.s, z4.s\n"
- "smax z30.s, p4/M, z30.s, z19.s\n"
- "trn1 z17.h, z0.h, z17.h\n"
- "smax z18.s, p4/M, z18.s, z19.s\n"
- "trn1 z16.h, z31.h, z16.h\n"
- "smin z30.s, p4/M, z30.s, z4.s\n"
+ ".inst 0x04a37673 // sqrdmulh z19.s, z19.s, z3.s\n"
+ ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
+ ".inst 0x44828053 // srshl z19.s, p0/M, z19.s, z2.s\n"
+ "mov z18.s, #0x7f\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smax z31.s, p0/M, z31.s, z16.s\n"
+ "smax z30.s, p0/M, z30.s, z16.s\n"
+ "smax z29.s, p0/M, z29.s, z16.s\n"
+ "smax z28.s, p0/M, z28.s, z16.s\n"
+ "smax z27.s, p0/M, z27.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
+ "smax z25.s, p0/M, z25.s, z16.s\n"
+ "smax z24.s, p0/M, z24.s, z16.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z17.s, p0/M, z17.s, z18.s\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
+ "trn1 z17.h, z1.h, z17.h\n"
+ "smin z31.s, p0/M, z31.s, z18.s\n"
+ "smin z30.s, p0/M, z30.s, z18.s\n"
+ "trn1 z16.h, z0.h, z31.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z29.s, p0/M, z29.s, z18.s\n"
+ "smin z28.s, p0/M, z28.s, z18.s\n"
+ "trn1 z17.h, z30.h, z29.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "smin z27.s, p0/M, z27.s, z18.s\n"
+ "smin z26.s, p0/M, z26.s, z18.s\n"
+ "trn1 z16.h, z28.h, z27.h\n"
"trn1 z16.b, z17.b, z16.b\n"
+ "smin z25.s, p0/M, z25.s, z18.s\n"
+ "smin z24.s, p0/M, z24.s, z18.s\n"
+ "trn1 z17.h, z26.h, z25.h\n"
"st1b { z16.b }, p3, [%x[outptr], x28]\n"
- "smin z18.s, p4/M, z18.s, z4.s\n"
- "incb x28, ALL, MUL #4\n"
- "smax z29.s, p4/M, z29.s, z19.s\n"
- "smax z28.s, p4/M, z28.s, z19.s\n"
- "smax z27.s, p4/M, z27.s, z19.s\n"
- "smax z26.s, p4/M, z26.s, z19.s\n"
- "trn1 z18.h, z30.h, z18.h\n"
- "smin z29.s, p4/M, z29.s, z4.s\n"
- "smin z28.s, p4/M, z28.s, z4.s\n"
- "smin z27.s, p4/M, z27.s, z4.s\n"
- "smin z26.s, p4/M, z26.s, z4.s\n"
- "smax z25.s, p4/M, z25.s, z19.s\n"
- "trn1 z16.h, z29.h, z28.h\n"
- "smax z24.s, p4/M, z24.s, z19.s\n"
- "trn1 z17.h, z27.h, z26.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
+ "smin z23.s, p0/M, z23.s, z18.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "trn1 z16.h, z24.h, z23.h\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
"st1b { z16.b }, p2, [%x[outptr], x27]\n"
- "smin z25.s, p4/M, z25.s, z4.s\n"
- "incb x27, ALL, MUL #4\n"
- "smin z24.s, p4/M, z24.s, z4.s\n"
- "smax z23.s, p4/M, z23.s, z19.s\n"
- "smax z22.s, p4/M, z22.s, z19.s\n"
- "smax z21.s, p4/M, z21.s, z19.s\n"
- "smax z20.s, p4/M, z20.s, z19.s\n"
- "trn1 z16.h, z25.h, z24.h\n"
- "smin z23.s, p4/M, z23.s, z4.s\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
"trn1 z16.b, z17.b, z16.b\n"
"st1b { z16.b }, p1, [%x[outptr], x26]\n"
- "smin z22.s, p4/M, z22.s, z4.s\n"
"incb x26, ALL, MUL #4\n"
- "smin z21.s, p4/M, z21.s, z4.s\n"
- "smin z20.s, p4/M, z20.s, z4.s\n"
- "trn1 z17.h, z23.h, z22.h\n"
- "trn1 z16.h, z21.h, z20.h\n"
- "trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x25]\n"
- "incb x25, ALL, MUL #4\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "incb x9, ALL, MUL #4\n"
+ "incb x28, ALL, MUL #4\n"
+ "incb x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x80\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "smax z8.b, p4/M, z8.b, z19.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n smax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n smax z23.b, p4/M, z23.b, z0.b\n"
- "smax z19.b, p4/M, z19.b, z23.b\n"
- "smax z8.b, p4/M, z8.b, z19.b\n"
+ "movprfx z16, z4\n smax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n smax z17.b, p0/M, z17.b, z1.b\n"
+ "smax z16.b, p0/M, z16.b, z17.b\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "smax z8.b, p4/M, z8.b, z3.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "smax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z4.s, #0x7f\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- "ld1rw { z3.s }, p4/Z, [x19]\n"
".inst 0x4508a111 // sshllb z17.h, z8.b, #0x0\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508a510 // sshllt z16.h, z8.b, #0x0\n"
- "ld1rw { z2.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- ".inst 0x4510a220 // sshllb z0.s, z17.h, #0x0\n"
- "ld1rw { z1.s }, p4/Z, [x19]\n"
- ".inst 0x4510a631 // sshllt z17.s, z17.h, #0x0\n"
- ".inst 0x4510a21f // sshllb z31.s, z16.h, #0x0\n"
- ".inst 0x4510a610 // sshllt z16.s, z16.h, #0x0\n"
- ".inst 0x44829040 // srshl z0.s, p4/M, z0.s, z2.s\n"
- ".inst 0x44829051 // srshl z17.s, p4/M, z17.s, z2.s\n"
- ".inst 0x4482905f // srshl z31.s, p4/M, z31.s, z2.s\n"
- ".inst 0x44829050 // srshl z16.s, p4/M, z16.s, z2.s\n"
- ".inst 0x04a37400 // sqrdmulh z0.s, z0.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a37610 // sqrdmulh z16.s, z16.s, z3.s\n"
- ".inst 0x44829020 // srshl z0.s, p4/M, z0.s, z1.s\n"
- ".inst 0x44829031 // srshl z17.s, p4/M, z17.s, z1.s\n"
- ".inst 0x4482903f // srshl z31.s, p4/M, z31.s, z1.s\n"
- ".inst 0x44829030 // srshl z16.s, p4/M, z16.s, z1.s\n"
- "not z19.s, p4/M, z4.s\n"
- "smax z0.s, p4/M, z0.s, z19.s\n"
- "smax z17.s, p4/M, z17.s, z19.s\n"
- "smax z31.s, p4/M, z31.s, z19.s\n"
- "smax z16.s, p4/M, z16.s, z19.s\n"
- "smin z0.s, p4/M, z0.s, z4.s\n"
- "smin z17.s, p4/M, z17.s, z4.s\n"
- "smin z31.s, p4/M, z31.s, z4.s\n"
- "smin z16.s, p4/M, z16.s, z4.s\n"
- "trn1 z17.h, z0.h, z17.h\n"
- "trn1 z16.h, z31.h, z16.h\n"
+ ".inst 0x4508a512 // sshllt z18.h, z8.b, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a236 // sshllb z22.s, z17.h, #0x0\n"
+ ".inst 0x4510a635 // sshllt z21.s, z17.h, #0x0\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x4510a254 // sshllb z20.s, z18.h, #0x0\n"
+ ".inst 0x4510a653 // sshllt z19.s, z18.h, #0x0\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ ".inst 0x04b176d6 // sqrdmulh z22.s, z22.s, z17.s\n"
+ ".inst 0x04b176b5 // sqrdmulh z21.s, z21.s, z17.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x04b17694 // sqrdmulh z20.s, z20.s, z17.s\n"
+ ".inst 0x04b17673 // sqrdmulh z19.s, z19.s, z17.s\n"
+ "mov z18.s, #0x7f\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ "not z16.s, p0/M, z18.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "trn1 z17.h, z22.h, z21.h\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
+ "trn1 z16.h, z20.h, z19.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x28]\n"
- "incb x28\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp
index 299e55c9be..714530bc43 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-struct sve_u8_nhwc_avg_generic_depthfirst
+struct sve_u8_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_u8_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
sve_u8_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_u8_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
index 51a69a42be..f3f4950a1f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,11 +23,12 @@
*/
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -84,30 +85,31 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x26, #0x0\n"
- "cntb x25\n"
- "cntb x24, ALL, MUL #2\n"
- "cntb x23, ALL, MUL #3\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"whilelt p3.b, x26, %x[n_channels]\n"
"whilelt p2.b, x25, %x[n_channels]\n"
"whilelt p1.b, x24, %x[n_channels]\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
"mov z11.s, #0x0\n"
@@ -122,43 +124,43 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
"mov z2.s, #0x0\n"
"mov z1.s, #0x0\n"
"mov z0.s, #0x0\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
- "subs x22, x22, #0x1\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
@@ -198,219 +200,218 @@ void sve_u8_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508abf1 // ushllb z17.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- ".inst 0x4508aff0 // ushllt z16.h, z31.b, #0x0\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
- ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
- ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
- ".inst 0x4508abb0 // ushllb z16.h, z29.b, #0x0\n"
- ".inst 0x4590496b // uaddwb z11.s, z11.s, z16.h\n"
- ".inst 0x45904d4a // uaddwt z10.s, z10.s, z16.h\n"
- ".inst 0x4508afb0 // ushllt z16.h, z29.b, #0x0\n"
- ".inst 0x45904929 // uaddwb z9.s, z9.s, z16.h\n"
- ".inst 0x45904d08 // uaddwt z8.s, z8.s, z16.h\n"
- ".inst 0x4508ab70 // ushllb z16.h, z27.b, #0x0\n"
- ".inst 0x459048e7 // uaddwb z7.s, z7.s, z16.h\n"
- ".inst 0x45904cc6 // uaddwt z6.s, z6.s, z16.h\n"
- ".inst 0x4508af70 // ushllt z16.h, z27.b, #0x0\n"
- ".inst 0x459048a5 // uaddwb z5.s, z5.s, z16.h\n"
- ".inst 0x45904c84 // uaddwt z4.s, z4.s, z16.h\n"
- ".inst 0x4508ab30 // ushllb z16.h, z25.b, #0x0\n"
- ".inst 0x45904863 // uaddwb z3.s, z3.s, z16.h\n"
- ".inst 0x45904c42 // uaddwt z2.s, z2.s, z16.h\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z20.s, #0x0\n"
- "ld1rw { z17.s }, p4/Z, [%x[rescale_ptr]]\n"
- "mov z19.s, #0xff\n"
- "ld1rw { z16.s }, p4/Z, [%x[shift_ptr]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
".inst 0x04b1756b // sqdmulh z11.s, z11.s, z17.s\n"
".inst 0x04b1754a // sqdmulh z10.s, z10.s, z17.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
".inst 0x04b17529 // sqdmulh z9.s, z9.s, z17.s\n"
".inst 0x04b17508 // sqdmulh z8.s, z8.s, z17.s\n"
+ ".inst 0x4482820b // srshl z11.s, p0/M, z11.s, z16.s\n"
+ ".inst 0x4482820a // srshl z10.s, p0/M, z10.s, z16.s\n"
".inst 0x04b174e7 // sqdmulh z7.s, z7.s, z17.s\n"
".inst 0x04b174c6 // sqdmulh z6.s, z6.s, z17.s\n"
+ ".inst 0x44828209 // srshl z9.s, p0/M, z9.s, z16.s\n"
+ ".inst 0x44828208 // srshl z8.s, p0/M, z8.s, z16.s\n"
".inst 0x04b174a5 // sqdmulh z5.s, z5.s, z17.s\n"
".inst 0x04b17484 // sqdmulh z4.s, z4.s, z17.s\n"
+ ".inst 0x44828207 // srshl z7.s, p0/M, z7.s, z16.s\n"
+ ".inst 0x44828206 // srshl z6.s, p0/M, z6.s, z16.s\n"
".inst 0x04b17463 // sqdmulh z3.s, z3.s, z17.s\n"
".inst 0x04b17442 // sqdmulh z2.s, z2.s, z17.s\n"
+ ".inst 0x44828205 // srshl z5.s, p0/M, z5.s, z16.s\n"
+ ".inst 0x44828204 // srshl z4.s, p0/M, z4.s, z16.s\n"
".inst 0x04b17421 // sqdmulh z1.s, z1.s, z17.s\n"
".inst 0x04b17400 // sqdmulh z0.s, z0.s, z17.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- ".inst 0x4482920b // srshl z11.s, p4/M, z11.s, z16.s\n"
- ".inst 0x4482920a // srshl z10.s, p4/M, z10.s, z16.s\n"
- ".inst 0x44829209 // srshl z9.s, p4/M, z9.s, z16.s\n"
- ".inst 0x44829208 // srshl z8.s, p4/M, z8.s, z16.s\n"
- ".inst 0x44829207 // srshl z7.s, p4/M, z7.s, z16.s\n"
- ".inst 0x44829206 // srshl z6.s, p4/M, z6.s, z16.s\n"
- ".inst 0x44829205 // srshl z5.s, p4/M, z5.s, z16.s\n"
- ".inst 0x44829204 // srshl z4.s, p4/M, z4.s, z16.s\n"
- ".inst 0x44829203 // srshl z3.s, p4/M, z3.s, z16.s\n"
- ".inst 0x44829202 // srshl z2.s, p4/M, z2.s, z16.s\n"
- ".inst 0x44829201 // srshl z1.s, p4/M, z1.s, z16.s\n"
- ".inst 0x44829200 // srshl z0.s, p4/M, z0.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z20.s\n"
- "smax z14.s, p4/M, z14.s, z20.s\n"
- "smax z13.s, p4/M, z13.s, z20.s\n"
- "smax z12.s, p4/M, z12.s, z20.s\n"
- "smin z15.s, p4/M, z15.s, z19.s\n"
- "smin z14.s, p4/M, z14.s, z19.s\n"
- "smin z13.s, p4/M, z13.s, z19.s\n"
- "smin z12.s, p4/M, z12.s, z19.s\n"
- "smax z11.s, p4/M, z11.s, z20.s\n"
+ ".inst 0x44828203 // srshl z3.s, p0/M, z3.s, z16.s\n"
+ ".inst 0x44828202 // srshl z2.s, p0/M, z2.s, z16.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "mov z18.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smax z10.s, p4/M, z10.s, z20.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smin z11.s, p4/M, z11.s, z19.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z10.s, p4/M, z10.s, z19.s\n"
- "incb x26, ALL, MUL #4\n"
- "smax z9.s, p4/M, z9.s, z20.s\n"
- "smax z8.s, p4/M, z8.s, z20.s\n"
- "smax z7.s, p4/M, z7.s, z20.s\n"
- "smax z6.s, p4/M, z6.s, z20.s\n"
- "trn1 z18.h, z11.h, z10.h\n"
- "smin z9.s, p4/M, z9.s, z19.s\n"
- "smin z8.s, p4/M, z8.s, z19.s\n"
- "smin z7.s, p4/M, z7.s, z19.s\n"
- "smin z6.s, p4/M, z6.s, z19.s\n"
- "smax z5.s, p4/M, z5.s, z20.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "trn1 z17.h, z11.h, z10.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "smax z4.s, p4/M, z4.s, z20.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
"trn1 z17.h, z7.h, z6.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z5.s, p4/M, z5.s, z19.s\n"
- "incb x25, ALL, MUL #4\n"
- "smin z4.s, p4/M, z4.s, z19.s\n"
- "smax z3.s, p4/M, z3.s, z20.s\n"
- "smax z2.s, p4/M, z2.s, z20.s\n"
- "smax z1.s, p4/M, z1.s, z20.s\n"
- "smax z0.s, p4/M, z0.s, z20.s\n"
+ "st1b { z16.b }, p3, [%x[outptr], x26]\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
"trn1 z16.h, z5.h, z4.h\n"
- "smin z3.s, p4/M, z3.s, z19.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "smin z2.s, p4/M, z2.s, z19.s\n"
- "incb x24, ALL, MUL #4\n"
- "smin z1.s, p4/M, z1.s, z19.s\n"
- "smin z0.s, p4/M, z0.s, z19.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
"trn1 z17.h, z3.h, z2.h\n"
+ "st1b { z16.b }, p2, [%x[outptr], x25]\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z16.h, z1.h, z0.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x23]\n"
- "incb x23, ALL, MUL #4\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "incb x27, ALL, MUL #4\n"
+ "incb x26, ALL, MUL #4\n"
+ "incb x25, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z15.s, #0x0\n"
- "mov x19, %x[inptrs]\n"
"mov z14.s, #0x0\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
+ "mov x22, %x[inptrs]\n"
"mov z13.s, #0x0\n"
"mov z12.s, #0x0\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- "subs x22, x22, #0x1\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508abf1 // ushllb z17.h, z31.b, #0x0\n"
- ".inst 0x4508aff0 // ushllt z16.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z20.s, #0x0\n"
- "ld1rw { z17.s }, p4/Z, [%x[rescale_ptr]]\n"
- "mov z19.s, #0xff\n"
- "ld1rw { z16.s }, p4/Z, [%x[shift_ptr]]\n"
+ "ld1rw { z17.s }, p0/Z, [%x[rescale_ptr]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[shift_ptr]]\n"
".inst 0x04b175ef // sqdmulh z15.s, z15.s, z17.s\n"
".inst 0x04b175ce // sqdmulh z14.s, z14.s, z17.s\n"
".inst 0x04b175ad // sqdmulh z13.s, z13.s, z17.s\n"
".inst 0x04b1758c // sqdmulh z12.s, z12.s, z17.s\n"
- ".inst 0x4482920f // srshl z15.s, p4/M, z15.s, z16.s\n"
- ".inst 0x4482920e // srshl z14.s, p4/M, z14.s, z16.s\n"
- ".inst 0x4482920d // srshl z13.s, p4/M, z13.s, z16.s\n"
- ".inst 0x4482920c // srshl z12.s, p4/M, z12.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z20.s\n"
- "smax z14.s, p4/M, z14.s, z20.s\n"
- "smax z13.s, p4/M, z13.s, z20.s\n"
- "smax z12.s, p4/M, z12.s, z20.s\n"
- "smin z15.s, p4/M, z15.s, z19.s\n"
- "smin z14.s, p4/M, z14.s, z19.s\n"
- "smin z13.s, p4/M, z13.s, z19.s\n"
- "smin z12.s, p4/M, z12.s, z19.s\n"
+ ".inst 0x4482820f // srshl z15.s, p0/M, z15.s, z16.s\n"
+ ".inst 0x4482820e // srshl z14.s, p0/M, z14.s, z16.s\n"
+ ".inst 0x4482820d // srshl z13.s, p0/M, z13.s, z16.s\n"
+ ".inst 0x4482820c // srshl z12.s, p0/M, z12.s, z16.s\n"
+ "mov z17.s, #0x0\n"
+ "mov z16.s, #0xff\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
"trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "incb x26\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr), [rescale_ptr] "r" (&rescale_value), [shift_ptr] "r" (&shift_value)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
index 06df1515ad..eae83b99fe 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,28 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
-struct sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst
+struct sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst : public DepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
+ using Parent = DepthfirstStrategy<uint8_t, uint8_t>;
- typedef void (*kern_type)(unsigned int, const uint8_t *const *const, uint8_t *const *const, bool, unsigned int, unsigned int, unsigned int, unsigned int);
+ const static auto pooling_type = PoolingType::MAX;
+ const static auto pool_rows = 2u, pool_cols = 2u;
+ const static auto stride_rows = 1u, stride_cols = 1u;
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
+ sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *)
+ : Parent(pool_rows, pool_cols, stride_rows, stride_cols, 2, 2) {}
- constexpr static unsigned int pool_rows(void) { return 2; }
- constexpr static unsigned int pool_cols(void) { return 2; }
-
- constexpr static unsigned int stride_rows(void) { return 1; }
- constexpr static unsigned int stride_cols(void) { return 1; }
-
- constexpr static unsigned int out_rows(void) { return 2; }
- constexpr static unsigned int out_cols(void) { return 2; }
-
- kern_type kernel = sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl;
-
- sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const { return sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
index e921f345d5..8612555bfb 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -63,84 +63,84 @@ void sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst_impl(
pad_left, pad_top, pad_right, pad_bottom);
__asm__ __volatile__(
- "ldr x14, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x15, [%x[args], %[offsetof_n_channels]]\n"
+ "ldr x21, [%x[args], %[offsetof_outptrs]]\n"
+ "mov x14, #0x0\n"
+ "whilelt p0.b, x14, x15\n"
+ "ldr x20, [%x[args], %[offsetof_inptrs]]\n"
+ "ldp x13, x12, [x21, #0x0]\n"
"ptrue p2.b\n"
- "ldr x20, [%x[args], %[offsetof_outptrs]]\n"
- "mov x13, #0x0\n"
- "ldr x19, [%x[args], %[offsetof_inptrs]]\n"
- "mov x12, #0x0\n"
- "ldp x11, x10, [x20, #0x0]\n"
- "whilelt p1.b, x13, x14\n"
- "ldp x9, x28, [x20, #0x10]\n"
- "ldp x27, x26, [x19, #0x0]\n"
- "ldp x25, x24, [x19, #0x10]\n"
- "ldp x23, x22, [x19, #0x20]\n"
- "ldp x21, x20, [x19, #0x30]\n"
- "ldr x19, [x19, #0x40]\n"
- "ld1b { z31.b }, p1/Z, [x26, x13]\n"
- "ld1b { z30.b }, p1/Z, [x23, x13]\n"
- "ld1b { z29.b }, p1/Z, [x20, x13]\n"
- "ld1b { z28.b }, p1/Z, [x24, x13]\n"
- "ld1b { z27.b }, p1/Z, [x27, x13]\n"
- "ld1b { z26.b }, p1/Z, [x22, x13]\n"
- "ld1b { z25.b }, p1/Z, [x25, x13]\n"
- "ld1b { z24.b }, p1/Z, [x21, x13]\n"
- "ld1b { z23.b }, p1/Z, [x19, x13]\n"
- "incw x13\n"
- "whilelt p1.b, x13, x14\n"
+ "mov x11, #0x0\n"
+ "ldp x10, x9, [x21, #0x10]\n"
+ "ldp x28, x27, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x24, x23, [x20, #0x20]\n"
+ "ldp x22, x21, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x40]\n"
+ "ld1b { z31.b }, p0/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p0/Z, [x24, x14]\n"
+ "ld1b { z29.b }, p0/Z, [x21, x14]\n"
+ "ld1b { z28.b }, p0/Z, [x25, x14]\n"
+ "ld1b { z27.b }, p0/Z, [x28, x14]\n"
+ "ld1b { z26.b }, p0/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p0/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p0/Z, [x22, x14]\n"
+ "ld1b { z23.b }, p0/Z, [x20, x14]\n"
+ "incw x14\n"
+ "whilelt p1.b, x14, x15\n"
"b.none 2f\n"
"1:" // Vector: Loop
"movprfx z22, z31\n umax z22.b, p2/M, z22.b, z30.b\n"
- "ld1b { z31.b }, p1/Z, [x26, x13]\n"
- "whilelt p0.b, x12, x14\n"
"movprfx z21, z30\n umax z21.b, p2/M, z21.b, z29.b\n"
- "ld1b { z30.b }, p1/Z, [x23, x13]\n"
- "movprfx z18, z28\n umax z18.b, p2/M, z18.b, z27.b\n"
- "ld1b { z29.b }, p1/Z, [x20, x13]\n"
- "movprfx z17, z26\n umax z17.b, p2/M, z17.b, z25.b\n"
- "ld1b { z27.b }, p1/Z, [x27, x13]\n"
- "movprfx z16, z24\n umax z16.b, p2/M, z16.b, z28.b\n"
- "ld1b { z28.b }, p1/Z, [x24, x13]\n"
- "movprfx z20, z26\n umax z20.b, p2/M, z20.b, z23.b\n"
- "ld1b { z26.b }, p1/Z, [x22, x13]\n"
- "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
- "ld1b { z25.b }, p1/Z, [x25, x13]\n"
- "movprfx z18, z22\n umax z18.b, p2/M, z18.b, z17.b\n"
- "ld1b { z24.b }, p1/Z, [x21, x13]\n"
- "movprfx z17, z21\n umax z17.b, p2/M, z17.b, z16.b\n"
- "ld1b { z23.b }, p1/Z, [x19, x13]\n"
- "incw x13\n"
- "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z19.b }, p0, [x11, x12]\n"
- "whilelt p1.b, x13, x14\n"
- "st1b { z18.b }, p0, [x10, x12]\n"
- "st1b { z17.b }, p0, [x9, x12]\n"
- "st1b { z16.b }, p0, [x28, x12]\n"
- "incw x12\n"
+ "ld1b { z31.b }, p1/Z, [x27, x14]\n"
+ "ld1b { z30.b }, p1/Z, [x24, x14]\n"
+ "movprfx z20, z28\n umax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n umax z19.b, p2/M, z19.b, z25.b\n"
+ "ld1b { z29.b }, p1/Z, [x21, x14]\n"
+ "ld1b { z27.b }, p1/Z, [x28, x14]\n"
+ "movprfx z17, z28\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z23.b\n"
+ "ld1b { z28.b }, p1/Z, [x25, x14]\n"
+ "ld1b { z26.b }, p1/Z, [x26, x14]\n"
+ "ld1b { z25.b }, p1/Z, [x23, x14]\n"
+ "ld1b { z24.b }, p1/Z, [x22, x14]\n"
+ "whilelt p0.b, x11, x15\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "ld1b { z23.b }, p1/Z, [x20, x14]\n"
+ "incw x14\n"
+ "whilelt p1.b, x14, x15\n"
+ "st1b { z16.b }, p0, [x13, x11]\n"
+ "movprfx z16, z19\n umax z16.b, p2/M, z16.b, z22.b\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
+ "st1b { z16.b }, p0, [x12, x11]\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z18.b\n"
+ "st1b { z17.b }, p0, [x10, x11]\n"
+ "st1b { z16.b }, p0, [x9, x11]\n"
+ "incw x11\n"
"b.any 1b\n"
"2:" // Vector: Tail
"movprfx z22, z31\n umax z22.b, p2/M, z22.b, z30.b\n"
- "whilelt p0.b, x12, x14\n"
"movprfx z21, z30\n umax z21.b, p2/M, z21.b, z29.b\n"
- "movprfx z18, z28\n umax z18.b, p2/M, z18.b, z27.b\n"
- "movprfx z17, z26\n umax z17.b, p2/M, z17.b, z25.b\n"
- "movprfx z16, z24\n umax z16.b, p2/M, z16.b, z28.b\n"
- "movprfx z20, z26\n umax z20.b, p2/M, z20.b, z23.b\n"
- "movprfx z19, z22\n umax z19.b, p2/M, z19.b, z18.b\n"
- "st1b { z19.b }, p0, [x11, x12]\n"
- "movprfx z18, z22\n umax z18.b, p2/M, z18.b, z17.b\n"
- "movprfx z17, z21\n umax z17.b, p2/M, z17.b, z16.b\n"
- "st1b { z18.b }, p0, [x10, x12]\n"
- "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z20.b\n"
- "st1b { z17.b }, p0, [x9, x12]\n"
- "st1b { z16.b }, p0, [x28, x12]\n"
+ "movprfx z20, z28\n umax z20.b, p2/M, z20.b, z27.b\n"
+ "movprfx z19, z26\n umax z19.b, p2/M, z19.b, z25.b\n"
+ "movprfx z17, z28\n umax z17.b, p2/M, z17.b, z24.b\n"
+ "movprfx z18, z25\n umax z18.b, p2/M, z18.b, z23.b\n"
+ "whilelt p0.b, x11, x15\n"
+ "movprfx z16, z22\n umax z16.b, p2/M, z16.b, z20.b\n"
+ "st1b { z16.b }, p0, [x13, x11]\n"
+ "movprfx z16, z19\n umax z16.b, p2/M, z16.b, z22.b\n"
+ "umax z17.b, p2/M, z17.b, z21.b\n"
+ "st1b { z16.b }, p0, [x12, x11]\n"
+ "movprfx z16, z21\n umax z16.b, p2/M, z16.b, z18.b\n"
+ "st1b { z17.b }, p0, [x10, x11]\n"
+ "st1b { z16.b }, p0, [x9, x11]\n"
:
: [args] "r" (&args), [offsetof_inptrs] "I" (offsetof(KernelArgs, inptrs)), [offsetof_n_channels] "I" (offsetof(KernelArgs, n_channels)), [offsetof_outptrs] "I" (offsetof(KernelArgs, outptrs))
- : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp
index 59cd4b9c78..9f3c3a435d 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-struct sve_u8_nhwc_max_generic_depthfirst
+struct sve_u8_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_u8_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t>;
sve_u8_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_u8_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
index 164847480b..be0eb398ae 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,8 +23,9 @@
*/
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -39,185 +40,184 @@ void sve_u8_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntb x27\n"
- "cntb x26, ALL, MUL #2\n"
- "cntb x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"whilelt p3.b, x28, %x[n_channels]\n"
"whilelt p2.b, x27, %x[n_channels]\n"
"whilelt p1.b, x26, %x[n_channels]\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x0\n"
"mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
+ "mov x24, %x[inptrs]\n"
"mov z6.b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
"mov z5.b, #0x0\n"
- "mov z4.b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "umax z21.b, p4/M, z21.b, z26.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "umax z16.b, p4/M, z16.b, z25.b\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "umax z20.b, p4/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "umax z18.b, p4/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "umax z17.b, p4/M, z17.b, z21.b\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "umax z16.b, p4/M, z16.b, z20.b\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "umax z7.b, p4/M, z7.b, z19.b\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "umax z6.b, p4/M, z6.b, z18.b\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "umax z5.b, p4/M, z5.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "umax z4.b, p4/M, z4.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
+ "umax z22.b, p0/M, z22.b, z30.b\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
+ "umax z21.b, p0/M, z21.b, z27.b\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z19.b\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z18.b\n"
+ "umax z6.b, p0/M, z6.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
- "umax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
- "umax z21.b, p4/M, z21.b, z26.b\n"
- "umax z16.b, p4/M, z16.b, z25.b\n"
- "umax z20.b, p4/M, z20.b, z24.b\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "umax z18.b, p4/M, z18.b, z22.b\n"
- "umax z17.b, p4/M, z17.b, z21.b\n"
- "umax z16.b, p4/M, z16.b, z20.b\n"
- "umax z7.b, p4/M, z7.b, z19.b\n"
- "umax z6.b, p4/M, z6.b, z18.b\n"
- "umax z5.b, p4/M, z5.b, z17.b\n"
- "umax z4.b, p4/M, z4.b, z16.b\n"
+ "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
+ "umax z22.b, p0/M, z22.b, z30.b\n"
+ "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
+ "umax z21.b, p0/M, z21.b, z27.b\n"
+ "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "umax z8.b, p0/M, z8.b, z19.b\n"
+ "umax z7.b, p0/M, z7.b, z18.b\n"
+ "umax z6.b, p0/M, z6.b, z17.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z7.b, p4/M, z7.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "umax z6.b, p4/M, z6.b, z31.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "umax z5.b, p4/M, z5.b, z28.b\n"
- "umax z4.b, p4/M, z4.b, z16.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z17.b\n"
+ "umax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
+ "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "incb x9, ALL, MUL #4\n"
"st1b { z7.b }, p3, [%x[outptr], x28]\n"
"incb x28, ALL, MUL #4\n"
"st1b { z6.b }, p2, [%x[outptr], x27]\n"
"incb x27, ALL, MUL #4\n"
"st1b { z5.b }, p1, [%x[outptr], x26]\n"
"incb x26, ALL, MUL #4\n"
- "st1b { z4.b }, p0, [%x[outptr], x25]\n"
- "incb x25, ALL, MUL #4\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z7.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "umax z7.b, p4/M, z7.b, z19.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "umax z7.b, p4/M, z7.b, z19.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z7.b, p4/M, z7.b, z3.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "st1b { z7.b }, p3, [%x[outptr], x28]\n"
- "incb x28\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "st1b { z8.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [outptr] "r" (outptr)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp
index f6fc1a58c1..f9d25a1b45 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8q_nhwc_avg_generic_depthfirst_impl(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-struct sve_u8q_nhwc_avg_generic_depthfirst
+struct sve_u8q_nhwc_avg_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t window_cells, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::AVERAGE; }
-
-
- kern_type kernel = sve_u8q_nhwc_avg_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
sve_u8q_nhwc_avg_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_u8q_nhwc_avg_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
index 373848ad2b..e8339a2cd9 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_avg_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,12 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
#include <cstring>
#include <cmath>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -86,12 +87,13 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
f_rescale_value *= 2.0f;
}
- rescale_value = static_cast<int32_t>(round(f_rescale_value * static_cast<float>(1ll << 31)));
- if (static_cast<int64_t>(rescale_value) == (1ll << 31))
+ int64_t long_rescale_value = round(f_rescale_value * static_cast<float>(1ll << 31));
+ if (long_rescale_value == (1ll << 31))
{
shift_value++;
- rescale_value >>= 1;
+ long_rescale_value >>= 1;
}
+ rescale_value = static_cast<int32_t>(long_rescale_value);
}
@@ -117,24 +119,24 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
);
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x26, #0x0\n"
- "cntb x25\n"
- "cntb x24, ALL, MUL #2\n"
- "cntb x23, ALL, MUL #3\n"
+ "mov x27, #0x0\n"
+ "cntb x26\n"
+ "cntb x25, ALL, MUL #2\n"
+ "cntb x24, ALL, MUL #3\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"whilelt p3.b, x26, %x[n_channels]\n"
"whilelt p2.b, x25, %x[n_channels]\n"
"whilelt p1.b, x24, %x[n_channels]\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "ld1rw { z15.s }, p4/Z, [%x[accumulator_init]]\n"
+ "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z14.d, z15.d\n"
- "mov x19, %x[inptrs]\n"
"mov z13.d, z15.d\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"mov z12.d, z15.d\n"
"mov z11.d, z15.d\n"
+ "mov x22, %x[inptrs]\n"
"mov z10.d, z15.d\n"
"mov z9.d, z15.d\n"
"mov z8.d, z15.d\n"
@@ -146,43 +148,43 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
"mov z2.d, z15.d\n"
"mov z1.d, z15.d\n"
"mov z0.d, z15.d\n"
- "cbz x22, 4f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "cbz x23, 4f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 2 inputs loop
".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
".inst 0x455c0bb5 // uaddlb z21.h, z29.b, z28.b\n"
- "subs x22, x22, #0x1\n"
".inst 0x455c0fb4 // uaddlt z20.h, z29.b, z28.b\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
".inst 0x455a0b73 // uaddlb z19.h, z27.b, z26.b\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
".inst 0x455a0f72 // uaddlt z18.h, z27.b, z26.b\n"
- "ld1b { z28.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
+ "ld1b { z29.b }, p3/Z, [x21, x26]\n"
".inst 0x45580b31 // uaddlb z17.h, z25.b, z24.b\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
".inst 0x45580f30 // uaddlt z16.h, z25.b, z24.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x24]\n"
+ "ld1b { z28.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z27.b }, p2/Z, [x21, x25]\n"
".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z24.b }, p0/Z, [x20, x23]\n"
+ "ld1b { z26.b }, p2/Z, [x20, x25]\n"
+ "ld1b { z25.b }, p1/Z, [x21, x24]\n"
".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ "ld1b { z24.b }, p1/Z, [x20, x24]\n"
".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
@@ -222,265 +224,264 @@ void sve_u8q_nhwc_avg_generic_depthfirst_impl(
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508abf1 // ushllb z17.h, z31.b, #0x0\n"
- "ld1b { z29.b }, p2/Z, [x21, x25]\n"
- ".inst 0x4508aff0 // ushllt z16.h, z31.b, #0x0\n"
- "ld1b { z27.b }, p1/Z, [x21, x24]\n"
- ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
- "ld1b { z25.b }, p0/Z, [x21, x23]\n"
- ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
- ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
- ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
- ".inst 0x4508abb0 // ushllb z16.h, z29.b, #0x0\n"
- ".inst 0x4590496b // uaddwb z11.s, z11.s, z16.h\n"
- ".inst 0x45904d4a // uaddwt z10.s, z10.s, z16.h\n"
- ".inst 0x4508afb0 // ushllt z16.h, z29.b, #0x0\n"
- ".inst 0x45904929 // uaddwb z9.s, z9.s, z16.h\n"
- ".inst 0x45904d08 // uaddwt z8.s, z8.s, z16.h\n"
- ".inst 0x4508ab70 // ushllb z16.h, z27.b, #0x0\n"
- ".inst 0x459048e7 // uaddwb z7.s, z7.s, z16.h\n"
- ".inst 0x45904cc6 // uaddwt z6.s, z6.s, z16.h\n"
- ".inst 0x4508af70 // ushllt z16.h, z27.b, #0x0\n"
- ".inst 0x459048a5 // uaddwb z5.s, z5.s, z16.h\n"
- ".inst 0x45904c84 // uaddwt z4.s, z4.s, z16.h\n"
- ".inst 0x4508ab30 // ushllb z16.h, z25.b, #0x0\n"
- ".inst 0x45904863 // uaddwb z3.s, z3.s, z16.h\n"
- ".inst 0x45904c42 // uaddwt z2.s, z2.s, z16.h\n"
- ".inst 0x4508af30 // ushllt z16.h, z25.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa17 // ushllb z23.h, z16.b, #0x0\n"
+ ".inst 0x4508ae16 // ushllt z22.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p3/Z, [x20, x26]\n"
+ "ld1b { z17.b }, p2/Z, [x20, x25]\n"
+ ".inst 0x4508aa15 // ushllb z21.h, z16.b, #0x0\n"
+ ".inst 0x4508ae14 // ushllt z20.h, z16.b, #0x0\n"
+ "ld1b { z16.b }, p1/Z, [x20, x24]\n"
+ ".inst 0x4508aa33 // ushllb z19.h, z17.b, #0x0\n"
+ ".inst 0x4508ae32 // ushllt z18.h, z17.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
+ ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
+ ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
+ ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x4595496b // uaddwb z11.s, z11.s, z21.h\n"
+ ".inst 0x45954d4a // uaddwt z10.s, z10.s, z21.h\n"
+ ".inst 0x45944929 // uaddwb z9.s, z9.s, z20.h\n"
+ ".inst 0x45944d08 // uaddwt z8.s, z8.s, z20.h\n"
+ ".inst 0x459348e7 // uaddwb z7.s, z7.s, z19.h\n"
+ ".inst 0x45934cc6 // uaddwt z6.s, z6.s, z19.h\n"
+ ".inst 0x459248a5 // uaddwb z5.s, z5.s, z18.h\n"
+ ".inst 0x45924c84 // uaddwt z4.s, z4.s, z18.h\n"
+ ".inst 0x45914863 // uaddwb z3.s, z3.s, z17.h\n"
+ ".inst 0x45914c42 // uaddwt z2.s, z2.s, z17.h\n"
".inst 0x45904821 // uaddwb z1.s, z1.s, z16.h\n"
".inst 0x45904c00 // uaddwt z0.s, z0.s, z16.h\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z21.s, #0x0\n"
- "ld1rw { z20.s }, p4/Z, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "mov z19.s, #0xff\n"
- "ld1rw { z18.s }, p4/Z, [%x[left_shift]]\n"
- "ld1rw { z17.s }, p4/Z, [%x[right_shift]]\n"
- ".inst 0x4482924f // srshl z15.s, p4/M, z15.s, z18.s\n"
- "ld1rw { z16.s }, p4/Z, [x19]\n"
- ".inst 0x4482924e // srshl z14.s, p4/M, z14.s, z18.s\n"
- ".inst 0x4482924d // srshl z13.s, p4/M, z13.s, z18.s\n"
- ".inst 0x4482924c // srshl z12.s, p4/M, z12.s, z18.s\n"
- ".inst 0x4482924b // srshl z11.s, p4/M, z11.s, z18.s\n"
- ".inst 0x04b475ef // sqrdmulh z15.s, z15.s, z20.s\n"
- ".inst 0x04b475ce // sqrdmulh z14.s, z14.s, z20.s\n"
- ".inst 0x04b475ad // sqrdmulh z13.s, z13.s, z20.s\n"
- ".inst 0x04b4758c // sqrdmulh z12.s, z12.s, z20.s\n"
- ".inst 0x04b4756b // sqrdmulh z11.s, z11.s, z20.s\n"
- ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
- ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
- ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
+ "ld1rw { z18.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482824f // srshl z15.s, p0/M, z15.s, z18.s\n"
+ ".inst 0x4482824e // srshl z14.s, p0/M, z14.s, z18.s\n"
+ ".inst 0x4482824d // srshl z13.s, p0/M, z13.s, z18.s\n"
+ ".inst 0x4482824c // srshl z12.s, p0/M, z12.s, z18.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x4482824b // srshl z11.s, p0/M, z11.s, z18.s\n"
+ ".inst 0x4482824a // srshl z10.s, p0/M, z10.s, z18.s\n"
+ ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
+ ".inst 0x44828249 // srshl z9.s, p0/M, z9.s, z18.s\n"
+ ".inst 0x44828248 // srshl z8.s, p0/M, z8.s, z18.s\n"
+ ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
+ ".inst 0x04b0756b // sqrdmulh z11.s, z11.s, z16.s\n"
+ ".inst 0x44828247 // srshl z7.s, p0/M, z7.s, z18.s\n"
+ ".inst 0x44828246 // srshl z6.s, p0/M, z6.s, z18.s\n"
+ ".inst 0x04b0754a // sqrdmulh z10.s, z10.s, z16.s\n"
+ ".inst 0x04b07529 // sqrdmulh z9.s, z9.s, z16.s\n"
+ ".inst 0x44828245 // srshl z5.s, p0/M, z5.s, z18.s\n"
+ ".inst 0x44828244 // srshl z4.s, p0/M, z4.s, z18.s\n"
+ ".inst 0x04b07508 // sqrdmulh z8.s, z8.s, z16.s\n"
+ ".inst 0x04b074e7 // sqrdmulh z7.s, z7.s, z16.s\n"
+ ".inst 0x44828243 // srshl z3.s, p0/M, z3.s, z18.s\n"
+ ".inst 0x44828242 // srshl z2.s, p0/M, z2.s, z18.s\n"
+ ".inst 0x04b074c6 // sqrdmulh z6.s, z6.s, z16.s\n"
+ ".inst 0x04b074a5 // sqrdmulh z5.s, z5.s, z16.s\n"
+ ".inst 0x44828241 // srshl z1.s, p0/M, z1.s, z18.s\n"
+ ".inst 0x44828240 // srshl z0.s, p0/M, z0.s, z18.s\n"
+ ".inst 0x04b07484 // sqrdmulh z4.s, z4.s, z16.s\n"
+ ".inst 0x04b07463 // sqrdmulh z3.s, z3.s, z16.s\n"
+ ".inst 0x04b07442 // sqrdmulh z2.s, z2.s, z16.s\n"
+ ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x04b07400 // sqrdmulh z0.s, z0.s, z16.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
+ ".inst 0x4482822b // srshl z11.s, p0/M, z11.s, z17.s\n"
"add z15.s, z15.s, z16.s\n"
"add z14.s, z14.s, z16.s\n"
+ ".inst 0x4482822a // srshl z10.s, p0/M, z10.s, z17.s\n"
+ ".inst 0x44828229 // srshl z9.s, p0/M, z9.s, z17.s\n"
"add z13.s, z13.s, z16.s\n"
"add z12.s, z12.s, z16.s\n"
- ".inst 0x4482922b // srshl z11.s, p4/M, z11.s, z17.s\n"
- ".inst 0x4482924a // srshl z10.s, p4/M, z10.s, z18.s\n"
- ".inst 0x44829249 // srshl z9.s, p4/M, z9.s, z18.s\n"
- ".inst 0x44829248 // srshl z8.s, p4/M, z8.s, z18.s\n"
+ ".inst 0x44828228 // srshl z8.s, p0/M, z8.s, z17.s\n"
+ ".inst 0x44828227 // srshl z7.s, p0/M, z7.s, z17.s\n"
"add z11.s, z11.s, z16.s\n"
- ".inst 0x04b4754a // sqrdmulh z10.s, z10.s, z20.s\n"
- ".inst 0x04b47529 // sqrdmulh z9.s, z9.s, z20.s\n"
- ".inst 0x04b47508 // sqrdmulh z8.s, z8.s, z20.s\n"
- ".inst 0x44829247 // srshl z7.s, p4/M, z7.s, z18.s\n"
- ".inst 0x4482922a // srshl z10.s, p4/M, z10.s, z17.s\n"
- ".inst 0x44829229 // srshl z9.s, p4/M, z9.s, z17.s\n"
- ".inst 0x44829228 // srshl z8.s, p4/M, z8.s, z17.s\n"
- ".inst 0x04b474e7 // sqrdmulh z7.s, z7.s, z20.s\n"
"add z10.s, z10.s, z16.s\n"
+ ".inst 0x44828226 // srshl z6.s, p0/M, z6.s, z17.s\n"
+ ".inst 0x44828225 // srshl z5.s, p0/M, z5.s, z17.s\n"
"add z9.s, z9.s, z16.s\n"
"add z8.s, z8.s, z16.s\n"
- ".inst 0x44829227 // srshl z7.s, p4/M, z7.s, z17.s\n"
- ".inst 0x44829246 // srshl z6.s, p4/M, z6.s, z18.s\n"
- ".inst 0x44829245 // srshl z5.s, p4/M, z5.s, z18.s\n"
- ".inst 0x44829244 // srshl z4.s, p4/M, z4.s, z18.s\n"
+ ".inst 0x44828224 // srshl z4.s, p0/M, z4.s, z17.s\n"
+ ".inst 0x44828223 // srshl z3.s, p0/M, z3.s, z17.s\n"
"add z7.s, z7.s, z16.s\n"
- ".inst 0x04b474c6 // sqrdmulh z6.s, z6.s, z20.s\n"
- ".inst 0x04b474a5 // sqrdmulh z5.s, z5.s, z20.s\n"
- ".inst 0x04b47484 // sqrdmulh z4.s, z4.s, z20.s\n"
- ".inst 0x44829243 // srshl z3.s, p4/M, z3.s, z18.s\n"
- ".inst 0x44829226 // srshl z6.s, p4/M, z6.s, z17.s\n"
- ".inst 0x44829225 // srshl z5.s, p4/M, z5.s, z17.s\n"
- ".inst 0x44829224 // srshl z4.s, p4/M, z4.s, z17.s\n"
- ".inst 0x04b47463 // sqrdmulh z3.s, z3.s, z20.s\n"
"add z6.s, z6.s, z16.s\n"
+ ".inst 0x44828222 // srshl z2.s, p0/M, z2.s, z17.s\n"
+ ".inst 0x44828221 // srshl z1.s, p0/M, z1.s, z17.s\n"
"add z5.s, z5.s, z16.s\n"
"add z4.s, z4.s, z16.s\n"
- ".inst 0x44829223 // srshl z3.s, p4/M, z3.s, z17.s\n"
- ".inst 0x44829242 // srshl z2.s, p4/M, z2.s, z18.s\n"
- ".inst 0x44829241 // srshl z1.s, p4/M, z1.s, z18.s\n"
- ".inst 0x44829240 // srshl z0.s, p4/M, z0.s, z18.s\n"
+ ".inst 0x44828220 // srshl z0.s, p0/M, z0.s, z17.s\n"
"add z3.s, z3.s, z16.s\n"
- ".inst 0x04b47442 // sqrdmulh z2.s, z2.s, z20.s\n"
- ".inst 0x04b47421 // sqrdmulh z1.s, z1.s, z20.s\n"
- ".inst 0x04b47400 // sqrdmulh z0.s, z0.s, z20.s\n"
- "smax z15.s, p4/M, z15.s, z21.s\n"
- ".inst 0x44829222 // srshl z2.s, p4/M, z2.s, z17.s\n"
- ".inst 0x44829221 // srshl z1.s, p4/M, z1.s, z17.s\n"
- ".inst 0x44829220 // srshl z0.s, p4/M, z0.s, z17.s\n"
- "smin z15.s, p4/M, z15.s, z19.s\n"
"add z2.s, z2.s, z16.s\n"
"add z1.s, z1.s, z16.s\n"
"add z0.s, z0.s, z16.s\n"
- "smax z14.s, p4/M, z14.s, z21.s\n"
- "smax z13.s, p4/M, z13.s, z21.s\n"
- "smax z12.s, p4/M, z12.s, z21.s\n"
- "smax z11.s, p4/M, z11.s, z21.s\n"
- "smin z14.s, p4/M, z14.s, z19.s\n"
- "smin z13.s, p4/M, z13.s, z19.s\n"
- "smin z12.s, p4/M, z12.s, z19.s\n"
- "smin z11.s, p4/M, z11.s, z19.s\n"
+ "mov z16.s, #0x0\n"
+ "smax z15.s, p0/M, z15.s, z16.s\n"
+ "smax z14.s, p0/M, z14.s, z16.s\n"
+ "mov z18.s, #0xff\n"
+ "smax z13.s, p0/M, z13.s, z16.s\n"
+ "smax z12.s, p0/M, z12.s, z16.s\n"
+ "smax z11.s, p0/M, z11.s, z16.s\n"
+ "smax z10.s, p0/M, z10.s, z16.s\n"
+ "smax z9.s, p0/M, z9.s, z16.s\n"
+ "smax z8.s, p0/M, z8.s, z16.s\n"
+ "smax z7.s, p0/M, z7.s, z16.s\n"
+ "smax z6.s, p0/M, z6.s, z16.s\n"
+ "smax z5.s, p0/M, z5.s, z16.s\n"
+ "smax z4.s, p0/M, z4.s, z16.s\n"
+ "smax z3.s, p0/M, z3.s, z16.s\n"
+ "smax z2.s, p0/M, z2.s, z16.s\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smin z15.s, p0/M, z15.s, z18.s\n"
+ "smin z14.s, p0/M, z14.s, z18.s\n"
"trn1 z17.h, z15.h, z14.h\n"
- "smax z10.s, p4/M, z10.s, z21.s\n"
+ "smin z13.s, p0/M, z13.s, z18.s\n"
+ "smin z12.s, p0/M, z12.s, z18.s\n"
"trn1 z16.h, z13.h, z12.h\n"
- "smax z9.s, p4/M, z9.s, z21.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "smin z10.s, p4/M, z10.s, z19.s\n"
- "incb x26, ALL, MUL #4\n"
- "smin z9.s, p4/M, z9.s, z19.s\n"
- "smax z8.s, p4/M, z8.s, z21.s\n"
- "smax z7.s, p4/M, z7.s, z21.s\n"
- "smax z6.s, p4/M, z6.s, z21.s\n"
- "trn1 z18.h, z11.h, z10.h\n"
- "smin z8.s, p4/M, z8.s, z19.s\n"
- "smin z7.s, p4/M, z7.s, z19.s\n"
- "smin z6.s, p4/M, z6.s, z19.s\n"
- "smax z5.s, p4/M, z5.s, z21.s\n"
+ "smin z11.s, p0/M, z11.s, z18.s\n"
+ "smin z10.s, p0/M, z10.s, z18.s\n"
+ "trn1 z17.h, z11.h, z10.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "smin z9.s, p0/M, z9.s, z18.s\n"
+ "smin z8.s, p0/M, z8.s, z18.s\n"
"trn1 z16.h, z9.h, z8.h\n"
- "smax z4.s, p4/M, z4.s, z21.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z7.s, p0/M, z7.s, z18.s\n"
+ "smin z6.s, p0/M, z6.s, z18.s\n"
"trn1 z17.h, z7.h, z6.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x25]\n"
- "smin z5.s, p4/M, z5.s, z19.s\n"
- "incb x25, ALL, MUL #4\n"
- "smin z4.s, p4/M, z4.s, z19.s\n"
- "smax z3.s, p4/M, z3.s, z21.s\n"
- "smax z2.s, p4/M, z2.s, z21.s\n"
- "smax z1.s, p4/M, z1.s, z21.s\n"
- "smax z0.s, p4/M, z0.s, z21.s\n"
+ "st1b { z16.b }, p3, [%x[outptr], x26]\n"
+ "smin z5.s, p0/M, z5.s, z18.s\n"
+ "smin z4.s, p0/M, z4.s, z18.s\n"
"trn1 z16.h, z5.h, z4.h\n"
- "smin z3.s, p4/M, z3.s, z19.s\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x24]\n"
- "smin z2.s, p4/M, z2.s, z19.s\n"
- "incb x24, ALL, MUL #4\n"
- "smin z1.s, p4/M, z1.s, z19.s\n"
- "smin z0.s, p4/M, z0.s, z19.s\n"
+ "smin z3.s, p0/M, z3.s, z18.s\n"
+ "smin z2.s, p0/M, z2.s, z18.s\n"
"trn1 z17.h, z3.h, z2.h\n"
+ "st1b { z16.b }, p2, [%x[outptr], x25]\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
"trn1 z16.h, z1.h, z0.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x23]\n"
- "incb x23, ALL, MUL #4\n"
- "whilelt p0.b, x23, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x24]\n"
+ "incb x24, ALL, MUL #4\n"
+ "whilelt p1.b, x24, %x[n_channels]\n"
+ "incb x27, ALL, MUL #4\n"
+ "incb x26, ALL, MUL #4\n"
+ "incb x25, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "ld1rw { z15.s }, p4/Z, [%x[accumulator_init]]\n"
+ "ld1rw { z15.s }, p0/Z, [%x[accumulator_init]]\n"
+ "lsr x23, %x[n_valid_cells], #0x1\n"
"mov z14.d, z15.d\n"
- "mov x19, %x[inptrs]\n"
"mov z13.d, z15.d\n"
- "lsr x22, %x[n_valid_cells], #0x1\n"
"mov z12.d, z15.d\n"
- "cbz x22, 11f\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- "add x19, x19, #0x10\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- "subs x22, x22, #0x1\n"
+ "mov x22, %x[inptrs]\n"
+ "cbz x23, 11f\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 2 inputs loop
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- "ldp x21, x20, [x19, #0x0]\n"
- "add x19, x19, #0x10\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- "subs x22, x22, #0x1\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- "ld1b { z30.b }, p3/Z, [x20, x26]\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ "ldp x21, x20, [x22, #0x0]\n"
+ "subs x23, x23, #0x1\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ "add x22, x22, #0x10\n"
+ "ld1b { z31.b }, p4/Z, [x21, x27]\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
+ "ld1b { z30.b }, p4/Z, [x20, x27]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 2 inputs tail
- ".inst 0x455e0bf7 // uaddlb z23.h, z31.b, z30.b\n"
- ".inst 0x455e0ff6 // uaddlt z22.h, z31.b, z30.b\n"
- ".inst 0x459749ef // uaddwb z15.s, z15.s, z23.h\n"
- ".inst 0x45974dce // uaddwt z14.s, z14.s, z23.h\n"
- ".inst 0x459649ad // uaddwb z13.s, z13.s, z22.h\n"
- ".inst 0x45964d8c // uaddwt z12.s, z12.s, z22.h\n"
+ ".inst 0x455e0bf1 // uaddlb z17.h, z31.b, z30.b\n"
+ ".inst 0x455e0ff0 // uaddlt z16.h, z31.b, z30.b\n"
+ ".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
+ ".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
+ ".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
+ ".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x1\n"
+ "ands x21, %x[n_valid_cells], #0x1\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x21, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z31.b }, p3/Z, [x21, x26]\n"
- ".inst 0x4508abf1 // ushllb z17.h, z31.b, #0x0\n"
- ".inst 0x4508aff0 // ushllt z16.h, z31.b, #0x0\n"
+ "ldr x20, [x22], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x27]\n"
+ ".inst 0x4508aa11 // ushllb z17.h, z16.b, #0x0\n"
+ ".inst 0x4508ae10 // ushllt z16.h, z16.b, #0x0\n"
+ "subs x21, x21, #0x1\n"
".inst 0x459149ef // uaddwb z15.s, z15.s, z17.h\n"
".inst 0x45914dce // uaddwt z14.s, z14.s, z17.h\n"
".inst 0x459049ad // uaddwb z13.s, z13.s, z16.h\n"
".inst 0x45904d8c // uaddwt z12.s, z12.s, z16.h\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z21.s, #0x0\n"
- "ld1rw { z20.s }, p4/Z, [%x[combined_rescale_value]]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- "mov z19.s, #0xff\n"
- "ld1rw { z18.s }, p4/Z, [%x[left_shift]]\n"
- "ld1rw { z17.s }, p4/Z, [%x[right_shift]]\n"
- ".inst 0x4482924f // srshl z15.s, p4/M, z15.s, z18.s\n"
- "ld1rw { z16.s }, p4/Z, [x19]\n"
- ".inst 0x4482924e // srshl z14.s, p4/M, z14.s, z18.s\n"
- ".inst 0x4482924d // srshl z13.s, p4/M, z13.s, z18.s\n"
- ".inst 0x4482924c // srshl z12.s, p4/M, z12.s, z18.s\n"
- ".inst 0x04b475ef // sqrdmulh z15.s, z15.s, z20.s\n"
- ".inst 0x04b475ce // sqrdmulh z14.s, z14.s, z20.s\n"
- ".inst 0x04b475ad // sqrdmulh z13.s, z13.s, z20.s\n"
- ".inst 0x04b4758c // sqrdmulh z12.s, z12.s, z20.s\n"
- ".inst 0x4482922f // srshl z15.s, p4/M, z15.s, z17.s\n"
- ".inst 0x4482922e // srshl z14.s, p4/M, z14.s, z17.s\n"
- ".inst 0x4482922d // srshl z13.s, p4/M, z13.s, z17.s\n"
- ".inst 0x4482922c // srshl z12.s, p4/M, z12.s, z17.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[left_shift]]\n"
+ "ld1rw { z16.s }, p0/Z, [%x[combined_rescale_value]]\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
+ "ld1rw { z17.s }, p0/Z, [%x[right_shift]]\n"
+ ".inst 0x04b075ef // sqrdmulh z15.s, z15.s, z16.s\n"
+ ".inst 0x04b075ce // sqrdmulh z14.s, z14.s, z16.s\n"
+ ".inst 0x04b075ad // sqrdmulh z13.s, z13.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x4482822f // srshl z15.s, p0/M, z15.s, z17.s\n"
+ ".inst 0x04b0758c // sqrdmulh z12.s, z12.s, z16.s\n"
+ ".inst 0x4482822e // srshl z14.s, p0/M, z14.s, z17.s\n"
+ ".inst 0x4482822d // srshl z13.s, p0/M, z13.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x4482822c // srshl z12.s, p0/M, z12.s, z17.s\n"
"add z15.s, z15.s, z16.s\n"
"add z14.s, z14.s, z16.s\n"
"add z13.s, z13.s, z16.s\n"
"add z12.s, z12.s, z16.s\n"
- "smax z15.s, p4/M, z15.s, z21.s\n"
- "smax z14.s, p4/M, z14.s, z21.s\n"
- "smax z13.s, p4/M, z13.s, z21.s\n"
- "smax z12.s, p4/M, z12.s, z21.s\n"
- "smin z15.s, p4/M, z15.s, z19.s\n"
- "smin z14.s, p4/M, z14.s, z19.s\n"
- "smin z13.s, p4/M, z13.s, z19.s\n"
- "smin z12.s, p4/M, z12.s, z19.s\n"
+ "mov z17.s, #0x0\n"
+ "smax z15.s, p0/M, z15.s, z17.s\n"
+ "smax z14.s, p0/M, z14.s, z17.s\n"
+ "mov z16.s, #0xff\n"
+ "smax z13.s, p0/M, z13.s, z17.s\n"
+ "smax z12.s, p0/M, z12.s, z17.s\n"
+ "smin z15.s, p0/M, z15.s, z16.s\n"
+ "smin z14.s, p0/M, z14.s, z16.s\n"
"trn1 z17.h, z15.h, z14.h\n"
+ "smin z13.s, p0/M, z13.s, z16.s\n"
+ "smin z12.s, p0/M, z12.s, z16.s\n"
"trn1 z16.h, z13.h, z12.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x26]\n"
- "incb x26\n"
- "whilelt p3.b, x26, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x27]\n"
+ "incb x27\n"
+ "whilelt p4.b, x27, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [accumulator_init] "r" (&accumulator_init), [combined_rescale_value] "r" (&combined_rescale_value), [inptrs] "r" (inptrs), [left_shift] "r" (&left_shift), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [outptr] "r" (outptr), [quant_params] "r" (&qp), [right_shift] "r" (&right_shift)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp
index c3c0edd0d5..eece6c0578 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,21 @@
#pragma once
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
void sve_u8q_nhwc_max_generic_depthfirst_impl(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-struct sve_u8q_nhwc_max_generic_depthfirst
+struct sve_u8q_nhwc_max_generic_depthfirst : IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>
{
- typedef uint8_t operand_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint64_t, const uint64_t n_valid_cells, uint64_t n_channels, const uint8_t *const *const inptrs, uint8_t *outptr, const Requantize32 &qp);
-
- constexpr static PoolingType pooling_type(void) { return PoolingType::MAX; }
-
-
- kern_type kernel = sve_u8q_nhwc_max_generic_depthfirst_impl;
-
+ using Parent = IGenericDepthfirstStrategy<uint8_t, uint8_t, Requantize32>;
sve_u8q_nhwc_max_generic_depthfirst(const CPUInfo *) {}
+ typename Parent::KernelType get_kernel(void) const override { return sve_u8q_nhwc_max_generic_depthfirst_impl; }
};
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
index c1c1d29613..94522cdaaa 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/kernels/sve_u8q_nhwc_max_generic_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,9 @@
#include "pooling.hpp"
#include <cstdint>
+#include <cstddef>
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace pooling {
@@ -41,376 +42,375 @@ void sve_u8q_nhwc_max_generic_depthfirst_impl(
)
{
__asm__ __volatile__(
- "ptrue p4.b\n"
- "mov x28, #0x0\n"
- "cntb x27\n"
- "cntb x26, ALL, MUL #2\n"
- "cntb x25, ALL, MUL #3\n"
+ "mov x9, #0x0\n"
+ "cntb x28\n"
+ "cntb x27, ALL, MUL #2\n"
+ "cntb x26, ALL, MUL #3\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"whilelt p3.b, x28, %x[n_channels]\n"
"whilelt p2.b, x27, %x[n_channels]\n"
"whilelt p1.b, x26, %x[n_channels]\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "ptrue p0.b\n"
"b.none 7f\n"
"1:" // 4-vectors of channels
- "mov z10.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "mov z9.b, #0x0\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
"mov z8.b, #0x0\n"
"mov z7.b, #0x0\n"
- "cbz x24, 4f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "mov x24, %x[inptrs]\n"
+ "mov z6.b, #0x0\n"
+ "mov z5.b, #0x0\n"
+ "cbz x25, 4f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"beq 3f\n"
"2:" // 4-vectors of channels: 4 inputs loop
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "umax z21.b, p4/M, z21.b, z26.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "umax z16.b, p4/M, z16.b, z25.b\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
- "umax z20.b, p4/M, z20.b, z24.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z30.b }, p2/Z, [x22, x27]\n"
- "umax z18.b, p4/M, z18.b, z22.b\n"
- "ld1b { z22.b }, p2/Z, [x21, x27]\n"
- "umax z17.b, p4/M, z17.b, z21.b\n"
- "ld1b { z29.b }, p2/Z, [x20, x27]\n"
- "umax z16.b, p4/M, z16.b, z20.b\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "umax z10.b, p4/M, z10.b, z19.b\n"
- "ld1b { z27.b }, p1/Z, [x22, x26]\n"
- "umax z9.b, p4/M, z9.b, z18.b\n"
- "ld1b { z21.b }, p1/Z, [x21, x26]\n"
- "umax z8.b, p4/M, z8.b, z17.b\n"
- "ld1b { z26.b }, p1/Z, [x20, x26]\n"
- "umax z7.b, p4/M, z7.b, z16.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "ld1b { z25.b }, p0/Z, [x22, x25]\n"
- "ld1b { z20.b }, p0/Z, [x21, x25]\n"
- "ld1b { z24.b }, p0/Z, [x20, x25]\n"
+ "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
+ "umax z22.b, p0/M, z22.b, z30.b\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
+ "umax z21.b, p0/M, z21.b, z27.b\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
+ "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "ld1b { z0.b }, p3/Z, [x23, x28]\n"
+ "ld1b { z31.b }, p3/Z, [x22, x28]\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "ld1b { z22.b }, p3/Z, [x21, x28]\n"
+ "ld1b { z30.b }, p3/Z, [x20, x28]\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "ld1b { z29.b }, p2/Z, [x23, x27]\n"
+ "ld1b { z28.b }, p2/Z, [x22, x27]\n"
+ "subs x25, x25, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z19.b\n"
+ "ld1b { z21.b }, p2/Z, [x21, x27]\n"
+ "ld1b { z27.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z18.b\n"
+ "umax z6.b, p0/M, z6.b, z17.b\n"
+ "ld1b { z26.b }, p1/Z, [x23, x26]\n"
+ "ld1b { z25.b }, p1/Z, [x22, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z20.b }, p1/Z, [x21, x26]\n"
+ "ld1b { z24.b }, p1/Z, [x20, x26]\n"
"bgt 2b\n"
"3:" // 4-vectors of channels: 4 inputs tail
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "movprfx z18, z31\n umax z18.b, p4/M, z18.b, z30.b\n"
- "umax z22.b, p4/M, z22.b, z29.b\n"
- "movprfx z17, z28\n umax z17.b, p4/M, z17.b, z27.b\n"
- "umax z21.b, p4/M, z21.b, z26.b\n"
- "umax z16.b, p4/M, z16.b, z25.b\n"
- "umax z20.b, p4/M, z20.b, z24.b\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "umax z18.b, p4/M, z18.b, z22.b\n"
- "umax z17.b, p4/M, z17.b, z21.b\n"
- "umax z16.b, p4/M, z16.b, z20.b\n"
- "umax z10.b, p4/M, z10.b, z19.b\n"
- "umax z9.b, p4/M, z9.b, z18.b\n"
- "umax z8.b, p4/M, z8.b, z17.b\n"
- "umax z7.b, p4/M, z7.b, z16.b\n"
+ "movprfx z19, z4\n umax z19.b, p0/M, z19.b, z3.b\n"
+ "movprfx z23, z2\n umax z23.b, p0/M, z23.b, z1.b\n"
+ "movprfx z18, z0\n umax z18.b, p0/M, z18.b, z31.b\n"
+ "umax z22.b, p0/M, z22.b, z30.b\n"
+ "movprfx z17, z29\n umax z17.b, p0/M, z17.b, z28.b\n"
+ "umax z21.b, p0/M, z21.b, z27.b\n"
+ "movprfx z16, z26\n umax z16.b, p0/M, z16.b, z25.b\n"
+ "umax z20.b, p0/M, z20.b, z24.b\n"
+ "umax z19.b, p0/M, z19.b, z23.b\n"
+ "umax z18.b, p0/M, z18.b, z22.b\n"
+ "umax z17.b, p0/M, z17.b, z21.b\n"
+ "umax z16.b, p0/M, z16.b, z20.b\n"
+ "umax z8.b, p0/M, z8.b, z19.b\n"
+ "umax z7.b, p0/M, z7.b, z18.b\n"
+ "umax z6.b, p0/M, z6.b, z17.b\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"4:" // 4-vectors of channels: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 6f\n"
"5:" // 4-vectors of channels: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z10.b, p4/M, z10.b, z3.b\n"
- "ld1b { z31.b }, p2/Z, [x23, x27]\n"
- "ld1b { z28.b }, p1/Z, [x23, x26]\n"
- "umax z9.b, p4/M, z9.b, z31.b\n"
- "ld1b { z16.b }, p0/Z, [x23, x25]\n"
- "umax z8.b, p4/M, z8.b, z28.b\n"
- "umax z7.b, p4/M, z7.b, z16.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "ld1b { z17.b }, p3/Z, [x20, x28]\n"
+ "ld1b { z16.b }, p2/Z, [x20, x27]\n"
+ "umax z7.b, p0/M, z7.b, z17.b\n"
+ "umax z6.b, p0/M, z6.b, z16.b\n"
+ "ld1b { z16.b }, p1/Z, [x20, x26]\n"
+ "umax z5.b, p0/M, z5.b, z16.b\n"
"bgt 5b\n"
"6:" // 4-vectors of channels: Single input loop: End
- "mov z6.s, #0x0\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z5.s }, p4/Z, [x19]\n"
- "mov z4.s, #0xff\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- ".inst 0x4508a951 // ushllb z17.h, z10.b, #0x0\n"
- "ld1rw { z3.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508ad50 // ushllt z16.h, z10.b, #0x0\n"
- "ld1rw { z2.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- ".inst 0x4508a937 // ushllb z23.h, z9.b, #0x0\n"
- "ld1rw { z1.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x4508ad36 // ushllt z22.h, z9.b, #0x0\n"
- "ld1rw { z0.s }, p4/Z, [x19]\n"
- ".inst 0x4508a912 // ushllb z18.h, z8.b, #0x0\n"
- ".inst 0x4508ad15 // ushllt z21.h, z8.b, #0x0\n"
- ".inst 0x4508a8f4 // ushllb z20.h, z7.b, #0x0\n"
- ".inst 0x4508acf3 // ushllt z19.h, z7.b, #0x0\n"
- "neg z5.s, p4/M, z5.s\n"
- ".inst 0x459140bf // saddwb z31.s, z5.s, z17.h\n"
- ".inst 0x459144b1 // saddwt z17.s, z5.s, z17.h\n"
- ".inst 0x459040be // saddwb z30.s, z5.s, z16.h\n"
- ".inst 0x459044b0 // saddwt z16.s, z5.s, z16.h\n"
- ".inst 0x459740bd // saddwb z29.s, z5.s, z23.h\n"
- ".inst 0x459744bc // saddwt z28.s, z5.s, z23.h\n"
- ".inst 0x459640bb // saddwb z27.s, z5.s, z22.h\n"
- ".inst 0x459644ba // saddwt z26.s, z5.s, z22.h\n"
- ".inst 0x459240b9 // saddwb z25.s, z5.s, z18.h\n"
- ".inst 0x459244b2 // saddwt z18.s, z5.s, z18.h\n"
- ".inst 0x459540b8 // saddwb z24.s, z5.s, z21.h\n"
- ".inst 0x459544b7 // saddwt z23.s, z5.s, z21.h\n"
- ".inst 0x459440b6 // saddwb z22.s, z5.s, z20.h\n"
- ".inst 0x459444b5 // saddwt z21.s, z5.s, z20.h\n"
- ".inst 0x459340b4 // saddwb z20.s, z5.s, z19.h\n"
- ".inst 0x459344b3 // saddwt z19.s, z5.s, z19.h\n"
- ".inst 0x4482905f // srshl z31.s, p4/M, z31.s, z2.s\n"
- ".inst 0x44829051 // srshl z17.s, p4/M, z17.s, z2.s\n"
- ".inst 0x4482905e // srshl z30.s, p4/M, z30.s, z2.s\n"
- ".inst 0x44829050 // srshl z16.s, p4/M, z16.s, z2.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
- ".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
- ".inst 0x04a37610 // sqrdmulh z16.s, z16.s, z3.s\n"
- ".inst 0x4482903f // srshl z31.s, p4/M, z31.s, z1.s\n"
- ".inst 0x44829031 // srshl z17.s, p4/M, z17.s, z1.s\n"
- ".inst 0x4482903e // srshl z30.s, p4/M, z30.s, z1.s\n"
- ".inst 0x44829030 // srshl z16.s, p4/M, z16.s, z1.s\n"
- "add z31.s, z31.s, z0.s\n"
- "add z17.s, z17.s, z0.s\n"
- "add z30.s, z30.s, z0.s\n"
- "add z16.s, z16.s, z0.s\n"
- ".inst 0x4482905d // srshl z29.s, p4/M, z29.s, z2.s\n"
- ".inst 0x4482905c // srshl z28.s, p4/M, z28.s, z2.s\n"
- ".inst 0x4482905b // srshl z27.s, p4/M, z27.s, z2.s\n"
- ".inst 0x4482905a // srshl z26.s, p4/M, z26.s, z2.s\n"
- ".inst 0x04a377bd // sqrdmulh z29.s, z29.s, z3.s\n"
- ".inst 0x04a3779c // sqrdmulh z28.s, z28.s, z3.s\n"
- ".inst 0x04a3777b // sqrdmulh z27.s, z27.s, z3.s\n"
- ".inst 0x04a3775a // sqrdmulh z26.s, z26.s, z3.s\n"
- ".inst 0x4482903d // srshl z29.s, p4/M, z29.s, z1.s\n"
- ".inst 0x4482903c // srshl z28.s, p4/M, z28.s, z1.s\n"
- ".inst 0x4482903b // srshl z27.s, p4/M, z27.s, z1.s\n"
- ".inst 0x4482903a // srshl z26.s, p4/M, z26.s, z1.s\n"
- "add z29.s, z29.s, z0.s\n"
- "add z28.s, z28.s, z0.s\n"
- "add z27.s, z27.s, z0.s\n"
- "add z26.s, z26.s, z0.s\n"
- ".inst 0x44829059 // srshl z25.s, p4/M, z25.s, z2.s\n"
- ".inst 0x44829052 // srshl z18.s, p4/M, z18.s, z2.s\n"
- "smax z31.s, p4/M, z31.s, z6.s\n"
- "smax z17.s, p4/M, z17.s, z6.s\n"
- ".inst 0x04a37739 // sqrdmulh z25.s, z25.s, z3.s\n"
- ".inst 0x04a37652 // sqrdmulh z18.s, z18.s, z3.s\n"
- "smin z31.s, p4/M, z31.s, z4.s\n"
- "smin z17.s, p4/M, z17.s, z4.s\n"
- ".inst 0x44829039 // srshl z25.s, p4/M, z25.s, z1.s\n"
- ".inst 0x44829032 // srshl z18.s, p4/M, z18.s, z1.s\n"
- "smax z30.s, p4/M, z30.s, z6.s\n"
- "trn1 z17.h, z31.h, z17.h\n"
- "add z25.s, z25.s, z0.s\n"
- "add z18.s, z18.s, z0.s\n"
- ".inst 0x44829058 // srshl z24.s, p4/M, z24.s, z2.s\n"
- ".inst 0x44829057 // srshl z23.s, p4/M, z23.s, z2.s\n"
- "smin z30.s, p4/M, z30.s, z4.s\n"
- "smax z16.s, p4/M, z16.s, z6.s\n"
- ".inst 0x04a37718 // sqrdmulh z24.s, z24.s, z3.s\n"
- ".inst 0x04a376f7 // sqrdmulh z23.s, z23.s, z3.s\n"
- "smax z29.s, p4/M, z29.s, z6.s\n"
- "smin z16.s, p4/M, z16.s, z4.s\n"
- ".inst 0x44829038 // srshl z24.s, p4/M, z24.s, z1.s\n"
- ".inst 0x44829037 // srshl z23.s, p4/M, z23.s, z1.s\n"
- "smin z29.s, p4/M, z29.s, z4.s\n"
- "trn1 z16.h, z30.h, z16.h\n"
- "add z24.s, z24.s, z0.s\n"
- "add z23.s, z23.s, z0.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1rw { z3.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a911 // ushllb z17.h, z8.b, #0x0\n"
+ ".inst 0x4508ad18 // ushllt z24.h, z8.b, #0x0\n"
+ ".inst 0x4508a8f7 // ushllb z23.h, z7.b, #0x0\n"
+ ".inst 0x4508acf6 // ushllt z22.h, z7.b, #0x0\n"
+ "neg z3.s, p0/M, z3.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ ".inst 0x4508a8d5 // ushllb z21.h, z6.b, #0x0\n"
+ ".inst 0x4508acd4 // ushllt z20.h, z6.b, #0x0\n"
+ "ld1rw { z2.s }, p0/Z, [x20]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ ".inst 0x4508a8b3 // ushllb z19.h, z5.b, #0x0\n"
+ ".inst 0x4508acb0 // ushllt z16.h, z5.b, #0x0\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ ".inst 0x45914061 // saddwb z1.s, z3.s, z17.h\n"
+ ".inst 0x45914471 // saddwt z17.s, z3.s, z17.h\n"
+ ".inst 0x44828041 // srshl z1.s, p0/M, z1.s, z2.s\n"
+ ".inst 0x44828051 // srshl z17.s, p0/M, z17.s, z2.s\n"
+ ".inst 0x45984060 // saddwb z0.s, z3.s, z24.h\n"
+ ".inst 0x4598447f // saddwt z31.s, z3.s, z24.h\n"
+ ".inst 0x44828040 // srshl z0.s, p0/M, z0.s, z2.s\n"
+ ".inst 0x4482805f // srshl z31.s, p0/M, z31.s, z2.s\n"
+ ".inst 0x4597407e // saddwb z30.s, z3.s, z23.h\n"
+ ".inst 0x4597447d // saddwt z29.s, z3.s, z23.h\n"
+ ".inst 0x4482805e // srshl z30.s, p0/M, z30.s, z2.s\n"
+ ".inst 0x4482805d // srshl z29.s, p0/M, z29.s, z2.s\n"
+ ".inst 0x4596407c // saddwb z28.s, z3.s, z22.h\n"
+ ".inst 0x4596447b // saddwt z27.s, z3.s, z22.h\n"
+ ".inst 0x4482805c // srshl z28.s, p0/M, z28.s, z2.s\n"
+ ".inst 0x4482805b // srshl z27.s, p0/M, z27.s, z2.s\n"
+ ".inst 0x4595407a // saddwb z26.s, z3.s, z21.h\n"
+ ".inst 0x45954479 // saddwt z25.s, z3.s, z21.h\n"
+ ".inst 0x4482805a // srshl z26.s, p0/M, z26.s, z2.s\n"
+ ".inst 0x44828059 // srshl z25.s, p0/M, z25.s, z2.s\n"
+ ".inst 0x45944078 // saddwb z24.s, z3.s, z20.h\n"
+ ".inst 0x45944477 // saddwt z23.s, z3.s, z20.h\n"
+ ".inst 0x44828058 // srshl z24.s, p0/M, z24.s, z2.s\n"
+ ".inst 0x44828057 // srshl z23.s, p0/M, z23.s, z2.s\n"
+ ".inst 0x45934076 // saddwb z22.s, z3.s, z19.h\n"
+ ".inst 0x45934475 // saddwt z21.s, z3.s, z19.h\n"
+ ".inst 0x44828056 // srshl z22.s, p0/M, z22.s, z2.s\n"
+ ".inst 0x44828055 // srshl z21.s, p0/M, z21.s, z2.s\n"
+ ".inst 0x45904074 // saddwb z20.s, z3.s, z16.h\n"
+ ".inst 0x45904473 // saddwt z19.s, z3.s, z16.h\n"
+ ".inst 0x44828054 // srshl z20.s, p0/M, z20.s, z2.s\n"
+ ".inst 0x44828053 // srshl z19.s, p0/M, z19.s, z2.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x04b27421 // sqrdmulh z1.s, z1.s, z18.s\n"
+ ".inst 0x04b27631 // sqrdmulh z17.s, z17.s, z18.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x04b27400 // sqrdmulh z0.s, z0.s, z18.s\n"
+ ".inst 0x04b277ff // sqrdmulh z31.s, z31.s, z18.s\n"
+ ".inst 0x44828201 // srshl z1.s, p0/M, z1.s, z16.s\n"
+ ".inst 0x44828211 // srshl z17.s, p0/M, z17.s, z16.s\n"
+ ".inst 0x04b277de // sqrdmulh z30.s, z30.s, z18.s\n"
+ ".inst 0x04b277bd // sqrdmulh z29.s, z29.s, z18.s\n"
+ ".inst 0x44828200 // srshl z0.s, p0/M, z0.s, z16.s\n"
+ ".inst 0x4482821f // srshl z31.s, p0/M, z31.s, z16.s\n"
+ ".inst 0x04b2779c // sqrdmulh z28.s, z28.s, z18.s\n"
+ ".inst 0x04b2777b // sqrdmulh z27.s, z27.s, z18.s\n"
+ ".inst 0x4482821e // srshl z30.s, p0/M, z30.s, z16.s\n"
+ ".inst 0x4482821d // srshl z29.s, p0/M, z29.s, z16.s\n"
+ ".inst 0x04b2775a // sqrdmulh z26.s, z26.s, z18.s\n"
+ ".inst 0x04b27739 // sqrdmulh z25.s, z25.s, z18.s\n"
+ ".inst 0x4482821c // srshl z28.s, p0/M, z28.s, z16.s\n"
+ ".inst 0x4482821b // srshl z27.s, p0/M, z27.s, z16.s\n"
+ ".inst 0x04b27718 // sqrdmulh z24.s, z24.s, z18.s\n"
+ ".inst 0x04b276f7 // sqrdmulh z23.s, z23.s, z18.s\n"
+ ".inst 0x4482821a // srshl z26.s, p0/M, z26.s, z16.s\n"
+ ".inst 0x44828219 // srshl z25.s, p0/M, z25.s, z16.s\n"
+ ".inst 0x04b276d6 // sqrdmulh z22.s, z22.s, z18.s\n"
+ ".inst 0x04b276b5 // sqrdmulh z21.s, z21.s, z18.s\n"
+ ".inst 0x44828218 // srshl z24.s, p0/M, z24.s, z16.s\n"
+ ".inst 0x44828217 // srshl z23.s, p0/M, z23.s, z16.s\n"
+ ".inst 0x04b27694 // sqrdmulh z20.s, z20.s, z18.s\n"
+ ".inst 0x04b27673 // sqrdmulh z19.s, z19.s, z18.s\n"
+ ".inst 0x44828216 // srshl z22.s, p0/M, z22.s, z16.s\n"
+ ".inst 0x44828215 // srshl z21.s, p0/M, z21.s, z16.s\n"
+ ".inst 0x44828214 // srshl z20.s, p0/M, z20.s, z16.s\n"
+ ".inst 0x44828213 // srshl z19.s, p0/M, z19.s, z16.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ "add z1.s, z1.s, z16.s\n"
+ "add z17.s, z17.s, z16.s\n"
+ "add z0.s, z0.s, z16.s\n"
+ "add z31.s, z31.s, z16.s\n"
+ "add z30.s, z30.s, z16.s\n"
+ "add z29.s, z29.s, z16.s\n"
+ "add z28.s, z28.s, z16.s\n"
+ "add z27.s, z27.s, z16.s\n"
+ "add z26.s, z26.s, z16.s\n"
+ "add z25.s, z25.s, z16.s\n"
+ "add z24.s, z24.s, z16.s\n"
+ "add z23.s, z23.s, z16.s\n"
+ "add z22.s, z22.s, z16.s\n"
+ "add z21.s, z21.s, z16.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z19.s, z19.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "smax z1.s, p0/M, z1.s, z16.s\n"
+ "smax z17.s, p0/M, z17.s, z16.s\n"
+ "smax z0.s, p0/M, z0.s, z16.s\n"
+ "smax z31.s, p0/M, z31.s, z16.s\n"
+ "mov z18.s, #0xff\n"
+ "smax z30.s, p0/M, z30.s, z16.s\n"
+ "smax z29.s, p0/M, z29.s, z16.s\n"
+ "smax z28.s, p0/M, z28.s, z16.s\n"
+ "smax z27.s, p0/M, z27.s, z16.s\n"
+ "smax z26.s, p0/M, z26.s, z16.s\n"
+ "smax z25.s, p0/M, z25.s, z16.s\n"
+ "smax z24.s, p0/M, z24.s, z16.s\n"
+ "smax z23.s, p0/M, z23.s, z16.s\n"
+ "smax z22.s, p0/M, z22.s, z16.s\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smin z1.s, p0/M, z1.s, z18.s\n"
+ "smin z17.s, p0/M, z17.s, z18.s\n"
+ "trn1 z17.h, z1.h, z17.h\n"
+ "smin z0.s, p0/M, z0.s, z18.s\n"
+ "smin z31.s, p0/M, z31.s, z18.s\n"
+ "trn1 z16.h, z0.h, z31.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x28]\n"
- ".inst 0x44829056 // srshl z22.s, p4/M, z22.s, z2.s\n"
- "incb x28, ALL, MUL #4\n"
- ".inst 0x44829055 // srshl z21.s, p4/M, z21.s, z2.s\n"
- ".inst 0x44829054 // srshl z20.s, p4/M, z20.s, z2.s\n"
- ".inst 0x44829053 // srshl z19.s, p4/M, z19.s, z2.s\n"
- "smax z28.s, p4/M, z28.s, z6.s\n"
- ".inst 0x04a376d6 // sqrdmulh z22.s, z22.s, z3.s\n"
- ".inst 0x04a376b5 // sqrdmulh z21.s, z21.s, z3.s\n"
- ".inst 0x04a37694 // sqrdmulh z20.s, z20.s, z3.s\n"
- ".inst 0x04a37673 // sqrdmulh z19.s, z19.s, z3.s\n"
- ".inst 0x44829036 // srshl z22.s, p4/M, z22.s, z1.s\n"
- ".inst 0x44829035 // srshl z21.s, p4/M, z21.s, z1.s\n"
- ".inst 0x44829034 // srshl z20.s, p4/M, z20.s, z1.s\n"
- ".inst 0x44829033 // srshl z19.s, p4/M, z19.s, z1.s\n"
- "add z22.s, z22.s, z0.s\n"
- "add z21.s, z21.s, z0.s\n"
- "add z20.s, z20.s, z0.s\n"
- "add z19.s, z19.s, z0.s\n"
- "smax z27.s, p4/M, z27.s, z6.s\n"
- "smax z26.s, p4/M, z26.s, z6.s\n"
- "smax z25.s, p4/M, z25.s, z6.s\n"
- "smin z28.s, p4/M, z28.s, z4.s\n"
- "smin z27.s, p4/M, z27.s, z4.s\n"
- "smin z26.s, p4/M, z26.s, z4.s\n"
- "smin z25.s, p4/M, z25.s, z4.s\n"
- "trn1 z17.h, z29.h, z28.h\n"
- "smax z18.s, p4/M, z18.s, z6.s\n"
- "trn1 z16.h, z27.h, z26.h\n"
- "smax z24.s, p4/M, z24.s, z6.s\n"
+ "smin z30.s, p0/M, z30.s, z18.s\n"
+ "smin z29.s, p0/M, z29.s, z18.s\n"
+ "trn1 z17.h, z30.h, z29.h\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "smin z28.s, p0/M, z28.s, z18.s\n"
+ "smin z27.s, p0/M, z27.s, z18.s\n"
+ "trn1 z16.h, z28.h, z27.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p2, [%x[outptr], x27]\n"
- "smin z18.s, p4/M, z18.s, z4.s\n"
- "incb x27, ALL, MUL #4\n"
- "smin z24.s, p4/M, z24.s, z4.s\n"
- "smax z23.s, p4/M, z23.s, z6.s\n"
- "smax z22.s, p4/M, z22.s, z6.s\n"
- "smax z21.s, p4/M, z21.s, z6.s\n"
- "trn1 z18.h, z25.h, z18.h\n"
- "smin z23.s, p4/M, z23.s, z4.s\n"
- "smin z22.s, p4/M, z22.s, z4.s\n"
- "smin z21.s, p4/M, z21.s, z4.s\n"
- "smax z20.s, p4/M, z20.s, z6.s\n"
+ "smin z26.s, p0/M, z26.s, z18.s\n"
+ "smin z25.s, p0/M, z25.s, z18.s\n"
+ "trn1 z17.h, z26.h, z25.h\n"
+ "st1b { z16.b }, p3, [%x[outptr], x28]\n"
+ "smin z24.s, p0/M, z24.s, z18.s\n"
+ "smin z23.s, p0/M, z23.s, z18.s\n"
"trn1 z16.h, z24.h, z23.h\n"
- "smax z19.s, p4/M, z19.s, z6.s\n"
+ "trn1 z16.b, z17.b, z16.b\n"
+ "smin z22.s, p0/M, z22.s, z18.s\n"
+ "smin z21.s, p0/M, z21.s, z18.s\n"
"trn1 z17.h, z22.h, z21.h\n"
- "trn1 z16.b, z18.b, z16.b\n"
- "st1b { z16.b }, p1, [%x[outptr], x26]\n"
- "smin z20.s, p4/M, z20.s, z4.s\n"
- "incb x26, ALL, MUL #4\n"
- "smin z19.s, p4/M, z19.s, z4.s\n"
+ "st1b { z16.b }, p2, [%x[outptr], x27]\n"
+ "smin z20.s, p0/M, z20.s, z18.s\n"
+ "smin z19.s, p0/M, z19.s, z18.s\n"
"trn1 z16.h, z20.h, z19.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p0, [%x[outptr], x25]\n"
- "incb x25, ALL, MUL #4\n"
- "whilelt p0.b, x25, %x[n_channels]\n"
+ "st1b { z16.b }, p1, [%x[outptr], x26]\n"
+ "incb x26, ALL, MUL #4\n"
+ "whilelt p1.b, x26, %x[n_channels]\n"
+ "incb x9, ALL, MUL #4\n"
+ "incb x28, ALL, MUL #4\n"
+ "incb x27, ALL, MUL #4\n"
"b.any 1b\n"
"7:" // Single vector of channels
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.none 14f\n"
"8:" // Single vector of channels: Loop
- "mov z10.b, #0x0\n"
- "mov x19, %x[inptrs]\n"
- "lsr x24, %x[n_valid_cells], #0x2\n"
- "cbz x24, 11f\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "subs x24, x24, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "lsr x25, %x[n_valid_cells], #0x2\n"
+ "mov z8.b, #0x0\n"
+ "mov x24, %x[inptrs]\n"
+ "cbz x25, 11f\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "subs x25, x25, #0x1\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"beq 10f\n"
"9:" // Single vector of channels: Loop: 4 inputs loop
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "ldp x23, x22, [x19, #0x0]\n"
- "subs x24, x24, #0x1\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "ldp x21, x20, [x19, #0x10]\n"
- "add x19, x19, #0x20\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "ld1b { z2.b }, p3/Z, [x22, x28]\n"
- "umax z10.b, p4/M, z10.b, z19.b\n"
- "ld1b { z1.b }, p3/Z, [x21, x28]\n"
- "ld1b { z0.b }, p3/Z, [x20, x28]\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "ldp x23, x22, [x24, #0x0]\n"
+ "ldp x21, x20, [x24, #0x10]\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "subs x25, x25, #0x1\n"
+ "ld1b { z4.b }, p4/Z, [x23, x9]\n"
+ "ld1b { z3.b }, p4/Z, [x22, x9]\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
+ "add x24, x24, #0x20\n"
+ "ld1b { z2.b }, p4/Z, [x21, x9]\n"
+ "ld1b { z1.b }, p4/Z, [x20, x9]\n"
"bgt 9b\n"
"10:" // Single vector of channels: Loop: 4 inputs tail
- "movprfx z19, z3\n umax z19.b, p4/M, z19.b, z2.b\n"
- "movprfx z23, z1\n umax z23.b, p4/M, z23.b, z0.b\n"
- "umax z19.b, p4/M, z19.b, z23.b\n"
- "umax z10.b, p4/M, z10.b, z19.b\n"
+ "movprfx z16, z4\n umax z16.b, p0/M, z16.b, z3.b\n"
+ "movprfx z17, z2\n umax z17.b, p0/M, z17.b, z1.b\n"
+ "umax z16.b, p0/M, z16.b, z17.b\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"11:" // Single vector of channels: Loop: After loop
- "ands x20, %x[n_valid_cells], #0x3\n"
+ "ands x21, %x[n_valid_cells], #0x3\n"
"beq 13f\n"
"12:" // Single vector of channels: Loop: Single input loop
- "ldr x23, [x19], #0x8\n"
- "subs x20, x20, #0x1\n"
- "ld1b { z3.b }, p3/Z, [x23, x28]\n"
- "umax z10.b, p4/M, z10.b, z3.b\n"
+ "ldr x20, [x24], #0x8\n"
+ "ld1b { z16.b }, p4/Z, [x20, x9]\n"
+ "subs x21, x21, #0x1\n"
+ "umax z8.b, p0/M, z8.b, z16.b\n"
"bgt 12b\n"
"13:" // Single vector of channels: Loop: Single input loop: End
- "mov z6.s, #0x0\n"
- "add x19, %x[quant_params], %[offsetof_qp_input_offset]\n"
- "ld1rw { z5.s }, p4/Z, [x19]\n"
- "mov z4.s, #0xff\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
- ".inst 0x4508a951 // ushllb z17.h, z10.b, #0x0\n"
- "ld1rw { z3.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
- ".inst 0x4508ad50 // ushllt z16.h, z10.b, #0x0\n"
- "ld1rw { z2.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
- "neg z5.s, p4/M, z5.s\n"
- "ld1rw { z1.s }, p4/Z, [x19]\n"
- "add x19, %x[quant_params], %[offsetof_qp_output_offset]\n"
- ".inst 0x459140bf // saddwb z31.s, z5.s, z17.h\n"
- "ld1rw { z0.s }, p4/Z, [x19]\n"
- ".inst 0x459144b1 // saddwt z17.s, z5.s, z17.h\n"
- ".inst 0x459040be // saddwb z30.s, z5.s, z16.h\n"
- ".inst 0x459044b0 // saddwt z16.s, z5.s, z16.h\n"
- ".inst 0x4482905f // srshl z31.s, p4/M, z31.s, z2.s\n"
- ".inst 0x44829051 // srshl z17.s, p4/M, z17.s, z2.s\n"
- ".inst 0x4482905e // srshl z30.s, p4/M, z30.s, z2.s\n"
- ".inst 0x44829050 // srshl z16.s, p4/M, z16.s, z2.s\n"
- ".inst 0x04a377ff // sqrdmulh z31.s, z31.s, z3.s\n"
- ".inst 0x04a37631 // sqrdmulh z17.s, z17.s, z3.s\n"
- ".inst 0x04a377de // sqrdmulh z30.s, z30.s, z3.s\n"
- ".inst 0x04a37610 // sqrdmulh z16.s, z16.s, z3.s\n"
- ".inst 0x4482903f // srshl z31.s, p4/M, z31.s, z1.s\n"
- ".inst 0x44829031 // srshl z17.s, p4/M, z17.s, z1.s\n"
- ".inst 0x4482903e // srshl z30.s, p4/M, z30.s, z1.s\n"
- ".inst 0x44829030 // srshl z16.s, p4/M, z16.s, z1.s\n"
- "add z31.s, z31.s, z0.s\n"
- "add z17.s, z17.s, z0.s\n"
- "add z30.s, z30.s, z0.s\n"
- "add z16.s, z16.s, z0.s\n"
- "smax z31.s, p4/M, z31.s, z6.s\n"
- "smax z17.s, p4/M, z17.s, z6.s\n"
- "smax z30.s, p4/M, z30.s, z6.s\n"
- "smax z16.s, p4/M, z16.s, z6.s\n"
- "smin z31.s, p4/M, z31.s, z4.s\n"
- "smin z17.s, p4/M, z17.s, z4.s\n"
- "smin z30.s, p4/M, z30.s, z4.s\n"
- "smin z16.s, p4/M, z16.s, z4.s\n"
- "trn1 z17.h, z31.h, z17.h\n"
- "trn1 z16.h, z30.h, z16.h\n"
+ "add x20, %x[quant_params], %[offsetof_qp_input_offset]\n"
+ "ld1rw { z18.s }, p0/Z, [x20]\n"
+ ".inst 0x4508a911 // ushllb z17.h, z8.b, #0x0\n"
+ ".inst 0x4508ad10 // ushllt z16.h, z8.b, #0x0\n"
+ "neg z18.s, p0/M, z18.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_left_shift]\n"
+ ".inst 0x45914255 // saddwb z21.s, z18.s, z17.h\n"
+ ".inst 0x45914654 // saddwt z20.s, z18.s, z17.h\n"
+ ".inst 0x45904253 // saddwb z19.s, z18.s, z16.h\n"
+ ".inst 0x45904652 // saddwt z18.s, z18.s, z16.h\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_mul]\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ ".inst 0x04b076b5 // sqrdmulh z21.s, z21.s, z16.s\n"
+ ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
+ ".inst 0x04b07673 // sqrdmulh z19.s, z19.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_per_layer_right_shift]\n"
+ "ld1rw { z17.s }, p0/Z, [x20]\n"
+ ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
+ "add x20, %x[quant_params], %[offsetof_qp_output_offset]\n"
+ ".inst 0x44828235 // srshl z21.s, p0/M, z21.s, z17.s\n"
+ ".inst 0x44828234 // srshl z20.s, p0/M, z20.s, z17.s\n"
+ "ld1rw { z16.s }, p0/Z, [x20]\n"
+ "add z21.s, z21.s, z16.s\n"
+ ".inst 0x44828233 // srshl z19.s, p0/M, z19.s, z17.s\n"
+ ".inst 0x44828232 // srshl z18.s, p0/M, z18.s, z17.s\n"
+ "add z20.s, z20.s, z16.s\n"
+ "add z19.s, z19.s, z16.s\n"
+ "add z18.s, z18.s, z16.s\n"
+ "mov z16.s, #0x0\n"
+ "smax z21.s, p0/M, z21.s, z16.s\n"
+ "smax z20.s, p0/M, z20.s, z16.s\n"
+ "smax z19.s, p0/M, z19.s, z16.s\n"
+ "smax z18.s, p0/M, z18.s, z16.s\n"
+ "mov z16.s, #0xff\n"
+ "smin z21.s, p0/M, z21.s, z16.s\n"
+ "smin z20.s, p0/M, z20.s, z16.s\n"
+ "trn1 z17.h, z21.h, z20.h\n"
+ "smin z19.s, p0/M, z19.s, z16.s\n"
+ "smin z18.s, p0/M, z18.s, z16.s\n"
+ "trn1 z16.h, z19.h, z18.h\n"
"trn1 z16.b, z17.b, z16.b\n"
- "st1b { z16.b }, p3, [%x[outptr], x28]\n"
- "incb x28\n"
- "whilelt p3.b, x28, %x[n_channels]\n"
+ "st1b { z16.b }, p4, [%x[outptr], x9]\n"
+ "incb x9\n"
+ "whilelt p4.b, x9, %x[n_channels]\n"
"b.any 8b\n"
"14:" // End
-
:
: [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [n_valid_cells] "r" (n_valid_cells), [offsetof_qp_input_offset] "I" (offsetof(Requantize32, input_offset)), [offsetof_qp_output_offset] "I" (offsetof(Requantize32, output_offset)), [offsetof_qp_per_layer_left_shift] "I" (offsetof(Requantize32, per_layer_left_shift)), [offsetof_qp_per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [offsetof_qp_per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [outptr] "r" (outptr), [quant_params] "r" (&qp)
- : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "x9", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
} // namespace pooling
} // namespace arm_conv
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
index ad95207fb3..1ca478513c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,288 +24,262 @@
#pragma once
-#include "pool_common.hpp"
+#include "depthfirst_driver.hpp"
+#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
#include "utils.hpp"
-
-#include "arm_compute/core/Types.h"
+#if !defined(_WIN64) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
#include <limits>
namespace arm_conv {
namespace pooling {
-template <class strategy>
-class PoolingDepthfirst : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
+template <typename TInput, typename TOutput>
+class DepthfirstStrategy : public IDepthfirstStrategy
{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
+ unsigned int input_rows, input_cols, output_rows, output_cols;
- constexpr static unsigned int input_rows(void)
+ public:
+ DepthfirstStrategy(unsigned int window_rows, unsigned int window_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ unsigned int output_rows, unsigned int output_cols)
+ : input_rows(output_rows + (window_rows - 1) * stride_rows),
+ input_cols(output_cols + (window_cols - 1) * stride_cols),
+ output_rows(output_rows), output_cols(output_cols)
{
- return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
}
- constexpr static unsigned int input_cols(void)
- {
- return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
- }
+ unsigned int get_input_rows() const override { return input_rows; }
+ unsigned int get_input_cols() const override { return input_cols; }
+ unsigned int get_output_rows() const override { return output_rows; }
+ unsigned int get_output_cols() const override { return output_cols; }
+
+ typedef void (*KernelType)(
+ unsigned int n_channels,
+ const TInput *const *,
+ TOutput *const *,
+ bool exclude_padding,
+ unsigned int pad_left,
+ unsigned int pad_top,
+ unsigned int pad_right,
+ unsigned int pad_bottom
+ );
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+struct WorkingSpace
+{
+ void *input_buffer;
+ void *output_buffer;
+};
+
+template <typename TInput, typename TOutput=TInput, class OutputStage=Nothing>
+class PoolingDepthfirst : public DepthfirstDriver<TInput, TOutput>
+{
size_t sizeof_input_buffer(void) const
{
- return sizeof(TInput) * m_args.n_channels;
+ return sizeof(TInput) * this->m_args.n_channels;
}
size_t sizeof_output_buffer(void) const
{
- return sizeof(TOutput) * m_args.n_channels;
+ return sizeof(TOutput) * this->m_args.n_channels;
}
- public:
- PoolingDepthfirst(const PoolingArgs &args) : m_args(args)
+ protected:
+ /* Compute the amount of working space required for a single thread. */
+ size_t get_working_size_per_thread() const override
{
+ return sizeof(WorkingSpace) + this->m_args.n_channels * (sizeof(TInput) + sizeof(TOutput));
}
- PoolingDepthfirst(PoolingDepthfirst &) = delete;
- PoolingDepthfirst &operator=(PoolingDepthfirst &) = delete;
-
- size_t get_working_size(unsigned int num_threads) const override
+ /* Initialise the working space for a thread. */
+ void initialise_working_space(void *raw_ws) const override
{
- // We require a channel-length vector of input padding values
- // (to be shared amongst all threads) and (for each thread) a
- // channel-length vector in which to dump surplus output.
- return sizeof_input_buffer() + num_threads * sizeof_output_buffer();
+ auto ws = reinterpret_cast<WorkingSpace *>(raw_ws);
+ ws->input_buffer = ws + 1;
+ ws->output_buffer = reinterpret_cast<char *>(ws + 1) + sizeof(TInput) * this->m_args.n_channels;
+
+ // Fill the input buffer with an appropriate value
+ TInput fill_val = 0;
+ if (this->m_args.pool_type == PoolingType::MAX)
+ {
+ using limits = std::numeric_limits<TInput>;
+ if (limits::has_infinity)
+ {
+ fill_val = -limits::infinity();
+ }
+ else
+ {
+ fill_val = limits::min();
+ }
+ }
+
+ auto ptr = reinterpret_cast<TInput *>(ws->input_buffer);
+ auto n_channels = this->m_args.n_channels;
+ for (; n_channels; n_channels--)
+ {
+ *(ptr++) = fill_val;
+ }
}
- void execute(
- const void *const input,
- void *const output,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
+ /* Compute a portion of the output tensor with padding. */
+ void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int channel_start, unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
) const override
{
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
+ const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>(
+ this->m_strat.get())->get_kernel();
+
+ // Get the working space, and some space on the stack for pointer arrays
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space);
+ auto inptr_array = reinterpret_cast<const TInput **>(alloca(
+ sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
+ auto outptr_array = reinterpret_cast<TOutput **>(alloca(
+ sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
+
+ // Prepare the input pointers
+ const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const unsigned int end_ii = ii + this->m_strat->get_input_rows();
+ const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
+
+ const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ const unsigned int end_ij = ij + this->m_strat->get_input_cols();
+ const auto input_pad_right = end_ij < this->m_args.input_cols ? 0 : end_ij - this->m_args.input_cols;
+
+ fill_pointer_array<const TInput>(
+ inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+ input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
+ input.ld_row, input.ld_col,
+ reinterpret_cast<const TInput *>(ws->input_buffer),
+ input_pad_top, this->m_args.input_rows - input_i,
+ input_pad_left, this->m_args.input_cols - input_j
);
- }
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
+ // Prepare the output pointers
+ fill_pointer_array(
+ outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
+ output.ld_row, output.ld_col,
+ reinterpret_cast<TOutput *>(ws->output_buffer),
+ 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
+ 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Call the kernel
+ kern(
+ channel_end - channel_start, inptr_array, outptr_array,
+ this->m_args.exclude_padding,
+ input_pad_left, input_pad_top,
+ input_pad_right, input_pad_bottom
);
}
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space,
- unsigned int thread_id,
- unsigned int num_threads
+ // Compute a portion of the work with only top/bottom padding.
+ void compute_row_padded_tile_row(
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int channel_start, const unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *working_space
) const override
{
- ARM_COMPUTE_UNUSED(batches, ld_input_batch, ld_output_batch);
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input);
- TOutput *const outptr = static_cast<TOutput *>(_output);
-
- const unsigned int roundup_output_rows = roundup(output_height, num_threads);
- const unsigned int rows_per_thread = roundup_output_rows / num_threads;
- const int start_out_height = static_cast<int>(thread_id * rows_per_thread);
- const int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
-
- // Create an array for the input pointers
- const TInput * _inptr_array[input_rows() * input_cols()];
- const TInput **const inptr_array = _inptr_array;
-
- // Create an array for the output pointers
- TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
- TOutput **const outptr_array = _outptr_array;
-
- // Allocate portions of the working space
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space + thread_id * sizeof_output_buffer());
- TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + num_threads * sizeof_output_buffer());
-
- // Initialise the input buffer
- for (unsigned int c = 0; c < channels; c++)
- {
- TInput &val = input_buffer[c];
+ const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>(
+ this->m_strat.get())->get_kernel();
+
+ // Get the working space, and some space on the stack for pointer arrays
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space);
+ auto inptr_array = reinterpret_cast<const TInput **>(alloca(
+ sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
+ auto outptr_array = reinterpret_cast<TOutput **>(alloca(
+ sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
+
+ // Prepare the initial input pointers
+ const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const unsigned int end_ii = ii + this->m_strat->get_input_rows();
+ const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
+
+ const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ const auto end_oi = output_i + this->m_strat->get_output_cols();
+ const auto output_pad_bottom = end_oi < this->m_args.output_rows ? 0 : end_oi - this->m_args.output_rows;
+
+ fill_pointer_array<const TInput>(
+ inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+ input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
+ input.ld_row, input.ld_col,
+ reinterpret_cast<const TInput *>(ws->input_buffer),
+ input_pad_top, this->m_args.input_rows - input_i,
+ 0, this->m_args.input_cols - input_j
+ );
- if (strategy::pooling_type() == PoolingType::AVERAGE)
- {
- val = static_cast<TInput>(0);
- }
- else if (strategy::pooling_type() == PoolingType::MAX)
- {
-#if defined(__aarch64__)
- using InputType = typename std::conditional<std::is_same<TInput, __fp16>::value, arm_compute::half, TInput>::type;
- using limits = std::numeric_limits<InputType>;
-#else // defined(__aarch64__)
- using limits = std::numeric_limits<TInput>;
-#endif // defined(__aarch64__)
- if (limits::has_infinity)
- {
- val = -limits::infinity();
- }
- else
- {
- val = limits::min();
- }
- }
- }
+ // Prepare the initial output pointers
+ fill_pointer_array(
+ outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
+ output.ld_row, output.ld_col,
+ reinterpret_cast<TOutput *>(ws->output_buffer),
+ 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
+ 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ );
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
+ // Call the kernel
+ for (; n_tile_cols; n_tile_cols--)
{
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- const auto outptr_batch = outptr + batch * ld_output_batch;
+ kern(
+ channel_end - channel_start, inptr_array, outptr_array,
+ this->m_args.exclude_padding,
+ 0, input_pad_top,
+ 0, input_pad_bottom
+ );
+
+ // Progress the input and output pointer arrays
+ const auto input_col_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.pool_stride.cols;
+ for (
+ auto n = input_pad_top * this->m_strat->get_input_cols();
+ n < (this->m_strat->get_input_rows() - input_pad_bottom) * this->m_strat->get_input_cols();
+ n++
+ )
+ {
+ inptr_array[n] += input_col_stride;
+ }
- for (int start_out_i = start_out_height;
- start_out_i < end_out_height;
- start_out_i += static_cast<int>(strategy::out_rows()))
+ const auto output_col_stride = output.ld_col * this->m_strat->get_output_cols();
+ for (
+ auto n = 0u;
+ n < (this->m_strat->get_output_rows() - output_pad_bottom) * this->m_strat->get_output_cols();
+ n++
+ )
{
- const int end_out_i = start_out_i + strategy::out_rows();
- const int start_in_i = start_out_i * strategy::stride_rows() - padding.top;
- const int end_in_i = start_in_i + input_rows();
-
- // Compute top/bottom padding - TODO Is this right for average pooling?
- const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(height) - end_in_i, 0));
- const unsigned int valid_output_rows = std::min(
- end_out_i - start_out_i,
- static_cast<int>(end_out_height) - start_out_i
- );
-
- // Fill the input pointer array with padding values
- for (auto index = 0u; index < input_rows() * input_cols(); index++)
- {
- inptr_array[index] = input_buffer;
- }
-
- for (int start_out_j = 0, start_in_j = -padding.left;
- start_out_j < static_cast<int>(output_width);
- start_out_j += static_cast<int>(strategy::out_cols()),
- start_in_j += static_cast<int>(strategy::out_cols()) * strategy::stride_cols())
- {
- const int end_out_j = start_out_j + strategy::out_cols();
- const int end_in_j = start_in_j + input_cols();
-
- // Compute left/right padding - TODO Is this right for average pooling?
- const auto pad_left = static_cast<unsigned int>(-std::min(start_in_j, 0));
- const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(width) - end_in_j, 0));
-
- const unsigned int valid_output_cols = std::min(
- end_out_j - start_out_j,
- static_cast<int>(output_width) - start_out_j
- );
-
- // Construct the input pointer array - fill the array with pointers to
- // the input buffer and then fill in the required values.
- for (auto i = pad_top; i < input_rows() - pad_bottom; i++)
- {
- // Can skip over the left padding because we will have either the
- // same or less than the previous tile.
- unsigned int j = pad_left;
- const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
- const TInput **ptrs = inptr_array + i * input_cols() + j;
- for (; j < input_cols() - pad_right; j++)
- {
- *(ptrs++) = colptr;
- colptr += ld_input_col;
- }
- for (; j < input_cols(); j++)
- {
- *(ptrs++) = input_buffer;
- }
- }
-
- // Construct the output pointer array.
- TOutput **outptr_pos = outptr_array;
- for (auto i = 0u; i < valid_output_rows; i++)
- {
- unsigned int j = 0u;
- TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
- for (; j < valid_output_cols; j++)
- {
- *(outptr_pos++) = colptr;
- colptr += ld_output_col;
- }
- for (; j < strategy::out_cols(); j++)
- {
- *(outptr_pos++) = output_buffer;
- }
- }
- for (auto i = valid_output_rows; i < strategy::out_rows(); i++)
- {
- for (auto j = 0u; j < strategy::out_cols(); j++)
- {
- *(outptr_pos++) = output_buffer;
- }
- }
-
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::out_rows() * strategy::out_cols() * strategy::pool_rows() * strategy::pool_cols()));
-#endif
- strat.kernel(
- channels, inptr_array, outptr_array,
- m_args.exclude_padding, pad_left, pad_top, pad_right, pad_bottom
- );
- }
+ outptr_array[n] += output_col_stride;
}
}
}
+
+ public:
+ PoolingDepthfirst(const DepthfirstStrategy<TInput, TOutput> *strat,
+ const PoolingArgs &args, const OutputStage &os = {})
+ : DepthfirstDriver<TInput, TOutput>(strat, args)
+ {
+ ARM_COMPUTE_UNUSED(os);
+ }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp
deleted file mode 100644
index 4aabd957cd..0000000000
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_cache_oblivious.hpp
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "pool_common.hpp"
-
-#include <stack>
-#include <vector>
-
-namespace arm_conv {
-namespace pooling {
-
-template <class strategy>
-class PoolingDepthfirstCacheOblivious : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
-{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
-
- constexpr static unsigned int input_rows(void)
- {
- return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
- }
-
- constexpr static unsigned int input_cols(void)
- {
- return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
- }
-
- size_t sizeof_input_buffer(void) const
- {
- return sizeof(TInput) * m_args.n_channels;
- }
-
- size_t sizeof_output_buffer(void) const
- {
- return sizeof(TOutput) * m_args.n_channels;
- }
-
- public:
- PoolingDepthfirstCacheOblivious(const PoolingArgs &args) : m_args(args)
- {
- }
-
- PoolingDepthfirstCacheOblivious(PoolingDepthfirstCacheOblivious &) = delete;
- PoolingDepthfirstCacheOblivious &operator=(PoolingDepthfirstCacheOblivious &) = delete;
-
- size_t get_working_size(void) const override
- {
- // We require an array of pointers for the inputs and outputs, a
- // channel-length vector in which to dump surplus output, and a
- // channel-length vector of padding values.
- return sizeof_input_buffer() + sizeof_output_buffer();
- }
-
- void execute(
- const void *const input,
- void *const output,
- void *const working_space
- ) const override
- {
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space
- );
- }
-
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space
- );
- }
-
- void execute(
- unsigned int batches,
- unsigned int input_height,
- unsigned int input_width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space
- ) const override
- {
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input);
- TOutput *const outptr = static_cast<TOutput *>(_output);
-
- // Allocate portions of the working space
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
- TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + sizeof_output_buffer());
-
- // Fill the input buffer
- const TInput pad_value = (m_args.pool_type == PoolingType::AVERAGE)
- ? static_cast<TInput>(0)
- : (std::numeric_limits<TInput>::has_infinity
- ? -std::numeric_limits<TInput>::infinity()
- : std::numeric_limits<TInput>::lowest());
- for (unsigned int i = 0; i < channels; i++)
- {
- input_buffer[i] = pad_value;
- }
-
- // Keep subdividing the output plane across the longest dimension until we
- // reach the size of the tile. Queue items for later processing. Note - we
- // can determine the largest size of the queue a priori from the input
- // tensor size, this would allow us to allocate memory within the working
- // space and improve performance.
- struct WorkItem
- {
- unsigned int output_i, output_j;
- unsigned int output_height, output_width;
-
- WorkItem(unsigned int i, unsigned int j, unsigned int height, unsigned int width)
- : output_i(i), output_j(j), output_height(height), output_width(width) {}
- };
-
- auto execute = [&] (const WorkItem &item) {
- // Create an array for the output pointers
- TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
- TOutput **const outptr_array = _outptr_array;
-
- // Construct the output pointer array
- {
- const auto output_pad_right = strategy::out_rows() - item.output_width;
- auto outptr_element = outptr_array;
- auto outptr_row = outptr + item.output_i * ld_output_row + item.output_j * ld_output_col;
-
- // Fill the array with pointers to the output buffer
- for (unsigned int i = 0; i < strategy::out_rows() * strategy::out_cols(); i++)
- {
- outptr_array[i] = output_buffer;
- }
-
- // Fill in the valid portion of the array
- for (unsigned int i = 0; i < item.output_height; i++)
- {
- auto outptr_col = outptr_row;
- for (unsigned int j = 0; j < item.output_width; j++)
- {
- *(outptr_element++) = outptr_col;
- outptr_col += ld_output_col;
- }
- outptr_element += output_pad_right;
- outptr_row += ld_output_row;
- }
- }
-
- const int start_i = item.output_i * strategy::stride_rows() - padding.top;
- const int end_i = start_i + input_rows();
- const unsigned int pad_top = std::max(0, 0 - start_i);
- const unsigned int pad_bottom = std::max(0, end_i - static_cast<int>(input_height));
-
- const int start_j = item.output_j * strategy::stride_cols() - padding.left;
- const int end_j = start_j + input_cols();
- const unsigned int pad_left = std::max(0, 0 - start_j);
- const unsigned int pad_right = std::max(0, end_j - static_cast<int>(input_width));
-
- // Create an array for the input pointers
- const TInput * _inptr_array[input_rows() * input_cols()];
- const TInput **const inptr_array = _inptr_array;
- {
- const unsigned int row_padding = pad_top + pad_bottom;
- const unsigned int valid_rows = input_rows() - row_padding;
-
- const unsigned int col_padding = pad_left + pad_right;
- const unsigned int valid_cols = input_cols() - col_padding;
-
- // Fill the array with pointers to the input buffer
- for (unsigned int i = 0; i < input_rows() * input_cols(); i++)
- {
- inptr_array[i] = input_buffer;
- }
-
- // Compute valid initial pointer
- auto inptr_row = inptr + std::max(start_i, 0) * ld_input_row + std::max(start_j, 0) * ld_input_col;
-
- // Fill in the valid portion of the input array
- auto inptr_element = inptr_array + pad_top * input_cols() + pad_left;
- for (unsigned int i = 0; i < valid_rows; i++)
- {
- auto inptr_col = inptr_row;
- for (unsigned int j = 0; j < valid_cols; j++)
- {
- *(inptr_element++) = inptr_col;
- inptr_col += ld_input_col;
- }
-
- inptr_row += ld_input_row;
- inptr_element += col_padding; // Skip the padding elements
- }
- }
-
- // Call the kernel
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(item.output_height * item.output_width * strategy::pool_rows() * strategy::pool_cols()));
-#endif // CYCLE_PROFILING
- strat.kernel(channels, inptr_array, outptr_array,
- pad_left, pad_top, pad_right, pad_bottom);
- };
-
- // Add the initial work item to the stack of work.
- std::stack<WorkItem, std::vector<WorkItem>> stack;
- stack.push(WorkItem(0, 0, output_height, output_width));
- while (!stack.empty())
- {
- // Pop an item from the stack, bisect the largest dimension and either
- // execute the resulting tiles or add them to the stack if they are too
- // large.
- const WorkItem item(stack.top());
- stack.pop();
-
- if (item.output_height <= strategy::out_rows() &&
- item.output_width <= strategy::out_cols())
- {
- execute(item);
- }
- else
- {
- // Split the largest dimension, such that we get an exact number of
- // tiles in the first partition.
- if (item.output_height >= item.output_width)
- {
- const unsigned int height_in_tiles = (item.output_height + strategy::out_rows() - 1) / strategy::out_rows();
- const unsigned int tiles_first = height_in_tiles - height_in_tiles / 2;
-
- const unsigned int height_first = tiles_first * strategy::out_rows();
- const unsigned int height_second = item.output_height - height_first;
-
- stack.push(WorkItem(item.output_i + height_first, item.output_j, height_second, item.output_width));
- stack.push(WorkItem(item.output_i, item.output_j, height_first, item.output_width));
- }
- else
- {
- const unsigned int width_in_tiles = item.output_width / strategy::out_cols();
- const unsigned int tiles_first = width_in_tiles - width_in_tiles / 2;
-
- const unsigned int width_first = tiles_first * strategy::out_cols();
- const unsigned int width_second = item.output_width - width_first;
-
- stack.push(WorkItem(item.output_i, item.output_j + width_first, item.output_height, width_second));
- stack.push(WorkItem(item.output_i, item.output_j, item.output_height, width_first));
- }
- }
- }
- }
-};
-
-} // namespace pooling
-} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
index 5979862ed8..ded2c75127 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,236 +24,264 @@
#pragma once
-#include "pool_common.hpp"
+#include "depthfirst_driver.hpp"
#include "utils.hpp"
+#if !defined(_WIN64) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
namespace arm_conv {
namespace pooling {
-template <class strategy>
-class PoolingDepthfirstGeneric : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
+template <typename TInput, typename TOutput, typename OutputStage = Nothing>
+class IGenericDepthfirstStrategy;
+
+template <typename TInput, typename TOutput>
+class IGenericDepthfirstStrategy<TInput, TOutput, Nothing>
{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
+ public:
+ virtual ~IGenericDepthfirstStrategy() = default;
- const PoolingArgs m_args; // Copy of arguments
+ typedef void (*KernelType)(
+ uint64_t window_cells,
+ uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const TInput *const *,
+ TOutput *
+ );
- unsigned int input_rows(void) const
- {
- return m_args.pool_window.rows;
- }
-
- unsigned int input_cols(void) const
- {
- return m_args.pool_window.cols;
- }
+ virtual KernelType get_kernel(void) const = 0;
+};
+template <typename TInput, typename TOutput>
+class IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>
+{
public:
- PoolingDepthfirstGeneric(const PoolingArgs &args) : m_args(args)
- {
- }
+ virtual ~IGenericDepthfirstStrategy() = default;
+
+ typedef void (*KernelType)(
+ uint64_t window_cells,
+ uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const TInput *const *,
+ TOutput *,
+ const Requantize32 &
+ );
+
+ virtual KernelType get_kernel(void) const = 0;
+};
- PoolingDepthfirstGeneric(PoolingDepthfirstGeneric &) = delete;
- PoolingDepthfirstGeneric &operator=(PoolingDepthfirstGeneric &) = delete;
+template <typename TInput, typename TOutput, typename OutputStage>
+struct Invoker;
- size_t sizeof_input_pointer_array(void) const
+template <typename TInput, typename TOutput>
+struct Invoker<TInput, TOutput, Nothing>
+{
+ static inline void invoke(
+ const typename IGenericDepthfirstStrategy<TInput, TOutput, Nothing>::KernelType kern,
+ uint64_t window_cells,
+ uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const TInput *const *inptrs,
+ TOutput *outptr,
+ const Nothing &
+ )
{
- return sizeof(TInput *) * input_rows() * input_cols();
+ kern(window_cells, n_valid_cells, n_channels, inptrs, outptr);
}
+};
- size_t get_working_size(unsigned int num_threads) const override
+template <typename TInput, typename TOutput>
+struct Invoker<TInput, TOutput, Requantize32>
+{
+ static inline void invoke(
+ const typename IGenericDepthfirstStrategy<TInput, TOutput, Requantize32>::KernelType kern,
+ uint64_t window_cells,
+ uint64_t n_valid_cells,
+ uint64_t n_channels,
+ const TInput *const *inptrs,
+ TOutput *outptr,
+ const Requantize32 &qp
+ )
{
- return num_threads * sizeof_input_pointer_array();
+ kern(window_cells, n_valid_cells, n_channels, inptrs, outptr, qp);
}
+};
- void execute(
- const void *const input,
- void *const output,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
+template <typename TInput, typename TOutput, typename OutputStage>
+class GenericDepthfirstWrapper : public IDepthfirstStrategy
+{
+ using StratType = IGenericDepthfirstStrategy<TInput, TOutput, OutputStage>;
+
+ std::unique_ptr<const StratType> m_strat;
+ const unsigned int window_rows, window_cols;
+
+ public:
+ GenericDepthfirstWrapper(const StratType *strat, const PoolingArgs &args)
+ : m_strat(strat), window_rows(args.pool_window.rows), window_cols(args.pool_window.cols)
{
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
}
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
+ unsigned int get_input_rows(void) const override { return window_rows; }
+ unsigned int get_input_cols(void) const override { return window_cols; }
+ unsigned int get_output_rows(void) const override { return 1; }
+ unsigned int get_output_cols(void) const override { return 1; }
+
+ typename StratType::KernelType get_kernel(void) const { return m_strat->get_kernel(); }
+};
+
+template <typename TInput, typename TOutput=TInput, typename OutputStage=Nothing>
+class PoolingDepthfirstGeneric : public DepthfirstDriver<TInput, TOutput>
+{
+ const OutputStage m_os;
+
+ protected:
+ size_t get_working_size_per_thread() const override { return 0; }
+ void initialise_working_space(void *) const override { /* Nothing */ }
+
+ /* Compute a portion of the output tensor with padding. */
+ void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int channel_start, unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *
) const override
{
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
+ // Determine start position and padding
+ const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
+ const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
+ const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
+ const int end_i = start_i + this->m_args.pool_window.rows;
+ const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
+ const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);
+
+ const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
+ const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
+ const auto pad_left = static_cast<unsigned int>(start_j < 0 ? -start_j : 0);
+ const int end_j = start_j + this->m_args.pool_window.cols;
+ const auto pad_right = static_cast<unsigned int>((unsigned int) end_j < this->m_args.input_cols ? 0 : end_j - this->m_args.input_cols);
+ const auto valid_cols = this->m_args.pool_window.cols - (pad_left + pad_right);
+
+ // Determine the number of valid cells and prepare the pointers
+ const auto n_valid_cells = valid_rows * valid_cols;
+ auto inptrs = reinterpret_cast<const TInput **>(alloca(n_valid_cells * sizeof(TInput *)));
+ {
+ auto my_ptr = inptrs;
+ auto row_ptr = input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start;
+ for (auto i = valid_rows; i; i--)
+ {
+ auto ptr = row_ptr;
+ row_ptr += input.ld_row;
+
+ for (auto j = valid_cols; j; j--)
+ {
+ *(my_ptr++) = ptr;
+ ptr += input.ld_col;
+ }
+ }
+ }
+
+ auto outptr = output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start;
+
+ // Some padding variants include (or exclude) the padding values; we handle
+ // this by computing the extent of the padded input tensor and hence
+ // computing the total number of cells captured in the pooling window.
+ const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
+ const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
+ const auto right_padded_width = this->m_args.input_cols + this->m_args.padding.right;
+ const auto captured_cols = std::min<int>(end_j, right_padded_width) - start_j;
+ const auto captured_cells = captured_rows * captured_cols;
+ const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
+
+ // Execute the kernel
+ Invoker<TInput, TOutput, OutputStage>::invoke(
+ reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(),
+ window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
);
}
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space,
- unsigned int thread_id,
- unsigned int num_threads
+ // Compute a portion of the work with only top/bottom padding.
+ void compute_row_padded_tile_row(
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int channel_start, const unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ void *
) const override
{
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- const unsigned int roundup_output_rows = roundup(output_height, num_threads);
- const unsigned int rows_per_thread = roundup_output_rows / num_threads;
- int start_out_height = static_cast<int>(thread_id * rows_per_thread);
- int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
-
- unsigned int start_channel = 0;
- unsigned int end_channel = channels;
- if(output_height == 1)
+ // Determine start position and padding
+ const int start_i = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
+ const auto input_i = static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
+ const auto pad_top = static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
+ const int end_i = start_i + this->m_args.pool_window.rows;
+ const auto pad_bottom = static_cast<unsigned int>((unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
+ const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);
+
+ const int start_j = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
+ const auto input_j = static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
+ const auto valid_cols = this->m_args.pool_window.cols;
+
+ // Determine the number of valid cells and prepare the pointers
+ const auto n_valid_cells = valid_rows * valid_cols;
+ auto inptrs = reinterpret_cast<const TInput **>(alloca(n_valid_cells * sizeof(TInput *)));
{
- const unsigned int channels_per_thread = roundup(channels, num_threads) / num_threads;
- start_channel = thread_id * channels_per_thread;
- end_channel = std::min(start_channel + channels_per_thread, channels);
-
- // Reset start and end rows
- start_out_height = 0;
- end_out_height = output_height;
- }
+ auto my_ptr = inptrs;
+ auto row_ptr = input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start;
+ for (auto i = valid_rows; i; i--)
+ {
+ auto ptr = row_ptr;
+ row_ptr += input.ld_row;
- if(start_channel >= end_channel)
- {
- // Early exit in case of multiple threads parallelising on channels
- return;
+ for (auto j = valid_cols; j; j--)
+ {
+ *(my_ptr++) = ptr;
+ ptr += input.ld_col;
+ }
+ }
}
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input) + start_channel;
- TOutput *const outptr = static_cast<TOutput *>(_output) + start_channel;
+ auto outptr = output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start;
- // Grab the input pointer array
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space + thread_id * sizeof_input_pointer_array());
+ // Some padding variants include (or exclude) the padding values; we handle
+ // this by computing the extent of the padded input tensor and hence
+ // computing the total number of cells captured in the pooling window.
+ const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
+ const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
+ const auto captured_cells = captured_rows * valid_cols;
+ const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
+ for (; n_tile_cols; n_tile_cols--)
{
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- auto outptr_row = outptr + batch * ld_output_batch + start_out_height * ld_output_row;
-
- for (int out_i = start_out_height; out_i < end_out_height; out_i++)
+ // Execute the kernel
+ Invoker<TInput, TOutput, OutputStage>::invoke(
+ reinterpret_cast<const GenericDepthfirstWrapper<TInput, TOutput, OutputStage> *>(this->m_strat.get())->get_kernel(),
+ window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
+ );
+
+ // Update the pointers; the output strides by a column and the inputs
+ // stride by a number of columns.
+ outptr += output.ld_col;
+ for (auto n = 0u; n < n_valid_cells; n++)
{
- const int start_in_i = out_i * m_args.pool_stride.rows - padding.top;
- const int end_in_i = start_in_i + m_args.pool_window.rows;
-
- // Compute top/bottom padding
- const auto pad_top = static_cast<unsigned int>(std::max(0 - start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(std::max<int>(end_in_i - height, 0));
- const auto valid_rows = input_rows() - pad_top - pad_bottom;
-
- // Compute the number of pooling window rows which are contained in
- // either the valid region of the input tensor, or the padding.
- const auto padded_bottom = std::min<unsigned int>(
- start_in_i + m_args.pool_window.rows, height + padding.bottom
- );
- const auto n_total_rows = padded_bottom - start_in_i;
-
- auto outptr_col = outptr_row;
- auto inptr_row = inptr_batch + (start_in_i + pad_top) * ld_input_row;
-
- for (int out_j = 0, start_in_j = -padding.left;
- out_j < static_cast<int>(output_width);
- out_j++, start_in_j += m_args.pool_stride.cols)
- {
- const int end_in_j = start_in_j + m_args.pool_window.cols;
-
- // Compute left/right padding
- const auto pad_left = static_cast<unsigned int>(std::max(0 - start_in_j, 0));
- const auto pad_right = static_cast<unsigned int>(std::max<int>(0, end_in_j - width));
- const auto valid_cols = input_cols() - pad_left - pad_right;
-
- // Compute the number of pooling window columns which are contained
- // in either the valid region of the input tensor, or the padding.
- const auto padded_right = std::min<unsigned int>(
- start_in_j + m_args.pool_window.cols, width + padding.right
- );
- const auto n_total_cols = padded_right - start_in_j;
-
- // Construct the input pointer array - fill in all valid points
- // contiguously.
- const TInput **ptrs = inptr_array;
- const TInput *rowptr = inptr_row + (start_in_j + pad_left) * ld_input_col;
- for (auto i = 0u; i < valid_rows; i++)
- {
- const TInput *colptr = rowptr;
- for (auto j = 0u; j < valid_cols; j++)
- {
- *(ptrs++) = colptr;
- colptr += ld_input_col;
- }
- rowptr += ld_input_row;
- }
-
- // Compute the number of valid cells
- const auto valid_cells = valid_rows * valid_cols;
- const auto cells_in_range = n_total_rows * n_total_cols;
- const auto window_cells = m_args.exclude_padding ? valid_cells : cells_in_range;
-
- // Get the output pointer for this call
- TOutput *outptr = outptr_col;
- outptr_col += ld_output_col;
-
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::out_rows() * strategy::out_cols() * strategy::pool_rows() * strategy::pool_cols()));
-#endif // CYCLE_PROFILING
- strat.kernel(window_cells, valid_cells, end_channel - start_channel, inptr_array, outptr);
- }
-
- outptr_row += ld_output_row;
+ inptrs[n] += this->m_args.pool_stride.cols * input.ld_col;
}
}
}
+
+ public:
+ PoolingDepthfirstGeneric(
+ const IGenericDepthfirstStrategy<TInput, TOutput, OutputStage> *strat,
+ const PoolingArgs &args,
+ const OutputStage &os = {}
+ )
+ : DepthfirstDriver<TInput, TOutput>(
+ new GenericDepthfirstWrapper<TInput, TOutput, OutputStage>(strat, args),
+ args
+ ),
+ m_os(os)
+ {
+ }
};
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp
deleted file mode 100644
index f3cb9a1d1f..0000000000
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_depthfirst_generic_quantized.hpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "pool_common.hpp"
-#include "utils.hpp"
-
-namespace arm_conv {
-namespace pooling {
-
-template <class strategy>
-class PoolingDepthfirstGenericQuantized : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type, Requantize32>
-{
- using TInput = typename strategy::operand_type;
- using TOutput = typename strategy::return_type;
-
- const PoolingArgs m_args; // Copy of arguments
- const Requantize32 m_requant; // Quantization parameters
-
- unsigned int input_rows(void) const
- {
- return m_args.pool_window.rows;
- }
-
- unsigned int input_cols(void) const
- {
- return m_args.pool_window.cols;
- }
-
- public:
- PoolingDepthfirstGenericQuantized(const PoolingArgs &args, const Requantize32 &rq) : m_args(args), m_requant(rq)
- {
- }
-
- PoolingDepthfirstGenericQuantized(PoolingDepthfirstGenericQuantized &) = delete;
- PoolingDepthfirstGenericQuantized &operator=(PoolingDepthfirstGenericQuantized &) = delete;
-
- size_t sizeof_input_pointer_array(void) const
- {
- return sizeof(TInput *) * input_rows() * input_cols();
- }
-
- size_t get_working_size(unsigned int num_threads) const override
- {
- return num_threads * sizeof_input_pointer_array();
- }
-
- void execute(
- const void *const input,
- void *const output,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- const size_t ld_input_col = m_args.n_channels;
- const size_t ld_input_row = ld_input_col * m_args.input_cols;
- const size_t ld_input_batch = ld_input_row * m_args.input_rows;
- const size_t ld_output_col = ld_input_col;
- const size_t ld_output_row = ld_output_col * m_args.output_cols;
- const size_t ld_output_batch = ld_output_row * m_args.output_rows;
-
- execute(
- input, ld_input_col, ld_input_row, ld_input_batch,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
- }
-
- void execute(
- const void *const input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- void *const output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- execute(
- m_args.n_batches, m_args.input_rows, m_args.input_cols,
- m_args.n_channels,
- input, ld_input_col, ld_input_row, ld_input_batch,
- m_args.padding,
- m_args.output_rows, m_args.output_cols,
- output, ld_output_col, ld_output_row, ld_output_batch,
- working_space,
- thread_id, num_threads
- );
- }
-
- void execute(
- unsigned int batches,
- unsigned int height,
- unsigned int width,
- unsigned int channels,
- const void *const _input,
- size_t ld_input_col,
- size_t ld_input_row,
- size_t ld_input_batch,
- const PaddingValues &padding,
- unsigned int output_height,
- unsigned int output_width,
- void *const _output,
- size_t ld_output_col,
- size_t ld_output_row,
- size_t ld_output_batch,
- void *const _working_space,
- unsigned int thread_id,
- unsigned int num_threads
- ) const override
- {
- strategy strat(m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif // CYCLE_PROFILING
-
- const unsigned int roundup_output_rows = roundup(output_height, num_threads);
- const unsigned int rows_per_thread = roundup_output_rows / num_threads;
- int start_out_height = static_cast<int>(thread_id * rows_per_thread);
- int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
-
- unsigned int start_channel = 0;
- unsigned int end_channel = channels;
- if(output_height == 1)
- {
- const unsigned int channels_per_thread = roundup(channels, num_threads) / num_threads;
- start_channel = thread_id * channels_per_thread;
- end_channel = std::min(start_channel + channels_per_thread, channels);
-
- // Reset start and end rows
- start_out_height = 0;
- end_out_height = output_height;
- }
-
- if(start_channel >= end_channel)
- {
- // Early exit in case of multiple threads parallelising on channels
- return;
- }
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input) + start_channel;
- TOutput *const outptr = static_cast<TOutput *>(_output) + start_channel;
-
- // Grab the input pointer array
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space + thread_id * sizeof_input_pointer_array());
-
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
- {
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- const auto outptr_batch = outptr + batch * ld_output_batch;
-
- for (int out_i = start_out_height; out_i < end_out_height; out_i++)
- {
- const int start_in_i = out_i * m_args.pool_stride.rows - padding.top;
- const int end_in_i = start_in_i + m_args.pool_window.rows;
-
- // Compute top/bottom padding
- const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(height) - end_in_i, 0));
-
- // Compute the number of pooling window rows which are contained in
- // either the valid region of the input tensor, or the padding.
- const auto padded_bottom = std::min<unsigned int>(
- start_in_i + m_args.pool_window.rows, height + padding.bottom
- );
- const auto n_total_rows = padded_bottom - start_in_i;
-
- for (int out_j = 0, start_in_j = -padding.left;
- out_j < static_cast<int>(output_width);
- out_j++, start_in_j += m_args.pool_stride.cols)
- {
- const int end_in_j = start_in_j + m_args.pool_window.cols;
-
- // Compute left/right padding
- const auto pad_left = static_cast<unsigned int>(-std::min(start_in_j, 0));
- const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(width) - end_in_j, 0));
-
- // Compute the number of pooling window columns which are contained
- // in either the valid region of the input tensor, or the padding.
- const auto padded_right = std::min<unsigned int>(
- start_in_j + m_args.pool_window.cols, width + padding.right
- );
- const auto n_total_cols = padded_right - start_in_j;
-
- // Construct the input pointer array - fill in all valid points
- // contiguously.
- const TInput **ptrs = inptr_array;
- for (auto i = pad_top; i < input_rows() - pad_bottom; i++)
- {
- // Can skip over the left padding because we will have either the
- // same or less than the previous tile.
- unsigned int j = pad_left;
- const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
- for (; j < input_cols() - pad_right; j++)
- {
- *(ptrs++) = colptr;
- colptr += ld_input_col;
- }
- }
-
- // Compute the number of valid cells
- const auto valid_rows = input_rows() - pad_top - pad_bottom;
- const auto valid_cols = input_cols() - pad_left - pad_right;
- const auto valid_cells = valid_rows * valid_cols;
- const auto cells_in_range = n_total_rows * n_total_cols;
- const auto window_cells = m_args.exclude_padding ? valid_cells : cells_in_range;
-
- // Get the output pointer for this call
- TOutput *outptr = outptr_batch + out_i * ld_output_row + out_j * ld_output_col;
-
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long) 0);
-#endif
- strat.kernel(window_cells, valid_cells, end_channel - start_channel, inptr_array, outptr, m_requant);
- }
- }
- }
- }
-};
-
-} // namespace pooling
-} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp
index 094c6aa301..a7f3dd3a93 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,12 +33,18 @@
#include "kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_fp16_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_fp16_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_fp16_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/sve_fp16_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_fp16_nhwc_avg_generic_depthfirst.hpp"
@@ -48,19 +54,6 @@
namespace arm_conv {
namespace pooling {
-namespace
-{
- template <class Strategy>
- bool is_supported(const PoolingArgs &args, const Nothing &)
- {
- return ((args.pool_type == Strategy::pooling_type()) &&
- (args.pool_window.rows == Strategy::pool_rows()) &&
- (args.pool_window.cols == Strategy::pool_cols()) &&
- (args.pool_stride.rows == Strategy::stride_rows()) &&
- (args.pool_stride.cols == Strategy::stride_cols()));
- }
-}
-
static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
{
PoolingMethod::DEPTHFIRST,
@@ -70,48 +63,115 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<__fp16>>(args);
+ auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<__fp16>(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
+ auto strat = new sme_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
+ auto strat = new sme_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp16_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
+ auto strat = new sme_fp16_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp16_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::MAX;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
+ auto strat = new sme_fp16_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst",
- is_supported<sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirst<sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst",
- is_supported<sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirst<sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp16_nhwc_avg_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::AVERAGE;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<sve_fp16_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new sve_fp16_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp16_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<sve_fp16_nhwc_max_generic_depthfirst>(args);
+ auto strat = new sve_fp16_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
PoolingMethod::DEPTHFIRST,
@@ -119,7 +179,8 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
is_supported<a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirst<a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
},
},
{
@@ -128,7 +189,8 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
is_supported<a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirst<a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<__fp16>(strat, args);
},
},
{
@@ -137,7 +199,8 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<a64_fp16_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new a64_fp16_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
{
@@ -146,7 +209,8 @@ static const PoolingImplementation<__fp16, __fp16> pooling_fp16_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<__fp16, __fp16> * {
- return new PoolingDepthfirstGeneric<a64_fp16_nhwc_max_generic_depthfirst>(args);
+ auto strat = new a64_fp16_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<__fp16>(strat, args);
},
},
#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp
index 002115d78c..99d106583e 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,12 +30,18 @@
#include "kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_fp32_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_fp32_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_fp32_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/sve_fp32_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_fp32_nhwc_avg_generic_depthfirst.hpp"
@@ -45,19 +51,6 @@
namespace arm_conv {
namespace pooling {
-namespace
-{
- template <class Strategy>
- bool is_supported(const PoolingArgs &args, const Nothing &)
- {
- return ((args.pool_type == Strategy::pooling_type()) &&
- (args.pool_window.rows == Strategy::pool_rows()) &&
- (args.pool_window.cols == Strategy::pool_cols()) &&
- (args.pool_stride.rows == Strategy::stride_rows()) &&
- (args.pool_stride.cols == Strategy::stride_cols()));
- }
-}
-
static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
{
PoolingMethod::DEPTHFIRST,
@@ -67,55 +60,123 @@ static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<float>>(args);
+ auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<float>(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float, float, Nothing>(strat, args);
},
},
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
+ auto strat = new sme_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
+ auto strat = new sme_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp32_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
+ auto strat = new sme_fp32_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_fp32_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::MAX;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
+ auto strat = new sme_fp32_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst",
- is_supported<sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirst<sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst",
- is_supported<sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args, os);
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirst<sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp32_nhwc_avg_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::AVERAGE;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<sve_fp32_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new sve_fp32_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_fp32_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<sve_fp32_nhwc_max_generic_depthfirst>(args);
+ auto strat = new sve_fp32_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst",
is_supported<a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirst<a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_fp32_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
},
},
{
@@ -124,7 +185,8 @@ static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
is_supported<a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirst<a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_fp32_nhwc_avg_3x3_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<float>(strat, args);
},
},
{
@@ -133,7 +195,8 @@ static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<a64_fp32_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new a64_fp32_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
},
},
{
@@ -142,7 +205,8 @@ static const PoolingImplementation<float, float> pooling_fp32_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<float, float> * {
- return new PoolingDepthfirstGeneric<a64_fp32_nhwc_max_generic_depthfirst>(args);
+ auto strat = new a64_fp32_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<float>(strat, args);
},
},
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp
index 3d968b84e5..235aa1b635 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_implementation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,7 +39,7 @@ struct PoolingImplementation
const char * name;
std::function<bool(const PoolingArgs &, const OutputStage &)> is_supported;
std::function<uint64_t(const PoolingArgs &, const OutputStage &)> cycle_estimate;
- std::function<PoolingCommon<TInput, TOutput, OutputStage> *(const PoolingArgs &, const OutputStage &)> initialise;
+ std::function<PoolingCommon<TInput, TOutput> *(const PoolingArgs &, const OutputStage &)> initialise;
bool get_is_supported(const PoolingArgs &args, const OutputStage &os) const
{
@@ -51,12 +51,15 @@ struct PoolingImplementation
return (cycle_estimate == nullptr) ? 0 : cycle_estimate(args, os);
}
- PoolingCommon<TInput, TOutput, OutputStage> *get_instance(const PoolingArgs &args, const OutputStage &os) const
+ PoolingCommon<TInput, TOutput> *get_instance(const PoolingArgs &args, const OutputStage &os) const
{
return initialise(args, os);
}
};
+/**
+ * \relates PoolingImplementation
+ */
template <typename TInput, typename TOutput, class OutputStage = Nothing>
const PoolingImplementation<TInput, TOutput, OutputStage> *pooling_implementation_list();
@@ -92,11 +95,21 @@ bool find_implementation(
}
template <typename TInput, typename TOutput, class OutputStage>
-UniquePoolingCommon<TInput, TOutput, OutputStage> pooling(const PoolingArgs &args, const OutputStage &os)
+UniquePoolingCommon<TInput, TOutput> pooling(const PoolingArgs &args, const OutputStage &os)
{
const PoolingImplementation<TInput, TOutput, OutputStage> *impl = nullptr;
const bool success = find_implementation<TInput, TOutput, OutputStage>(args, os, impl);
- return UniquePoolingCommon<TInput, TOutput, OutputStage>(success ? impl->get_instance(args, os) : nullptr);
+ return UniquePoolingCommon<TInput, TOutput>(success ? impl->get_instance(args, os) : nullptr);
+}
+
+template <class Strategy>
+bool is_supported(const PoolingArgs &args, const Nothing &)
+{
+ return ((args.pool_type == Strategy::pooling_type) &&
+ (args.pool_window.rows == Strategy::pool_rows) &&
+ (args.pool_window.cols == Strategy::pool_cols) &&
+ (args.pool_stride.rows == Strategy::stride_rows) &&
+ (args.pool_stride.cols == Strategy::stride_cols));
}
} // namespace pooling
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp
index 490fc0d863..8d08ddc43f 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_s8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,13 +30,16 @@
#include "kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-#if defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_s8_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_s8_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_s8_nhwc_avg_generic_depthfirst.hpp"
-#endif // defined(SVE2)
#include "kernels/sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_s8_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_s8_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/a64_s8_nhwc_max_generic_depthfirst.hpp"
@@ -47,19 +50,6 @@
namespace arm_conv {
namespace pooling {
-namespace
-{
- template <class Strategy>
- bool is_supported(const PoolingArgs &args, const Nothing &)
- {
- return ((args.pool_type == Strategy::pooling_type()) &&
- (args.pool_window.rows == Strategy::pool_rows()) &&
- (args.pool_window.cols == Strategy::pool_cols()) &&
- (args.pool_stride.rows == Strategy::stride_rows()) &&
- (args.pool_stride.cols == Strategy::stride_cols()));
- }
-}
-
static const PoolingImplementation<int8_t, int8_t> pooling_s8_methods[] = {
{
PoolingMethod::DEPTHFIRST,
@@ -69,48 +59,97 @@ static const PoolingImplementation<int8_t, int8_t> pooling_s8_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<int8_t>>(args);
+ auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<int8_t>(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-#if defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
{
PoolingMethod::DEPTHFIRST,
- "sve_s8_nhwc_avg_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
+ "sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sme_s8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<int8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_s8_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sme_s8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_s8_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<sve_s8_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new sme_s8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
-#endif // defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst",
- is_supported<sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<int8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sve_s8_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::AVERAGE;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirst<sve_s8_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_s8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_s8_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<sve_s8_nhwc_max_generic_depthfirst>(args);
+ auto strat = new sve_s8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst",
is_supported<a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirst<a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_s8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<int8_t>(strat, args);
},
},
{
@@ -119,7 +158,8 @@ static const PoolingImplementation<int8_t, int8_t> pooling_s8_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::AVERAGE; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<a64_s8_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new a64_s8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
{
@@ -128,7 +168,8 @@ static const PoolingImplementation<int8_t, int8_t> pooling_s8_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<int8_t, int8_t> * {
- return new PoolingDepthfirstGeneric<a64_s8_nhwc_max_generic_depthfirst>(args);
+ auto strat = new a64_s8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t>(strat, args);
},
},
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp
index fd4e045035..dcb3c8f57c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_s8q.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,17 @@
#include "arm_gemm_local.hpp"
#include "pooling_implementation.hpp"
-#include "pooling_depthfirst_generic_quantized.hpp"
+#include "pooling_depthfirst_generic.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_s8q_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_s8q_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_s8q_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/sve_s8q_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_s8q_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/a64_s8q_nhwc_max_generic_depthfirst.hpp"
#endif // defined(__aarch64__)
@@ -41,30 +45,60 @@
namespace arm_conv {
namespace pooling {
-static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_u8_methods[] = {
+static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_s8q_methods[] = {
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_s8q_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sme_s8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_s8q_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::MAX;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sme_s8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_s8q_nhwc_avg_generic_depthfirst",
[] (const PoolingArgs &args, const Requantize32 &) -> bool {
- return args.pool_type == PoolingType::AVERAGE;
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::AVERAGE;
},
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<sve_s8q_nhwc_avg_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_s8q_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Requantize32 &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<sve_s8q_nhwc_max_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new sve_s8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
},
},
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_s8q_nhwc_avg_generic_depthfirst",
@@ -72,8 +106,9 @@ static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_u8_meth
return args.pool_type == PoolingType::AVERAGE;
},
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<a64_s8q_nhwc_avg_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
},
},
{
@@ -81,8 +116,9 @@ static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_u8_meth
"a64_s8q_nhwc_max_generic_depthfirst",
[] (const PoolingArgs &args, const Requantize32 &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<a64_s8q_nhwc_max_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<int8_t, int8_t> * {
+ auto strat = new a64_s8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<int8_t, int8_t, Requantize32>(strat, args, rq);
},
},
#endif // defined(__aarch64__)
@@ -92,10 +128,10 @@ static const PoolingImplementation<int8_t, int8_t, Requantize32> pooling_u8_meth
template <>
const PoolingImplementation<int8_t, int8_t, Requantize32> *pooling_implementation_list()
{
- return pooling_u8_methods;
+ return pooling_s8q_methods;
}
-template UniquePoolingCommon<int8_t, int8_t, Requantize32> pooling(const PoolingArgs &, const Requantize32 &);
+template UniquePoolingCommon<int8_t, int8_t> pooling(const PoolingArgs &, const Requantize32 &);
} // namespace pooling
} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp
index 052354922e..ee5a79b4ff 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_u8.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,13 +30,16 @@
#include "kernels/cpp_nhwc_1x1_stride_any_depthfirst.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-#if defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_u8_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
+#include "kernels/sme_u8_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_u8_nhwc_avg_generic_depthfirst.hpp"
-#endif // defined(SVE2)
#include "kernels/sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/sve_u8_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst.hpp"
#include "kernels/a64_u8_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/a64_u8_nhwc_max_generic_depthfirst.hpp"
@@ -47,19 +50,6 @@
namespace arm_conv {
namespace pooling {
-namespace
-{
- template <class Strategy>
- bool is_supported(const PoolingArgs &args, const Nothing &)
- {
- return ((args.pool_type == Strategy::pooling_type()) &&
- (args.pool_window.rows == Strategy::pool_rows()) &&
- (args.pool_window.cols == Strategy::pool_cols()) &&
- (args.pool_stride.rows == Strategy::stride_rows()) &&
- (args.pool_stride.cols == Strategy::stride_cols()));
- }
-}
-
static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
{
PoolingMethod::DEPTHFIRST,
@@ -69,15 +59,28 @@ static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<cpp_nhwc_1x1_stride_any_depthfirst<uint8_t>>(args);
+ auto strat = new cpp_nhwc_1x1_stride_any_depthfirst<uint8_t>(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE)
-#if defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
{
PoolingMethod::DEPTHFIRST,
- "sve_u8_nhwc_avg_generic_depthfirst",
+ "sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sme() &&
+ is_supported<sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sme_u8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<uint8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_u8_nhwc_avg_generic_depthfirst",
[] (const PoolingArgs &args, const Nothing &) -> bool {
// This kernel can only be used when there is either no padding, or we don't care
// about the value of the padding. Otherwise, we would need to pass in the zero-point
@@ -85,40 +88,82 @@ static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
return (args.exclude_padding ||
(args.padding.top == 0 && args.padding.bottom == 0 &&
args.padding.left == 0 && args.padding.right == 0)
- ) && args.pool_type == PoolingType::AVERAGE;
+ ) && args.pool_type == PoolingType::AVERAGE &&
+ args.cpu_info->has_sme2();
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sme_u8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_u8_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sme() && args.pool_type == PoolingType::MAX;
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<sve_u8_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new sme_u8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
-#endif // defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst",
- is_supported<sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst>,
+ [] (const PoolingArgs &args, const Nothing &os) -> bool {
+ return args.cpu_info->has_sve() &&
+ is_supported<sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst>(args, os);
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<uint8_t>(strat, args);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sve_u8_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ // This kernel can only be used when there is either no padding, or we don't care
+ // about the value of the padding. Otherwise, we would need to pass in the zero-point
+ // for the quantization regime.
+ return (args.exclude_padding ||
+ (args.padding.top == 0 && args.padding.bottom == 0 &&
+ args.padding.left == 0 && args.padding.right == 0)
+ ) && args.pool_type == PoolingType::AVERAGE &&
+ args.cpu_info->has_sve2();
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirst<sve_u8_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new sve_u8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_u8_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Nothing &) -> bool {
+ return args.cpu_info->has_sve() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<sve_u8_nhwc_max_generic_depthfirst>(args);
+ auto strat = new sve_u8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_SVE)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst",
is_supported<a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst>,
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirst<a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst>(args);
+ auto strat = new a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst(args.cpu_info);
+ return new PoolingDepthfirst<uint8_t>(strat, args);
},
},
{
@@ -135,7 +180,8 @@ static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
},
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<a64_u8_nhwc_avg_generic_depthfirst>(args);
+ auto strat = new a64_u8_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
{
@@ -144,7 +190,8 @@ static const PoolingImplementation<uint8_t, uint8_t> pooling_u8_methods[] = {
[] (const PoolingArgs &args, const Nothing &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
[] (const PoolingArgs &args, const Nothing &) -> PoolingCommon<uint8_t, uint8_t> * {
- return new PoolingDepthfirstGeneric<a64_u8_nhwc_max_generic_depthfirst>(args);
+ auto strat = new a64_u8_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t>(strat, args);
},
},
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp b/src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp
index 41303fb418..cd1b02889c 100644
--- a/src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp
+++ b/src/core/NEON/kernels/arm_conv/pooling/pooling_u8q.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,13 +25,17 @@
#include "arm_gemm_local.hpp"
#include "pooling_implementation.hpp"
-#include "pooling_depthfirst_generic_quantized.hpp"
+#include "pooling_depthfirst_generic.hpp"
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+#include "kernels/sme_u8q_nhwc_avg_generic_depthfirst.hpp"
+#include "kernels/sme_u8q_nhwc_max_generic_depthfirst.hpp"
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_u8q_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/sve_u8q_nhwc_max_generic_depthfirst.hpp"
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_u8q_nhwc_avg_generic_depthfirst.hpp"
#include "kernels/a64_u8q_nhwc_max_generic_depthfirst.hpp"
#endif // defined(__aarch64__)
@@ -41,30 +45,60 @@
namespace arm_conv {
namespace pooling {
-static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8_methods[] = {
+static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8q_methods[] = {
#if defined(__aarch64__)
-#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SME)
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_u8q_nhwc_avg_generic_depthfirst",
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::AVERAGE;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sme_u8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
+ },
+ },
+ {
+ PoolingMethod::DEPTHFIRST,
+ "sme_u8q_nhwc_max_generic_depthfirst",
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sme2() && args.pool_type == PoolingType::MAX;
+ },
+ nullptr,
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sme_u8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
+ },
+ },
+#endif // defined(ARM_COMPUTE_ENABLE_SME)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"sve_u8q_nhwc_avg_generic_depthfirst",
[] (const PoolingArgs &args, const Requantize32 &) -> bool {
- return args.pool_type == PoolingType::AVERAGE;
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::AVERAGE;
},
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<sve_u8q_nhwc_avg_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
},
},
{
PoolingMethod::DEPTHFIRST,
"sve_u8q_nhwc_max_generic_depthfirst",
- [] (const PoolingArgs &args, const Requantize32 &) -> bool { return args.pool_type == PoolingType::MAX; },
+ [] (const PoolingArgs &args, const Requantize32 &) -> bool {
+ return args.cpu_info->has_sve2() && args.pool_type == PoolingType::MAX;
+ },
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<sve_u8q_nhwc_max_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new sve_u8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
},
},
-#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
PoolingMethod::DEPTHFIRST,
"a64_u8q_nhwc_avg_generic_depthfirst",
@@ -72,8 +106,9 @@ static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8_me
return args.pool_type == PoolingType::AVERAGE;
},
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<a64_u8q_nhwc_avg_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_avg_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
},
},
{
@@ -81,8 +116,9 @@ static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8_me
"a64_u8q_nhwc_max_generic_depthfirst",
[] (const PoolingArgs &args, const Requantize32 &) -> bool { return args.pool_type == PoolingType::MAX; },
nullptr,
- [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t, Requantize32> * {
- return new PoolingDepthfirstGenericQuantized<a64_u8q_nhwc_max_generic_depthfirst>(args, rq);
+ [] (const PoolingArgs &args, const Requantize32 &rq) -> PoolingCommon<uint8_t, uint8_t> * {
+ auto strat = new a64_u8q_nhwc_max_generic_depthfirst(args.cpu_info);
+ return new PoolingDepthfirstGeneric<uint8_t, uint8_t, Requantize32>(strat, args, rq);
},
},
#endif // defined(__aarch64__)
@@ -92,10 +128,10 @@ static const PoolingImplementation<uint8_t, uint8_t, Requantize32> pooling_u8_me
template <>
const PoolingImplementation<uint8_t, uint8_t, Requantize32> *pooling_implementation_list()
{
- return pooling_u8_methods;
+ return pooling_u8q_methods;
}
-template UniquePoolingCommon<uint8_t, uint8_t, Requantize32> pooling(const PoolingArgs &, const Requantize32 &);
+template UniquePoolingCommon<uint8_t, uint8_t> pooling(const PoolingArgs &, const Requantize32 &);
} // namespace pooling
} // namespace arm_conv