author    Michele Di Giorgio <michele.digiorgio@arm.com>    2021-01-22 09:47:04 +0000
committer Michele Di Giorgio <michele.digiorgio@arm.com>    2021-06-18 10:33:48 +0000
commit    d02d5edfa15ba6c04a9986a8a362a945cb38ac31 (patch)
tree      ced4f49691d6c7038e347a8709b315bff59c64cf
parent    b014c27ba6db9840e4a72519760d51a87a2af7e7 (diff)
Integrate improved CPU depthwise convolution kernels
* Replace assembly kernels for depthwise convolution with more optimized ones.
* Add int8 assembly kernels.
* Fix implicit padding on optimized kernels.

Resolves: COMPMID-3867, COMPMID-4361

Change-Id: I0b0867e05f61be4f368f62190d55e14d0ab3ebf2
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5622
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
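For orientation, a minimal sketch of how the improved kernels are reached through the public runtime API. This example is not part of the commit; the shapes, layout and padding are illustrative assumptions, and dispatch to the assembly path only happens when the configuration is supported:

    #include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // Illustrative 3x3, stride-1 depthwise convolution over a 56x56x32 FP32
        // tensor; NHWC is the layout the new depth-first kernels are written for.
        TensorInfo src_info(TensorShape(32U, 56U, 56U), 1, DataType::F32);
        TensorInfo wei_info(TensorShape(32U, 3U, 3U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(32U, 56U, 56U), 1, DataType::F32);
        src_info.set_data_layout(DataLayout::NHWC);
        wei_info.set_data_layout(DataLayout::NHWC);
        dst_info.set_data_layout(DataLayout::NHWC);

        Tensor src, weights, biases, dst;
        src.allocator()->init(src_info);
        weights.allocator()->init(wei_info);
        biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
        dst.allocator()->init(dst_info);

        NEDepthwiseConvolutionLayer dwc;
        dwc.configure(&src, &weights, &biases, &dst,
                      PadStrideInfo(1, 1, 1, 1), /* depth_multiplier */ 1);

        src.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        dst.allocator()->allocate();

        dwc.run(); // dispatches to the optimized assembly kernels when supported
        return 0;
    }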
-rw-r--r-- Android.bp | 115
-rw-r--r-- SConscript | 17
-rw-r--r-- arm_compute/core/utils/quantization/AsymmHelpers.h | 4
-rw-r--r-- arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h | 2
-rw-r--r-- examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp | 2
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp | 347
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp | 388
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp | 480
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp | 127
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp | 125
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp | 527
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp | 251
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp | 412
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp | 224
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp | 276
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp | 143
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp | 105
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp | 266
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp | 228
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp | 157
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp | 128
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp | 250
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp | 250
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp | 119
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp | 136
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp | 136
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 528
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 515
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 829
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 907
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 1233
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 1399
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 616
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 631
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 973
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 1022
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp | 59
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp | 527
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp | 62
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 1049
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 68
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 524
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 511
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 68
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 825
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 903
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 68
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 1229
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 1395
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 68
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 612
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 627
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 68
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 969
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 1018
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp (renamed from src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp) | 38
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp | 379
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp | 66
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp | 532
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp | 66
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp | 916
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp | 58
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 851
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 1318
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 1192
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 1423
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 2213
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp (renamed from src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp) | 37
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp | 624
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp | 66
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp | 527
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp | 66
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp | 662
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp | 58
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 1484
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 1184
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 1318
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 1192
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 1423
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 2213
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp (renamed from src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp) | 39
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp | 624
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp | 66
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp | 527
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp | 66
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp | 662
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp | 58
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 1484
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 1192
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 1423
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 2213
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp | 55
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp | 624
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp | 58
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 1484
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 324
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 284
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 478
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 495
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 688
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 746
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 345
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 345
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 531
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 559
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 255
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp | 364
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 318
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided.hpp | 66
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp | 247
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp | 538
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp | 547
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp | 688
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp | 820
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp | 405
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp | 397
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 72
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp | 531
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp | 633
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp | 59
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp | 166
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp | 70
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp | 259
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp | 70
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp | 392
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp | 62
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp | 454
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 457
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 418
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 459
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 660
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp | 70
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp | 353
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp | 70
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp | 428
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 388
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp | 457
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 418
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 459
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 660
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp | 70
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp | 353
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp | 70
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp | 428
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp | 418
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp | 459
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 75
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp | 660
-rw-r--r-- src/core/NEON/kernels/arm_gemm/utils.hpp | 12
-rw-r--r-- src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h | 88
-rw-r--r-- src/core/NEON/kernels/assembly/common.hpp (renamed from src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp) | 17
-rw-r--r-- src/core/NEON/kernels/assembly/depthwise.hpp | 170
-rw-r--r-- src/core/NEON/kernels/assembly/depthwise_common.hpp | 131
-rw-r--r-- src/core/NEON/kernels/assembly/pool_common.hpp | 9
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/depthwise.hpp | 551
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp | 1168
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp | 2809
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp | 2341
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp | 769
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp | 6018
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp | 42
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp | 156
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp | 144
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp | 102
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp | 291
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp | 88
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/impl_base.hpp | 505
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/impl_dilated.hpp | 295
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/impl_fp16_fp16.hpp | 439
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/impl_fp32_fp32.hpp | 438
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp | 511
-rw-r--r-- src/core/NEON/kernels/convolution/depthwise/impl_qa8_qs8_per_channel.hpp | 457
-rw-r--r-- src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp | 33
-rw-r--r-- src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp | 359
-rw-r--r-- src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h | 120
-rw-r--r-- src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp | 7
-rw-r--r-- src/core/utils/AssemblyUtils.cpp | 70
-rw-r--r-- src/core/utils/AssemblyUtils.h | 52
-rw-r--r-- src/core/utils/quantization/AsymmHelpers.cpp | 7
-rw-r--r-- src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp | 4
-rw-r--r-- src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp | 2
-rw-r--r-- src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp | 7
-rw-r--r-- src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp | 16
-rw-r--r-- src/runtime/cpu/operators/CpuDepthwiseConv2d.h | 5
-rw-r--r-- src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp | 520
-rw-r--r-- src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h | 27
-rw-r--r-- src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp | 37
-rw-r--r-- tests/datasets/DepthwiseConvolutionLayerDataset.h | 8
-rw-r--r-- tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h | 3
226 files changed, 78761 insertions, 17837 deletions
diff --git a/Android.bp b/Android.bp
index 87bdcfccc..19645c0c2 100644
--- a/Android.bp
+++ b/Android.bp
@@ -48,7 +48,6 @@ cc_library_static {
"src/core/helpers",
"src/core/NEON/kernels/assembly",
"src/core/NEON/kernels/convolution/common",
- "src/core/NEON/kernels/convolution/depthwise",
"src/core/NEON/kernels/convolution/winograd",
"src/core/cpu/kernels/assembly"],
export_include_dirs: [".", "./include"],
@@ -206,6 +205,12 @@ cc_library_static {
"src/core/NEON/kernels/NETileKernel.cpp",
"src/core/NEON/kernels/NEWeightsReshapeKernel.cpp",
"src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp",
"src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp",
"src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp",
@@ -236,18 +241,6 @@ cc_library_static {
"src/core/NEON/kernels/convolution/common/qasymm8.cpp",
"src/core/NEON/kernels/convolution/common/qsymm8.cpp",
"src/core/NEON/kernels/convolution/common/utils.cpp",
- "src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_1x1_fp32_fp32.cpp",
- "src/core/NEON/kernels/convolution/depthwise/depthwise_2x2_3x3_2x2_fp32_fp32.cpp",
- "src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_1x1_fp32_fp32.cpp",
- "src/core/NEON/kernels/convolution/depthwise/depthwise_3x3_3x3_2x2_fp32_fp32.cpp",
- "src/core/NEON/kernels/convolution/depthwise/depthwise_4x4_3x3_1x1_fp32_fp32.cpp",
- "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.cpp",
- "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated_qa8_qa8.cpp",
- "src/core/NEON/kernels/convolution/depthwise/depthwise_fp16.cpp",
- "src/core/NEON/kernels/convolution/depthwise/depthwise_fp32.cpp",
- "src/core/NEON/kernels/convolution/depthwise/depthwise_pack_parameters.cpp",
- "src/core/NEON/kernels/convolution/depthwise/depthwise_qa8_qa8.cpp",
- "src/core/NEON/kernels/convolution/depthwise/depthwise_qs8_qs8.cpp",
"src/core/NEON/kernels/convolution/winograd/padding.cpp",
"src/core/NEON/kernels/convolution/winograd/winograd.cpp",
"src/core/NEON/kernels/convolution/winograd/winograd_transforms/input_1x8_fp32_fp32_integers.cpp",
@@ -325,6 +318,7 @@ cc_library_static {
"src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp",
"src/core/cpu/kernels/floor/neon/fp16.cpp",
"src/core/cpu/kernels/floor/neon/fp32.cpp",
+ "src/core/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp",
"src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp",
"src/core/cpu/kernels/pooling/neon/fp16.cpp",
"src/core/cpu/kernels/pooling/neon/fp32.cpp",
@@ -392,6 +386,7 @@ cc_library_static {
"src/core/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultReshapedRhsOnlyValhall.cpp",
"src/core/helpers/SoftmaxHelpers.cpp",
"src/core/helpers/WindowHelpers.cpp",
+ "src/core/utils/AssemblyUtils.cpp",
"src/core/utils/ScaleUtils.cpp",
"src/core/utils/helpers/fft.cpp",
"src/core/utils/helpers/tensor_transform.cpp",
@@ -703,6 +698,100 @@ cc_library_static {
},
arm64: {
srcs: [
+ "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_strided/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_3x3_s1_output2x2_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_avg_generic_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/pooling/kernels/a64_fp16_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
diff --git a/SConscript b/SConscript
index 94ba6d423..3e834e347 100644
--- a/SConscript
+++ b/SConscript
@@ -1,4 +1,4 @@
-# Copyright (c) 2016, 2017 Arm Limited.
+# Copyright (c) 2016-2021 Arm Limited.
#
# SPDX-License-Identifier: MIT
#
@@ -252,12 +252,8 @@ core_files_sve = []
if env['neon']:
core_files += Glob('src/core/NEON/*.cpp')
core_files += Glob('src/core/NEON/kernels/*.cpp')
- core_files += Glob('src/core/NEON/kernels/assembly/*.cpp')
core_files += Glob('src/core/NEON/kernels/arm_gemm/*.cpp')
- core_files += Glob('src/core/NEON/kernels/arm_conv/*.cpp')
- core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/*.cpp')
- core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_*/*.cpp')
# build winograd/depthwise sources for either v7a / v8a
core_files += Glob('src/core/NEON/kernels/convolution/*/*.cpp')
@@ -275,11 +271,22 @@ if env['neon']:
core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a32_*/*.cpp')
if env['estate'] == '64':
+ core_files += Glob('src/core/NEON/kernels/assembly/*.cpp')
+ core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/*.cpp')
+ core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/kernels/cpp_*/*.cpp')
+ core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp')
+ core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/*.cpp')
+ core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_*/*.cpp')
+
core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a64_*/*.cpp')
+ core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_*.cpp')
+ core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_*/*.cpp')
core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/a64_*/*.cpp')
if "sve" in env['arch'] or env['fat_binary']:
core_files_sve += filelist['cpu']['core']['sve']['all']
core_files_sve += Glob('src/core/NEON/kernels/arm_gemm/kernels/sve_*/*.cpp')
+ core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_*.cpp')
+ core_files += Glob('src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_*/*.cpp')
core_files_sve += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/sve_*/*.cpp')
if any(i in env['data_layout_support'] for i in ['all', 'nchw']):
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index cbf7559bc..c9d0930c3 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -89,7 +89,6 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
* @param[in] input Input tensor info.
* @param[in] weights Weights tensor info.
* @param[in] output Output tensor info.
- * @param[in] idx_ofms Dimension index to get OFMs from the weights tensor.
* @param[out] output_multipliers_ptr Pointer to the buffer where to store per-channel multipliers.
* @param[out] output_shifts_ptr Pointer to the buffer where to store per-channel shifts.
*
@@ -98,7 +97,6 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
const ITensorInfo *weights,
const ITensorInfo *output,
- unsigned int idx_ofms,
int32_t *output_multipliers_ptr,
int32_t *output_shifts_ptr);
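
With `idx_ofms` removed, the helper now derives the number of output feature maps from the tensor infos themselves. A minimal sketch of a call against the updated signature (the wrapper function, `num_filters` value and buffer sizing are illustrative assumptions, not code from this commit):

    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

    #include <vector>

    using namespace arm_compute;

    // Hypothetical helper: per-channel quantized weights with 16 output channels.
    void fill_output_stage(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst)
    {
        const unsigned int num_filters = 16; // must match the weights' OFM dimension
        std::vector<int32_t> multipliers(num_filters);
        std::vector<int32_t> shifts(num_filters);

        // One multiplier/shift pair per output channel; no index argument any more.
        quantization::compute_quantized_multipliers_and_shifts(src, weights, dst,
                                                               multipliers.data(),
                                                               shifts.data());
    }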
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 2f541758f..6f2ec8cdd 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -112,7 +112,7 @@ private:
*
* -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
* -# @ref NEDepthwiseConvolutionLayer3x3Kernel if 3x3 and no assembly kernel implementation is present
- * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
+ * -# @ref cpu::CpuDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
* -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required
* -# @ref NEActivationLayer if fused activation is required
*
diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
index d8f8f1498..ca7b7a5f0 100644
--- a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
+++ b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp
@@ -236,7 +236,6 @@ public:
gemmlowp_output_stage.output_data_type = dst.info()->data_type();
gemmlowp_output_stage.gemmlowp_offset = 0;
{
- const int idx_kernels = get_data_layout_dimension_index(lhs.info()->data_layout(), DataLayoutDimension::BATCHES);
gemmlowp_output_stage.is_quantized_per_channel = false;
// Num_filters is 1 unless quantized type is of per_channel type. Could be extended in the future to support per-channel quantization.
const unsigned int num_filters = 1;
@@ -249,7 +248,6 @@ public:
quantization::compute_quantized_multipliers_and_shifts(lhs.info(),
rhs.info(),
dst.info(),
- idx_kernels,
gemmlowp_output_stage.gemmlowp_multipliers.data(),
gemmlowp_output_stage.gemmlowp_shifts.data());
gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
new file mode 100644
index 000000000..fe635d6fa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class strategy>
+class DepthwiseDepthfirst : public DepthwiseCommon<typename strategy::input_type,
+ typename strategy::weight_type,
+ typename strategy::return_type>
+{
+ using TInput = typename strategy::input_type;
+ using TWeight = typename strategy::weight_type;
+ using TOutput = typename strategy::return_type;
+ using TAccum = typename strategy::bias_type;
+
+ size_t sizeof_input_buffer(unsigned int n_input_channels) const
+ {
+ return sizeof(TInput) * n_input_channels;
+ }
+
+ size_t sizeof_output_buffer(unsigned int n_output_channels) const
+ {
+ return sizeof(TOutput) * n_output_channels;
+ }
+
+ public:
+
+ DepthwiseDepthfirst(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
+ {
+ }
+
+ DepthwiseDepthfirst(DepthwiseDepthfirst &) = delete;
+ DepthwiseDepthfirst &operator=(DepthwiseDepthfirst &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ // TODO What if we insert extra padding? Biases are a different size to the inputs, ...
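+    // Note: the bias block is counted below as the leading "1 +" term and is
+    // sized in TWeight units, which is exact only when sizeof(TAccum) equals
+    // sizeof(TWeight) (hence the TODO above).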
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
+ const auto rounded_channels = arm_gemm::roundup(this->m_args.input_channels, vl);
+ return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight);
+ }
+
+ void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ // TODO What if the kernel needs a different packing function?
+
+ // Cast the pointers
+ uint8_t *buffer = static_cast<uint8_t *>(_buffer);
+ const TAccum *biases = static_cast<const TAccum *>(_biases);
+ const TWeight *const weights = static_cast<const TWeight *>(_weights);
+
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(strategy::vl_type);
+ ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+
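+    // For each block of up to "vl" channels, the packed buffer holds the biases
+    // (zero-filled when no bias is supplied) followed by one vl-wide block per
+    // kernel point, with the channel values contiguous within each block.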
+ for (unsigned int n = 0; n < this->m_args.input_channels; n += vl)
+ {
+ const unsigned int todo = std::min(vl, this->m_args.input_channels - n);
+
+ // Copy across the correct amount of bias (or 0)
+ for (unsigned int i = 0; i < todo; i++)
+ {
+ reinterpret_cast<TAccum *>(buffer)[i] = (biases == nullptr) ? 0 : biases[n + i];
+ }
+ buffer += vl * sizeof(TAccum);
+
+ // Copy each of the weights in turn
+ auto weights_row = weights + n;
+ for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
+ {
+ auto weights_col = weights_row;
+
+ for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
+ {
+ for (unsigned int m = 0; m < todo; m++)
+ {
+ reinterpret_cast<TWeight *>(buffer)[m] = weights_col[m];
+ }
+ buffer += vl * sizeof(TWeight);
+
+ weights_col += ld_weight_col;
+ }
+
+ weights_row += ld_weight_row;
+ }
+ }
+ }
+
+ size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+ {
+ const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
+ return n_threads * (sizeof_output_buffer(n_output_channels) + sizeof_input_buffer(n_channels));
+ }
+
+ using DepthwiseCommon<typename strategy::input_type, typename strategy::weight_type, typename strategy::return_type>::execute;
+ void execute(
+ const unsigned int batches,
+ const unsigned int input_height,
+ const unsigned int input_width,
+ const unsigned int input_channels,
+ const PaddingValues &padding,
+ const void *const _input,
+ const size_t ld_input_col,
+ const size_t ld_input_row,
+ const size_t ld_input_batch,
+ const void *const parameters,
+ const unsigned int output_height,
+ const unsigned int output_width,
+ void *const _output,
+ const size_t ld_output_col,
+ const size_t ld_output_row,
+ const size_t ld_output_batch,
+ void *const _working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads
+ ) const override
+ {
+ strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+ arm_gemm::profiler prof;
+#endif
+
+ // Compute activation values
+ TAccum activation_min, activation_max;
+ if (std::numeric_limits<TAccum>::is_integer)
+ {
+ activation_min = std::numeric_limits<TAccum>::min();
+ activation_max = std::numeric_limits<TAccum>::max();
+ }
+ else
+ {
+ activation_min = static_cast<TAccum>(-std::numeric_limits<float>::infinity());
+ activation_max = static_cast<TAccum>(std::numeric_limits<float>::infinity());
+ }
+
+ switch (this->m_args.activation.type)
+ {
+ case arm_gemm::Activation::Type::BoundedReLU:
+ activation_max = static_cast<TAccum>(this->m_args.activation.param1);
+ // Fall through
+ case arm_gemm::Activation::Type::ReLU:
+ activation_min = static_cast<TAccum>(0);
+ break;
+ default:
+ break;
+ }
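+    // e.g. BoundedReLU with param1 = 6 clamps the accumulators to [0, 6] (ReLU6).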
+
+ // Determine what portion of the work to do.
+ const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
+ const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
+ const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
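+    // Example: output_height = 10 and n_threads = 4 give n_rows_per_thread = 3;
+    // thread 0 covers rows [0, 3) and thread 3 the remainder [9, 10).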
+
+ // Cast input and output pointers into the right types
+ const TInput *const inptr = static_cast<const TInput *>(_input);
+ TOutput *const outptr = static_cast<TOutput *>(_output);
+
+ // Create an array for the input pointers
+ const TInput * _inptr_array[strategy::input_rows * strategy::input_cols];
+ const TInput **const inptr_array = _inptr_array;
+
+ // Create an array for the output pointers
+ TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
+ TOutput **const outptr_array = _outptr_array;
+
+ // Allocate portions of the working space
+ uint8_t *const working_space = static_cast<uint8_t *>(_working_space) + get_working_size(thread_id, input_channels);
+ TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
+ TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + sizeof_output_buffer(input_channels * this->m_args.channel_multiplier));
+
+ // Initialise the input buffer
+ for (unsigned int c = 0; c < input_channels; c++)
+ {
+ input_buffer[c] = static_cast<TInput>(0);
+ }
+
+ // For each output tile, construct the requisite set of pointers and call
+ // into the kernel.
+ for (unsigned int batch = 0; batch < batches; batch++)
+ {
+ // Get batch pointers
+ const auto inptr_batch = inptr + batch * ld_input_batch;
+ const auto outptr_batch = outptr + batch * ld_output_batch;
+
+ for (int start_out_i = start_out_height;
+ start_out_i < end_out_height;
+ start_out_i += static_cast<int>(strategy::output_rows))
+ {
+ const int end_out_i = start_out_i + strategy::output_rows;
+ const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
+ const int end_in_i = start_in_i + strategy::input_rows;
+
+ // Compute top/bottom padding
+ const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
+ const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
+ const unsigned int valid_output_rows = std::min(
+ end_out_i - start_out_i,
+ static_cast<int>(output_height) - start_out_i
+ );
+
+ // Fill the input pointer array with padding values
+ for (auto index = 0u; index < strategy::input_rows * strategy::input_cols; index++)
+ {
+ inptr_array[index] = input_buffer;
+ }
+
+ for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
+ {
+ const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left;
+ const int pad_left = -std::min(0, start_in_j);
+
+ // Compute how many output tiles we can compute with the direct kernel.
+ int n_direct_tiles = 0;
+ if (!pad_top && !pad_bottom && !pad_left)
+ {
+ // Determine the maximum number of tiles we could handle.
+ n_direct_tiles = (output_width - start_out_j) / strategy::output_cols;
+
+ // Continue to reduce this number as required to avoid reading
+ // padding on the right edge.
+ int end_in_j = start_in_j + n_direct_tiles * strategy::input_cols;
+ int pad_right = std::max(0, end_in_j - static_cast<int>(input_width));
+
+ while (pad_right && n_direct_tiles)
+ {
+ n_direct_tiles--;
+ end_in_j -= strategy::input_cols;
+ pad_right = std::max(0, end_in_j - static_cast<int>(input_width));
+ }
+ }
+
+ // Use the unpadded kernel if we can, otherwise use the padded one.
+ if (n_direct_tiles)
+ {
+ auto inptr = inptr_batch + start_in_i*ld_input_row + start_in_j*ld_input_col;
+ auto outptr = outptr_batch + start_out_i*ld_output_row + start_out_j*ld_output_col;
+ start_out_j += n_direct_tiles*strategy::output_cols;
+
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, 0);
+#endif
+ strat.direct_kernel(1, n_direct_tiles,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ parameters, this->m_args.input_channels,
+ activation_min, activation_max);
+ continue;
+ }
+
+ const int end_out_j = start_out_j + strategy::output_cols;
+ const int end_in_j = start_in_j + strategy::input_cols;
+
+ const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
+ const unsigned int valid_output_cols = std::min(
+ end_out_j - start_out_j,
+ static_cast<int>(output_width) - start_out_j
+ );
+
+ // Construct the input pointer array - fill the array with pointers to
+ // the input buffer and then fill in the required values.
+ for (auto i = pad_top; i < strategy::input_rows - pad_bottom; i++)
+ {
+ // We can skip over the left padding here: this tile's left padding
+ // is never greater than that of the previous tile.
+ unsigned int j = pad_left;
+ const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
+ const TInput **ptrs = inptr_array + i * strategy::input_cols + j;
+ for (; j < strategy::input_cols - pad_right; j++)
+ {
+ *(ptrs++) = colptr;
+ colptr += ld_input_col;
+ }
+ for (; j < strategy::input_cols; j++)
+ {
+ *(ptrs++) = input_buffer;
+ }
+ }
+
+ // Construct the output pointer array.
+ TOutput **outptr_pos = outptr_array;
+ for (auto i = 0u; i < valid_output_rows; i++)
+ {
+ unsigned int j = 0u;
+ TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
+ for (; j < valid_output_cols; j++)
+ {
+ *(outptr_pos++) = colptr;
+ colptr += ld_output_col;
+ }
+ for (; j < strategy::output_cols; j++)
+ {
+ *(outptr_pos++) = output_buffer;
+ }
+ }
+ for (auto i = valid_output_rows; i < strategy::output_rows; i++)
+ {
+ for (auto j = 0u; j < strategy::output_cols; j++)
+ {
+ *(outptr_pos++) = output_buffer;
+ }
+ }
+
+ start_out_j += strategy::output_cols;
+
+#ifdef CYCLE_PROFILING
+ // TODO: Supply a real work estimate for profiling instead of 0.
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(0));
+#endif
+ strat.indirect_kernel(inptr_array, outptr_array, parameters,
+ this->m_args.input_channels, activation_min, activation_max);
+ }
+ }
+ }
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
new file mode 100644
index 000000000..29f37c569
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class Strategy, unsigned int OutputRows, unsigned int OutputCols>
+class DepthwiseDepthfirstGenericBase :
+ public DepthwiseCommon<typename Strategy::input_type,
+ typename Strategy::weight_type,
+ typename Strategy::return_type>
+{
+ protected:
+
+ using TInput = typename Strategy::input_type;
+ using TWeight = typename Strategy::weight_type;
+ using TOutput = typename Strategy::return_type;
+ using TAccum = typename Strategy::bias_type;
+
+ size_t sizeof_input_ptr_array(void) const
+ {
+ return sizeof(TInput *) * this->m_args.kernel_rows * this->m_args.kernel_cols * Strategy::n_output_points;
+ }
+
+ size_t sizeof_input_buffer(unsigned int n_channels) const
+ {
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(Strategy::vl_type);
+ const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+ return sizeof(TInput) * rounded_channels;
+ }
+
+ size_t sizeof_output_buffer(unsigned int n_channels) const
+ {
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(Strategy::vl_type);
+ const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+ return sizeof(TOutput) * rounded_channels;
+ }
+
+ unsigned int input_rows(void) const
+ {
+ return this->m_args.kernel_rows + (OutputRows - 1)*this->m_args.stride_rows;
+ }
+
+ unsigned int input_cols(void) const
+ {
+ return this->m_args.kernel_cols + (OutputCols - 1)*this->m_args.stride_cols;
+ }
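+
+ // (These give the receptive field of one output tile: e.g. a 3x3
+ //  kernel at stride 1 with OutputRows == 2 needs 3 + (2 - 1)*1 = 4
+ //  input rows.)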
+
+ void execute_tiles(
+ std::function<void(const TInput *const *, TOutput *const *)> tile_fn,
+ std::function<void(TInput *, unsigned int)> initialise_input_buffer,
+ const unsigned int batches,
+ const unsigned int input_height,
+ const unsigned int input_width,
+ const unsigned int input_channels,
+ const PaddingValues &padding,
+ const void *const _input,
+ const size_t ld_input_col,
+ const size_t ld_input_row,
+ const size_t ld_input_batch,
+ const unsigned int output_height,
+ const unsigned int output_width,
+ void *const _output,
+ const size_t ld_output_col,
+ const size_t ld_output_row,
+ const size_t ld_output_batch,
+ void *const _working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads
+ ) const
+ {
+ static_assert(OutputRows * OutputCols <= Strategy::n_output_points,
+ "Too many output points for kernel.");
+
+ // Determine what portion of the work to do.
+ const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
+ const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
+ const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
+
+ // Cast input and output pointers into the right types
+ const TInput *const inptr = static_cast<const TInput *>(_input);
+ TOutput *const outptr = static_cast<TOutput *>(_output);
+
+ // Allocate portions of the working space
+ uint8_t *const working_space = static_cast<uint8_t *>(_working_space) + this->get_working_size(thread_id, input_channels);
+ const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space);
+ TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space + this->sizeof_input_ptr_array());
+ TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + this->sizeof_input_ptr_array() + this->sizeof_output_buffer(input_channels * this->m_args.channel_multiplier));
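+
+ // The per-thread working space is therefore laid out as:
+ //   [ input pointer array | output buffer | input (padding) buffer ]
+ // matching the three sizes summed in get_working_size() below.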
+
+ // Create an array for the output pointers
+ TOutput * _outptr_array[Strategy::n_output_points];
+ TOutput **const outptr_array = _outptr_array;
+
+ // Initialise the input buffer
+ initialise_input_buffer(input_buffer, input_channels);
+
+ // For each output tile, construct the requisite set of pointers and call
+ // into the kernel.
+ for (unsigned int batch = 0; batch < batches; batch++)
+ {
+ // Get batch pointers
+ const auto inptr_batch = inptr + batch * ld_input_batch;
+ const auto outptr_batch = outptr + batch * ld_output_batch;
+
+ for (int start_out_i = start_out_height;
+ start_out_i < end_out_height;
+ start_out_i += static_cast<int>(OutputRows))
+ {
+ const int end_out_i = std::min(start_out_i + OutputRows,
+ output_height);
+
+ for (int start_out_j = 0;
+ start_out_j < static_cast<int>(output_width);
+ start_out_j += static_cast<int>(OutputCols))
+ {
+ const int end_out_j = std::min(start_out_j + OutputCols,
+ output_width);
+
+ // Fill the pointer arrays with pointers to the input/output buffers.
+ for (auto index = 0u;
+ index < (Strategy::n_output_points * this->m_args.kernel_rows * this->m_args.kernel_cols);
+ index++)
+ {
+ inptr_array[index] = input_buffer;
+ }
+ for (auto index = 0u; index < Strategy::n_output_points; index++)
+ {
+ outptr_array[index] = output_buffer;
+ }
+
+ // Construct the pointer arrays together. Note that the input pointer
+ // array is striped. Since the array has already been filled with
+ // pointers to the padding input buffer, we merely fill in the valid
+ // points as we get to them.
+ unsigned int output_index = 0;
+ auto outptr_row = outptr_batch + start_out_i * ld_output_row + start_out_j * ld_output_col;
+ for (auto out_i = start_out_i; out_i < end_out_i; out_i++)
+ {
+ auto outptr_col = outptr_row;
+
+ // Compute the padding for this row of tiles.
+ const int start_in_i = out_i * this->m_args.stride_rows - padding.top;
+ const int end_in_i = start_in_i + this->m_args.kernel_rows;
+ const auto pad_top = static_cast<unsigned int>(std::max<int>(0, 0 - start_in_i));
+ const auto pad_bottom = static_cast<unsigned int>(std::max<int>(0, end_in_i - input_height));
+ const unsigned int valid_rows = this->m_args.kernel_rows - pad_top - pad_bottom;
+
+ for (auto out_j = start_out_j; out_j < end_out_j; out_j++, output_index++)
+ {
+ // Compute the output pointer.
+ outptr_array[output_index] = outptr_col;
+ outptr_col += ld_output_col;
+
+ // Compute the padding for this tile.
+ const int start_in_j = out_j * this->m_args.stride_cols - padding.left;
+ const int end_in_j = start_in_j + this->m_args.kernel_cols;
+ const auto pad_left = static_cast<unsigned int>(std::max<int>(0, 0 - start_in_j));
+ const auto pad_right = static_cast<unsigned int>(std::max<int>(0, end_in_j - input_width));
+ const unsigned int valid_cols = this->m_args.kernel_cols - pad_left - pad_right;
+
+ // Hence compute the input pointers.
+ auto input_index = output_index + Strategy::n_output_points * (pad_top * this->m_args.kernel_cols + pad_left);
+ auto inptr_row = inptr_batch + (start_in_i + pad_top) * ld_input_row + (start_in_j + pad_left) * ld_input_col;
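+ // (Striping: array entry (k * n_output_points + output_index) holds
+ //  the input pointer for kernel point k at this output point, which
+ //  is why input_index_col below advances by n_output_points.)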
+ for (auto in_i = 0u; in_i < valid_rows; in_i++)
+ {
+ auto inptr_col = inptr_row;
+ auto input_index_col = input_index;
+
+ for (auto in_j = 0u; in_j < valid_cols; in_j++)
+ {
+ inptr_array[input_index_col] = inptr_col;
+ inptr_col += ld_input_col;
+ input_index_col += Strategy::n_output_points;
+ }
+
+ inptr_row += ld_input_row;
+ input_index += Strategy::n_output_points * this->m_args.kernel_cols;
+ }
+ }
+
+ outptr_row += ld_output_row;
+ }
+
+ tile_fn(inptr_array, outptr_array);
+ }
+ }
+ }
+ }
+
+ public:
+ DepthwiseDepthfirstGenericBase(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
+ {
+ }
+
+ DepthwiseDepthfirstGenericBase(DepthwiseDepthfirstGenericBase &) = delete;
+ DepthwiseDepthfirstGenericBase &operator=(DepthwiseDepthfirstGenericBase &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(Strategy::vl_type);
+ const auto rounded_channels = arm_gemm::roundup(this->m_args.input_channels, vl);
+ return (this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight);
+ }
+
+ void pack_parameters(void *_buffer, const void *, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ // Cast the pointers
+ TWeight *buffer = static_cast<TWeight *>(_buffer);
+ const TWeight *const weights = static_cast<const TWeight *>(_weights);
+
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(Strategy::vl_type);
+ ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+
+ for (unsigned int n = 0; n < this->m_args.input_channels; n += vl)
+ {
+ const unsigned int todo = std::min(vl, this->m_args.input_channels - n);
+
+ // Copy each of the weights in turn
+ auto weights_row = weights + n;
+ for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
+ {
+ auto weights_col = weights_row;
+
+ for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
+ {
+ for (unsigned int m = 0; m < todo; m++)
+ {
+ buffer[m] = weights_col[m];
+ }
+ buffer += vl;
+
+ weights_col += ld_weight_col;
+ }
+
+ weights_row += ld_weight_row;
+ }
+ }
+ }
+
+ size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+ {
+ const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
+ return n_threads * (sizeof_input_ptr_array() +
+ sizeof_output_buffer(n_output_channels) +
+ sizeof_input_buffer(n_channels));
+ }
+};
+
+template <class Strategy, unsigned int OutputRows, unsigned int OutputCols>
+class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>
+{
+ using Parent = DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>;
+ using TInput = typename Parent::TInput;
+ using TWeight = typename Parent::TWeight;
+ using TAccum = typename Parent::TAccum;
+ using TOutput = typename Parent::TOutput;
+
+ const TAccum *m_bias = nullptr;
+
+ public:
+ DepthwiseDepthfirstGeneric(const DepthwiseArgs &args) : Parent(args)
+ {
+ }
+
+ DepthwiseDepthfirstGeneric(DepthwiseDepthfirstGeneric &) = delete;
+ DepthwiseDepthfirstGeneric &operator=(DepthwiseDepthfirstGeneric &) = delete;
+
+ void pack_parameters(void *buffer, const void *bias, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ m_bias = static_cast<const TAccum *>(bias);
+ Parent::pack_parameters(buffer, bias, weights, ld_weight_col, ld_weight_row);
+ }
+
+ using Parent::execute;
+ void execute(
+ const unsigned int batches,
+ const unsigned int input_height,
+ const unsigned int input_width,
+ const unsigned int input_channels,
+ const PaddingValues &padding,
+ const void *const _input,
+ const size_t ld_input_col,
+ const size_t ld_input_row,
+ const size_t ld_input_batch,
+ const void *const parameters,
+ const unsigned int output_height,
+ const unsigned int output_width,
+ void *const _output,
+ const size_t ld_output_col,
+ const size_t ld_output_row,
+ const size_t ld_output_batch,
+ void *const _working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads
+ ) const override
+ {
+ Strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+ arm_gemm::profiler prof;
+#endif
+
+ // Compute activation values
+ TAccum activation_min, activation_max;
+ if (std::numeric_limits<TAccum>::is_integer)
+ {
+ activation_min = std::numeric_limits<TAccum>::min();
+ activation_max = std::numeric_limits<TAccum>::max();
+ }
+ else
+ {
+ activation_min = static_cast<TAccum>(-std::numeric_limits<float>::infinity());
+ activation_max = static_cast<TAccum>(std::numeric_limits<float>::infinity());
+ }
+
+ switch (this->m_args.activation.type)
+ {
+ case arm_gemm::Activation::Type::BoundedReLU:
+ activation_max = static_cast<TAccum>(this->m_args.activation.param1);
+ // Fall through
+ case arm_gemm::Activation::Type::ReLU:
+ activation_min = static_cast<TAccum>(0);
+ break;
+ default:
+ break;
+ }
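+
+ // (e.g. BoundedReLU with param1 == 6 yields the clamp [0, 6]; plain
+ //  ReLU only raises the lower bound to 0.)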
+
+ // Create a function to initialise the input buffer
+ const auto initialise_input_buffer = [] (TInput *const buffer, const unsigned int n) {
+ std::memset(buffer, 0, n * sizeof(TInput));
+ };
+
+ // Create a function to execute a tile of work
+ const auto tile_fn = [&] (const TInput *const *const inptrs, TOutput *const * const outptrs) {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(
+ PROFILE_KERNEL,
+ (unsigned long) (OutputRows * OutputCols * this->m_args.kernel_rows * this->m_args.kernel_cols)
+ );
+#endif
+ strat.kernel(inptrs, outptrs, parameters, m_bias,
+ this->m_args.kernel_rows * this->m_args.kernel_cols,
+ this->m_args.input_channels, activation_min, activation_max);
+ };
+
+ // Call into a parent utility function to do the actual work.
+ Parent::execute_tiles(
+ tile_fn, initialise_input_buffer,
+ batches, input_height, input_width, input_channels, padding,
+ _input, ld_input_col, ld_input_row, ld_input_batch,
+ output_height, output_width,
+ _output, ld_output_col, ld_output_row, ld_output_batch,
+ _working_space, thread_id, n_threads
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp
new file mode 100644
index 000000000..656e4413b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp
@@ -0,0 +1,480 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class strategy>
+class DepthwiseDepthfirstGenericWithMultiplierBase :
+ public DepthwiseCommon<typename strategy::input_type,
+ typename strategy::weight_type,
+ typename strategy::return_type>
+{
+ protected:
+
+ using TInput = typename strategy::input_type;
+ using TWeight = typename strategy::weight_type;
+ using TOutput = typename strategy::return_type;
+ using TAccum = typename strategy::bias_type;
+
+ unsigned int kernel_points(void) const
+ {
+ return this->m_args.kernel_rows * this->m_args.kernel_cols;
+ }
+
+ unsigned int input_rows(void) const
+ {
+ return (strategy::output_rows() - 1) * this->m_args.stride_rows + this->m_args.kernel_rows;
+ }
+
+ unsigned int input_cols(void) const
+ {
+ return (strategy::output_cols() - 1) * this->m_args.stride_cols + this->m_args.kernel_cols;
+ }
+
+ size_t sizeof_inptr_array(void) const
+ {
+ return sizeof(TInput *) * kernel_points() * strategy::output_rows();
+ }
+
+ size_t sizeof_input_samples(void) const
+ {
+ // We have a sample for each kernel point, for each point of the output array.
+ return sizeof(TInput) * kernel_points() *
+ strategy::output_rows() *
+ strategy::output_col_regs() *
+ (16 / sizeof(TAccum));
+ }
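+
+ // (A "quad" is presumably one 16-byte register's worth of values;
+ //  output_col_regs() counts the registers needed to cover one row of
+ //  output points.)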
+
+ size_t sizeof_outptr_array(void) const
+ {
+ return sizeof(TOutput *) * strategy::output_rows() * strategy::output_cols();
+ }
+
+ size_t sizeof_output_buffer(unsigned int n_channels) const
+ {
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
+ const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+ return sizeof(TOutput) * rounded_channels;
+ }
+
+ void pack_weights(TWeight *buffer, const TWeight *weights, size_t ld_weight_col, size_t ld_weight_row) const
+ {
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(strategy::vl_type);
+ ld_weight_col = (ld_weight_col == 0) ? this->m_args.channel_multiplier * this->m_args.input_channels : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+
+ for (unsigned int in_c = 0; in_c < this->m_args.input_channels; in_c++)
+ {
+ for (unsigned int n = 0; n < this->m_args.channel_multiplier; n += vl)
+ {
+ const unsigned int out_c = in_c * this->m_args.channel_multiplier + n;
+ const unsigned int todo = std::min(vl, this->m_args.channel_multiplier - n);
+
+ // Copy each of the weights in turn
+ auto weights_row = weights + out_c;
+ for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
+ {
+ auto weights_col = weights_row;
+
+ for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
+ {
+ for (unsigned int m = 0; m < todo; m++)
+ {
+ buffer[m] = weights_col[m];
+ }
+ buffer += vl;
+
+ weights_col += ld_weight_col;
+ }
+
+ weights_row += ld_weight_row;
+ }
+ }
+ }
+ }
+
+ void execute_tiles(
+ std::function<void(const TInput **, TOutput **, const TWeight *, unsigned int, unsigned int)> tile_fn,
+ const TInput pad_value,
+ const unsigned int batches,
+ const unsigned int input_height,
+ const unsigned int input_width,
+ const unsigned int input_channels,
+ const PaddingValues &padding,
+ const void *const _input,
+ const size_t ld_input_col,
+ const size_t ld_input_row,
+ const size_t ld_input_batch,
+ const void *const parameters,
+ const unsigned int output_height,
+ const unsigned int output_width,
+ void *const _output,
+ const size_t ld_output_col,
+ const size_t ld_output_row,
+ const size_t ld_output_batch,
+ void *const _working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads
+ ) const
+ {
+#ifdef CYCLE_PROFILING
+ arm_gemm::profiler prof;
+#endif
+
+ // Determine what portion of the work to do.
+ const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
+ const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
+ const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
+
+ // Need a stride over blocks of parameters
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(strategy::vl_type);
+ const unsigned int param_stride = arm_gemm::roundup(this->m_args.channel_multiplier, vl) * kernel_points();
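+ // (One block of packed weights per input channel: a vector-length
+ //  padded run of channel_multiplier values for each kernel point.)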
+
+ // Cast input and output pointers into the right types
+ const TInput *const inptr = static_cast<const TInput *>(_input);
+ TOutput *const outptr = static_cast<TOutput *>(_output);
+
+ // Allocate portions of the working space
+ uint8_t *working_space = static_cast<uint8_t *>(_working_space) +
+ get_working_size(thread_id, input_channels);
+
+ const TInput **inptrs = reinterpret_cast<const TInput **>(working_space);
+ working_space += sizeof_inptr_array();
+
+ // To simplify the kernel, we process padded or non-NCHW-ordered input into
+ // a form which can be consumed by the kernel. This data is stored here and
+ // passed into the kernel as an array of pointers (one per kernel point,
+ // per row of the output tile).
+ TInput *rearranged_input = reinterpret_cast<TInput *>(working_space);
+ working_space += sizeof_input_samples();
+
+ TOutput **outptr_array = reinterpret_cast<TOutput **>(working_space);
+ working_space += sizeof_outptr_array();
+
+ TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
+
+ // TODO Dynamically change the input pointer array in cases where we could
+ // read directly from the input tensor; for now, though, assume we will
+ // always read from the sample array.
+ {
+ auto my_inptrs = inptrs;
+ auto my_input_samples = rearranged_input;
+
+ // For each kernel point; for each row of output; for each register of
+ // values containing a QUAD of source values.
+ const unsigned int quad_length = 16 / sizeof(TAccum);
+
+ for (auto p = 0u; p < kernel_points() * strategy::output_rows(); p++)
+ {
+ *(my_inptrs)++ = my_input_samples;
+ my_input_samples += arm_gemm::roundup(strategy::output_cols(), quad_length);
+ }
+ }
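+
+ // (e.g. for a 3x3 kernel and a tile of 2 output rows this yields
+ //  9 * 2 = 18 row pointers into the sample array.)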
+
+ // For each output tile, construct the requisite set of pointers and call
+ // into the kernel.
+ for (unsigned int batch = 0; batch < batches; batch++)
+ {
+ // Get batch pointers
+ const auto inptr_batch = inptr + batch * ld_input_batch;
+ const auto outptr_batch = outptr + batch * ld_output_batch;
+
+ for (int start_out_i = start_out_height;
+ start_out_i < end_out_height;
+ start_out_i += static_cast<int>(strategy::output_rows()))
+ {
+ const int end_out_i = std::min(start_out_i + static_cast<int>(strategy::output_rows()), end_out_height);
+ const int start_in_i = start_out_i * this->m_args.stride_rows - padding.top;
+ const int end_in_i = start_in_i + input_rows();
+
+ // Compute top/bottom padding
+ const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
+ const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
+ const unsigned int valid_output_rows = std::min(
+ end_out_i - start_out_i,
+ static_cast<int>(output_height) - start_out_i
+ );
+
+ const int pad_rows = pad_top + pad_bottom;
+
+ for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
+ {
+ const int start_in_j = start_out_j * this->m_args.stride_cols - this->m_args.padding.left;
+ const int pad_left = -std::min(0, start_in_j);
+
+ const int end_out_j = start_out_j + strategy::output_cols();
+ const int end_in_j = start_in_j + input_cols();
+
+ const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
+ const unsigned int valid_output_cols = std::min(
+ end_out_j - start_out_j,
+ static_cast<int>(output_width) - start_out_j
+ );
+
+ const int pad_cols = pad_left + pad_right;
+
+ // Construct the output pointer array.
+ TOutput **outptr_pos = outptr_array;
+ for (auto i = 0u; i < valid_output_rows; i++)
+ {
+ unsigned int j = 0u;
+ TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
+ for (; j < valid_output_cols; j++)
+ {
+ *(outptr_pos++) = colptr;
+ colptr += ld_output_col;
+ }
+ for (; j < strategy::output_cols(); j++)
+ {
+ *(outptr_pos++) = output_buffer;
+ }
+ }
+ for (auto i = valid_output_rows; i < strategy::output_rows(); i++)
+ {
+ for (auto j = 0u; j < strategy::output_cols(); j++)
+ {
+ *(outptr_pos++) = output_buffer;
+ }
+ }
+
+ start_out_j += strategy::output_cols();
+
+ const TWeight *params = static_cast<const TWeight *>(parameters);
+
+ // Fill the input samples with padding. We can do this outside of
+ // the channel loop, as the position of padding isn't going to
+ // change as a function of channel.
+ for (auto i = 0u; i < kernel_points() * strategy::output_rows() * strategy::output_cols(); i++)
+ {
+ rearranged_input[i] = pad_value;
+ }
+
+ // Loop over the input channels
+ for (unsigned int in_c = 0; in_c < input_channels; in_c++)
+ {
+ auto inptr_row = inptr_batch + in_c +
+ (start_in_i + pad_top) * ld_input_row +
+ (start_in_j + pad_left) * ld_input_col;
+
+ // Construct the array of input samples; for each point of the
+ // kernel we provide an input value for each output point.
+ auto input_samples = rearranged_input;
+ for (auto ki = 0u; ki < this->m_args.kernel_rows; ki++)
+ {
+ for (auto kj = 0u; kj < this->m_args.kernel_cols; kj++)
+ {
+ // Copy the pointer for the input samples associated with this
+ // kernel point. Then update the main pointer to account for
+ // this point.
+ auto point_input_samples = input_samples;
+ input_samples += strategy::output_rows() * strategy::output_cols();
+
+ int ii = static_cast<int>(ki) - static_cast<int>(pad_top);
+ for (auto oi = 0u;
+ oi < strategy::output_rows() &&
+ ii < static_cast<int>(input_rows()) - pad_rows;
+ oi++, ii += this->m_args.stride_rows)
+ {
+ if (0 <= ii) // Fill in values only if this row is in range.
+ {
+ int ij = static_cast<int>(kj) - static_cast<int>(pad_left);
+ for (auto oj = 0u;
+ oj < strategy::output_cols() &&
+ ij < static_cast<int>(input_cols()) - pad_cols;
+ oj++, ij += this->m_args.stride_cols)
+ {
+ if (0 <= ij) // Sample if the point is in range.
+ {
+ point_input_samples[oj] = *(inptr_row + ii*ld_input_row + ij*ld_input_col);
+ }
+ }
+ }
+
+ point_input_samples += strategy::output_cols();
+ }
+ }
+ }
+
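+ // Process this channel's tile: the kernel produces
+ // channel_multiplier output channels per input channel.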
+ tile_fn(inptrs, outptr_array, params, in_c, in_c*this->m_args.channel_multiplier);
+
+ // Progress the output pointers
+ TOutput **outptr_pos = outptr_array;
+ for (auto i = 0u; i < strategy::output_rows() * strategy::output_cols(); i++)
+ {
+ outptr_pos[i] += this->m_args.channel_multiplier;
+ }
+
+ // Progress the pointer into the parameters
+ params += param_stride;
+ }
+ }
+ }
+ }
+ }
+
+ public:
+ DepthwiseDepthfirstGenericWithMultiplierBase(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
+ {
+ }
+
+ DepthwiseDepthfirstGenericWithMultiplierBase(DepthwiseDepthfirstGenericWithMultiplierBase &) = delete;
+ DepthwiseDepthfirstGenericWithMultiplierBase &operator=(DepthwiseDepthfirstGenericWithMultiplierBase &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(strategy::vl_type);
+ const auto rounded_channels = this->m_args.input_channels * arm_gemm::roundup(this->m_args.channel_multiplier, vl);
+ return kernel_points() * rounded_channels * sizeof(TWeight);
+ }
+
+ size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+ {
+ const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
+ return n_threads * (sizeof_inptr_array() +
+ sizeof_input_samples() +
+ sizeof_outptr_array() +
+ sizeof_output_buffer(n_output_channels));
+ }
+};
+
+template <class strategy>
+class DepthwiseDepthfirstGenericWithMultiplier : public DepthwiseDepthfirstGenericWithMultiplierBase<strategy>
+{
+ using TInput = typename strategy::input_type;
+ using TWeight = typename strategy::weight_type;
+ using TOutput = typename strategy::return_type;
+ using TAccum = typename strategy::bias_type;
+
+ using Parent = DepthwiseDepthfirstGenericWithMultiplierBase<strategy>;
+
+ const TAccum *m_biases; // Pointer to bias vector
+
+ public:
+ DepthwiseDepthfirstGenericWithMultiplier(const DepthwiseArgs &args)
+ : Parent(args), m_biases(nullptr)
+ {
+ }
+
+ DepthwiseDepthfirstGenericWithMultiplier(DepthwiseDepthfirstGenericWithMultiplier &) = delete;
+ DepthwiseDepthfirstGenericWithMultiplier &operator=(DepthwiseDepthfirstGenericWithMultiplier &) = delete;
+
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ m_biases = static_cast<const TAccum *>(biases);
+ Parent::pack_weights(static_cast<TWeight *>(buffer), static_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row);
+ }
+
+ using Parent::execute;
+ void execute(
+ const unsigned int batches,
+ const unsigned int input_height,
+ const unsigned int input_width,
+ const unsigned int input_channels,
+ const PaddingValues &padding,
+ const void *const _input,
+ const size_t ld_input_col,
+ const size_t ld_input_row,
+ const size_t ld_input_batch,
+ const void *const parameters,
+ const unsigned int output_height,
+ const unsigned int output_width,
+ void *const _output,
+ const size_t ld_output_col,
+ const size_t ld_output_row,
+ const size_t ld_output_batch,
+ void *const _working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads
+ ) const override
+ {
+ strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+ arm_gemm::profiler prof;
+#endif
+
+ // Compute activation values
+ TAccum activation_min, activation_max;
+ if (std::numeric_limits<TAccum>::is_integer)
+ {
+ activation_min = std::numeric_limits<TAccum>::min();
+ activation_max = std::numeric_limits<TAccum>::max();
+ }
+ else
+ {
+ activation_min = static_cast<TAccum>(-std::numeric_limits<float>::infinity());
+ activation_max = static_cast<TAccum>(std::numeric_limits<float>::infinity());
+ }
+
+ switch (this->m_args.activation.type)
+ {
+ case arm_gemm::Activation::Type::BoundedReLU:
+ activation_max = static_cast<TAccum>(this->m_args.activation.param1);
+ // Fall through
+ case arm_gemm::Activation::Type::ReLU:
+ activation_min = static_cast<TAccum>(0);
+ break;
+ default:
+ break;
+ }
+
+ // Get a function to call for each input channel of each output tile
+ auto tile_fn = [&] (const TInput **inptrs,
+ TOutput **outptrs,
+ const TWeight *weights,
+ const unsigned int,
+ const unsigned int start_output_channel) {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows() * strategy::output_cols() * this->m_args.channel_multiplier * this->m_args.kernel_rows * this->m_args.kernel_cols));
+#endif
+ strat.kernel(
+ inptrs, outptrs, weights,
+ m_biases ? m_biases + start_output_channel : nullptr,
+ this->kernel_points(), this->m_args.channel_multiplier,
+ activation_min, activation_max
+ );
+ };
+
+ Parent::execute_tiles(
+ tile_fn, 0.0f,
+ batches, input_height, input_width, input_channels, padding,
+ _input, ld_input_col, ld_input_row, ld_input_batch,
+ parameters,
+ output_height, output_width,
+ _output, ld_output_col, ld_output_row, ld_output_batch,
+ _working_space, thread_id, n_threads
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp
new file mode 100644
index 000000000..d42382e20
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+#include "depthwise_depthfirst_generic_multiplier.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class strategy>
+class DepthwiseDepthfirstGenericWithMultiplierQuantized : public DepthwiseDepthfirstGenericWithMultiplierBase<strategy>
+{
+ using TInput = typename strategy::input_type;
+ using TWeight = typename strategy::weight_type;
+ using TOutput = typename strategy::return_type;
+ using TAccum = typename strategy::bias_type;
+
+ using Parent = DepthwiseDepthfirstGenericWithMultiplierBase<strategy>;
+
+ arm_gemm::Requantize32 m_qp;
+
+ public:
+ DepthwiseDepthfirstGenericWithMultiplierQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp)
+ : Parent(args), m_qp(qp)
+ {
+ }
+
+ DepthwiseDepthfirstGenericWithMultiplierQuantized(DepthwiseDepthfirstGenericWithMultiplierQuantized &) = delete;
+ DepthwiseDepthfirstGenericWithMultiplierQuantized &operator=(DepthwiseDepthfirstGenericWithMultiplierQuantized &) = delete;
+
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ m_qp.bias = static_cast<const TAccum *>(biases);
+ Parent::pack_weights(static_cast<TWeight *>(buffer), static_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row);
+ }
+
+ using Parent::execute;
+ void execute(
+ const unsigned int batches,
+ const unsigned int input_height,
+ const unsigned int input_width,
+ const unsigned int input_channels,
+ const PaddingValues &padding,
+ const void *const _input,
+ const size_t ld_input_col,
+ const size_t ld_input_row,
+ const size_t ld_input_batch,
+ const void *const parameters,
+ const unsigned int output_height,
+ const unsigned int output_width,
+ void *const _output,
+ const size_t ld_output_col,
+ const size_t ld_output_row,
+ const size_t ld_output_batch,
+ void *const _working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads
+ ) const override
+ {
+ strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+ arm_gemm::profiler prof;
+#endif
+
+ // Get a function to call for each input channel of each output tile
+ auto tile_fn = [&] (const TInput **inptrs,
+ TOutput **outptrs,
+ const TWeight *weights,
+ const unsigned int,
+ const unsigned int start_output_channel) {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows() * strategy::output_cols() * this->m_args.channel_multiplier * this->m_args.kernel_rows * this->m_args.kernel_cols));
+#endif
+ strat.kernel(
+ inptrs, outptrs, weights,
+ m_qp.bias == nullptr ? nullptr : m_qp.bias + start_output_channel,
+ this->kernel_points(),
+ this->m_args.channel_multiplier,
+ m_qp.per_channel_left_shifts == nullptr ? nullptr : m_qp.per_channel_left_shifts + start_output_channel,
+ m_qp.per_channel_muls == nullptr ? nullptr : m_qp.per_channel_muls + start_output_channel,
+ m_qp.per_channel_right_shifts == nullptr ? nullptr : m_qp.per_channel_right_shifts + start_output_channel,
+ m_qp
+ );
+ };
+
+ Parent::execute_tiles(
+ tile_fn, m_qp.a_offset,
+ batches, input_height, input_width, input_channels, padding,
+ _input, ld_input_col, ld_input_row, ld_input_batch,
+ parameters,
+ output_height, output_width,
+ _output, ld_output_col, ld_output_row, ld_output_batch,
+ _working_space, thread_id, n_threads
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp
new file mode 100644
index 000000000..cfb0d4bc0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise_depthfirst_generic.hpp"
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class Strategy, unsigned int OutputRows, unsigned int OutputCols>
+class DepthwiseDepthfirstGenericQuantized : public DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>
+{
+ using Parent = DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>;
+ using TInput = typename Parent::TInput;
+ using TAccum = typename Parent::TAccum;
+ using TOutput = typename Parent::TOutput;
+
+ Requantize32 m_qp;
+
+ public:
+ DepthwiseDepthfirstGenericQuantized(const DepthwiseArgs &args, const Requantize32 &qp)
+ : Parent(args), m_qp(qp)
+ {
+ }
+
+ DepthwiseDepthfirstGenericQuantized(DepthwiseDepthfirstGenericQuantized &) = delete;
+ DepthwiseDepthfirstGenericQuantized &operator=(DepthwiseDepthfirstGenericQuantized &) = delete;
+
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ m_qp.bias = static_cast<const TAccum *>(biases);
+ Parent::pack_parameters(buffer, biases, weights, ld_weight_col, ld_weight_row);
+ }
+
+ using Parent::execute;
+ void execute(
+ const unsigned int batches,
+ const unsigned int input_height,
+ const unsigned int input_width,
+ const unsigned int input_channels,
+ const PaddingValues &padding,
+ const void *const _input,
+ const size_t ld_input_col,
+ const size_t ld_input_row,
+ const size_t ld_input_batch,
+ const void *const parameters,
+ const unsigned int output_height,
+ const unsigned int output_width,
+ void *const _output,
+ const size_t ld_output_col,
+ const size_t ld_output_row,
+ const size_t ld_output_batch,
+ void *const _working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads
+ ) const override
+ {
+ Strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+ arm_gemm::profiler prof;
+#endif
+
+ // Create a function to initialise the input buffer
+ const auto initialise_input_buffer = [this] (TInput *const buffer, const unsigned int n) {
+ std::memset(buffer, static_cast<TInput>(m_qp.a_offset), n * sizeof(TInput));
+ };
+
+ // Create a function to execute a tile of work
+ const auto tile_fn = [&] (const TInput *const *const inptrs, TOutput *const * const outptrs) {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(
+ PROFILE_KERNEL,
+ (unsigned long) (OutputRows * OutputCols * this->m_args.kernel_rows * this->m_args.kernel_cols)
+ );
+#endif
+ strat.kernel(inptrs, outptrs, parameters, m_qp,
+ this->m_args.kernel_rows * this->m_args.kernel_cols,
+ this->m_args.input_channels);
+ };
+
+ // Call into a parent utility function to do the actual work.
+ Parent::execute_tiles(
+ tile_fn, initialise_input_buffer,
+ batches, input_height, input_width, input_channels, padding,
+ _input, ld_input_col, ld_input_row, ld_input_batch,
+ output_height, output_width,
+ _output, ld_output_col, ld_output_row, ld_output_batch,
+ _working_space, thread_id, n_threads
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
new file mode 100644
index 000000000..7c64e0be6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace common
+{
+ template <typename strategy, typename F>
+ void depthwise_multiplier_execute(
+ const F execute_tile,
+ typename strategy::input_type pad_value,
+ const DepthwiseArgs &args,
+ const unsigned int batches,
+ const unsigned int input_height,
+ const unsigned int input_width,
+ const unsigned int input_channels,
+ const PaddingValues &padding,
+ const void *const _input,
+ const size_t ld_input_col,
+ const size_t ld_input_row,
+ const size_t ld_input_batch,
+ const void *const parameters,
+ const size_t param_stride,
+ const unsigned int output_height,
+ const unsigned int output_width,
+ void *const _output,
+ const size_t ld_output_col,
+ const size_t ld_output_row,
+ const size_t ld_output_batch,
+ void *const _working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads
+ )
+ {
+ using TInput = typename strategy::input_type;
+ using TOutput = typename strategy::return_type;
+
+ // Determine what portion of the work to do.
+ const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
+ const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
+ const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
+
+ // Cast input and output pointers into the right types
+ const TInput *const inptr = static_cast<const TInput *>(_input);
+ TOutput *const outptr = static_cast<TOutput *>(_output);
+
+ // To simplify the kernel, we process padded or non-NCHW-ordered input into
+ // a form which can be consumed by the kernel. This data is stored here and
+ // passed into the kernel as an array of N pointers (one per row of the
+ // input).
+ TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*(16 / sizeof(TInput))];
+ const TInput *inptrs[strategy::input_rows];
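+
+ // (Each row buffer is padded to a whole number of 16-byte quads,
+ //  i.e. input_col_quads * (16 / sizeof(TInput)) elements, presumably
+ //  because the kernel consumes full vector registers at a time.)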
+
+ // Create an array for the output pointers
+ TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
+ TOutput **const outptr_array = _outptr_array;
+
+ // Allocate portions of the working space
+ uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
+ TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
+
+ // For each output tile, construct the requisite set of pointers and call
+ // into the kernel.
+ for (unsigned int batch = 0; batch < batches; batch++)
+ {
+ // Get batch pointers
+ const auto inptr_batch = inptr + batch * ld_input_batch;
+ const auto outptr_batch = outptr + batch * ld_output_batch;
+
+ for (int start_out_i = start_out_height;
+ start_out_i < end_out_height;
+ start_out_i += static_cast<int>(strategy::output_rows))
+ {
+ const int end_out_i = start_out_i + strategy::output_rows;
+ const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
+ const int end_in_i = start_in_i + strategy::input_rows;
+
+ // Compute top/bottom padding
+ const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
+ const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
+ const unsigned int valid_output_rows = std::min(
+ end_out_i - start_out_i,
+ static_cast<int>(output_height) - start_out_i
+ );
+
+ for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
+ {
+ const int start_in_j = start_out_j * strategy::stride_cols - args.padding.left;
+ const int pad_left = -std::min(0, start_in_j);
+
+ const int end_out_j = start_out_j + strategy::output_cols;
+ const int end_in_j = start_in_j + strategy::input_cols;
+
+ const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
+ const unsigned int valid_output_cols = std::min(
+ end_out_j - start_out_j,
+ static_cast<int>(output_width) - start_out_j
+ );
+
+ // Construct the output pointer array.
+ TOutput **outptr_pos = outptr_array;
+ for (auto i = 0u; i < valid_output_rows; i++)
+ {
+ unsigned int j = 0u;
+ TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
+ for (; j < valid_output_cols; j++)
+ {
+ *(outptr_pos++) = colptr;
+ colptr += ld_output_col;
+ }
+ for (; j < strategy::output_cols; j++)
+ {
+ *(outptr_pos++) = output_buffer;
+ }
+ }
+ for (auto i = valid_output_rows; i < strategy::output_rows; i++)
+ {
+ for (auto j = 0u; j < strategy::output_cols; j++)
+ {
+ *(outptr_pos++) = output_buffer;
+ }
+ }
+
+ start_out_j += strategy::output_cols;
+
+ const uint8_t *params = static_cast<const uint8_t *>(parameters);
+
+ // Loop over the input channels
+ for (unsigned int in_c = 0; in_c < input_channels; in_c++)
+ {
+ // Construct the input array - first fill with padding values and
+ // then fill in correct values.
+ for (unsigned int i = 0; i < strategy::input_rows; i++)
+ {
+ for (unsigned int j = 0;
+ j < (16 / sizeof(TInput)) * strategy::input_col_quads; j++)
+ {
+ rearranged_input[i][j] = pad_value;
+ }
+ inptrs[i] = rearranged_input[i];
+ }
+
+ auto inptr_row = inptr_batch + in_c +
+ (start_in_i + pad_top) * ld_input_row +
+ (start_in_j + pad_left) * ld_input_col;
+ if (ld_input_col == 1 && !pad_left &&
+ start_in_j + (16 / sizeof(TInput)) * strategy::input_col_quads < input_width)
+ {
+ // The input tensor is already in NCHW format, and we're reading
+ // an unpadded section of it - allow the kernel to read it
+ // directly.
+ for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
+ {
+ inptrs[i] = inptr_row;
+ inptr_row += ld_input_row;
+ }
+ }
+ else
+ {
+ // Either the input tensor isn't in NCHW format, or we're reading
+ // a padded section. Copy the relevant portion of the input here
+ // and allow the kernel to read this.
+ for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
+ {
+ auto inptr_col = inptr_row;
+ for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
+ {
+ rearranged_input[i][j] = *inptr_col;
+ inptr_col += ld_input_col;
+ }
+ inptr_row += ld_input_row;
+ }
+ }
+
+ execute_tile(inptrs, outptr_array, params);
+
+ // Progress the output pointers
+ TOutput **outptr_pos = outptr_array;
+ for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
+ {
+ outptr_pos[i] += args.channel_multiplier;
+ }
+
+ // Progress the pointer into the parameters
+ params += param_stride;
+ }
+ }
+ }
+ }
+ }
+}
+
+template <class strategy>
+class DepthwiseDepthfirstWithMultiplier :
+ public DepthwiseCommon<typename strategy::input_type,
+ typename strategy::weight_type,
+ typename strategy::return_type>
+{
+ using TInput = typename strategy::input_type;
+ using TWeight = typename strategy::weight_type;
+ using TOutput = typename strategy::return_type;
+ using TAccum = typename strategy::bias_type;
+
+ size_t sizeof_output_buffer(unsigned int n_channels) const
+ {
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
+ const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+ return sizeof(TOutput) * rounded_channels;
+ }
+
+ public:
+ DepthwiseDepthfirstWithMultiplier(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
+ {
+ }
+
+ DepthwiseDepthfirstWithMultiplier(DepthwiseDepthfirstWithMultiplier &) = delete;
+ DepthwiseDepthfirstWithMultiplier &operator=(DepthwiseDepthfirstWithMultiplier &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ // TODO What if we insert extra padding? Biases are a different size from the inputs, ...
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
+ const auto rounded_channels = this->m_args.input_channels * arm_gemm::roundup(this->m_args.channel_multiplier, vl);
+ return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight);
+ }
+
+ void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ // TODO What if the kernel needs a different packing function?
+
+ // Cast the pointers
+ float *buffer = static_cast<float *>(_buffer);
+ const float *biases = static_cast<const float *>(_biases);
+ const float *const weights = static_cast<const float *>(_weights);
+
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
+ ld_weight_col = (ld_weight_col == 0) ? this->m_args.channel_multiplier * this->m_args.input_channels : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+
+ for (unsigned int in_c = 0; in_c < this->m_args.input_channels; in_c++)
+ {
+ for (unsigned int n = 0; n < this->m_args.channel_multiplier; n += vl)
+ {
+ const unsigned int out_c = in_c * this->m_args.channel_multiplier + n;
+ const unsigned int todo = std::min(vl, this->m_args.channel_multiplier - n);
+
+ // Copy across the correct amount of bias (or 0)
+ for (unsigned int i = 0; i < todo; i++)
+ {
+ buffer[i] = (biases == nullptr) ? 0 : biases[out_c + i];
+ }
+ buffer += vl;
+
+ // Copy each of the weights in turn
+ auto weights_row = weights + out_c;
+ for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
+ {
+ auto weights_col = weights_row;
+
+ for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
+ {
+ for (unsigned int m = 0; m < todo; m++)
+ {
+ buffer[m] = weights_col[m];
+ }
+ buffer += vl;
+
+ weights_col += ld_weight_col;
+ }
+
+ weights_row += ld_weight_row;
+ }
+ }
+ }
+ }
+
+ size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+ {
+ const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
+ return n_threads * sizeof_output_buffer(n_output_channels);
+ }
+
+ using DepthwiseCommon<typename strategy::input_type, typename strategy::weight_type, typename strategy::return_type>::execute;
+ void execute(
+ const unsigned int batches,
+ const unsigned int input_height,
+ const unsigned int input_width,
+ const unsigned int input_channels,
+ const PaddingValues &padding,
+ const void *const _input,
+ const size_t ld_input_col,
+ const size_t ld_input_row,
+ const size_t ld_input_batch,
+ const void *const parameters,
+ const unsigned int output_height,
+ const unsigned int output_width,
+ void *const _output,
+ const size_t ld_output_col,
+ const size_t ld_output_row,
+ const size_t ld_output_batch,
+ void *const _working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads
+ ) const override
+ {
+ strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+ arm_gemm::profiler prof;
+#endif
+
+ // Compute activation values
+ TAccum activation_min = std::numeric_limits<TAccum>::has_infinity ? -std::numeric_limits<TAccum>::infinity() : std::numeric_limits<TAccum>::min();
+ TAccum activation_max = std::numeric_limits<TAccum>::has_infinity ? std::numeric_limits<TAccum>::infinity() : std::numeric_limits<TAccum>::max();
+
+ switch (this->m_args.activation.type)
+ {
+ case arm_gemm::Activation::Type::BoundedReLU:
+ activation_max = static_cast<TAccum>(this->m_args.activation.param1);
+ // Fall through
+ case arm_gemm::Activation::Type::ReLU:
+ activation_min = static_cast<TAccum>(0);
+ break;
+ default:
+ break;
+ }
+
+ // Determine what portion of the work to do.
+ const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
+ const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
+ const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
+
+ // Need a stride over blocks of parameters
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
+ const unsigned int param_stride =
+ arm_gemm::roundup(this->m_args.channel_multiplier, vl) *
+ (sizeof(TAccum) + sizeof(TWeight) * strategy::kernel_rows * strategy::kernel_cols);
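+ // (This matches the layout written by pack_parameters(): per block of
+ //  channel_multiplier outputs, a padded bias vector followed by one
+ //  padded weight vector per kernel point.)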
+
+ // Cast input and output pointers into the right types
+ const TInput *const inptr = static_cast<const TInput *>(_input);
+ TOutput *const outptr = static_cast<TOutput *>(_output);
+
+    // To simplify the kernel, padded or non-NCHW-ordered input is first
+    // rearranged into a form which the kernel can consume. This data is
+    // stored here and passed into the kernel as an array of N pointers (one
+    // per row of the input).
+ TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*4];
+ const TInput *inptrs[strategy::input_rows];
+
+ // Create an array for the output pointers
+ TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
+ TOutput **const outptr_array = _outptr_array;
+
+ // Allocate portions of the working space
+ uint8_t *const working_space = static_cast<uint8_t *>(_working_space) + get_working_size(thread_id, input_channels);
+ TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
+
+ // For each output tile, construct the requisite set of pointers and call
+ // into the kernel.
+ for (unsigned int batch = 0; batch < batches; batch++)
+ {
+ // Get batch pointers
+ const auto inptr_batch = inptr + batch * ld_input_batch;
+ const auto outptr_batch = outptr + batch * ld_output_batch;
+
+ for (int start_out_i = start_out_height;
+ start_out_i < end_out_height;
+ start_out_i += static_cast<int>(strategy::output_rows))
+ {
+ const int end_out_i = start_out_i + strategy::output_rows;
+ const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
+ const int end_in_i = start_in_i + strategy::input_rows;
+
+ // Compute top/bottom padding
+ const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
+ const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
+ const unsigned int valid_output_rows = std::min(
+ end_out_i - start_out_i,
+ static_cast<int>(output_height) - start_out_i
+ );
+
+ for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
+ {
+ const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left;
+ const int pad_left = -std::min(0, start_in_j);
+
+ const int end_out_j = start_out_j + strategy::output_cols;
+ const int end_in_j = start_in_j + strategy::input_cols;
+
+ const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
+ const unsigned int valid_output_cols = std::min(
+ end_out_j - start_out_j,
+ static_cast<int>(output_width) - start_out_j
+ );
+
+ // Construct the output pointer array.
+ TOutput **outptr_pos = outptr_array;
+ for (auto i = 0u; i < valid_output_rows; i++)
+ {
+ unsigned int j = 0u;
+ TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
+ for (; j < valid_output_cols; j++)
+ {
+ *(outptr_pos++) = colptr;
+ colptr += ld_output_col;
+ }
+ for (; j < strategy::output_cols; j++)
+ {
+ *(outptr_pos++) = output_buffer;
+ }
+ }
+ for (auto i = valid_output_rows; i < strategy::output_rows; i++)
+ {
+ for (auto j = 0u; j < strategy::output_cols; j++)
+ {
+ *(outptr_pos++) = output_buffer;
+ }
+ }
+
+ start_out_j += strategy::output_cols;
+
+ const uint8_t *params = static_cast<const uint8_t *>(parameters);
+
+ // Loop over the input channels
+ for (unsigned int in_c = 0; in_c < input_channels; in_c++)
+ {
+            // Construct the input array: first fill it with padding values,
+            // then fill in the correct values.
+ for (unsigned int i = 0; i < strategy::input_rows; i++)
+ {
+ for (unsigned int j = 0; j < 4 * strategy::input_col_quads; j++)
+ {
+ rearranged_input[i][j] = static_cast<TInput>(0);
+ }
+ inptrs[i] = rearranged_input[i];
+ }
+
+ auto inptr_row = inptr_batch + in_c +
+ (start_in_i + pad_top) * ld_input_row +
+ (start_in_j + pad_left) * ld_input_col;
+ if (ld_input_col == 1 && !pad_left &&
+ start_in_j + 4 * strategy::input_col_quads < input_width)
+ {
+ // The input tensor is already in NCHW format, and we're reading
+ // an unpadded section of it - allow the kernel to read it
+ // directly.
+ for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
+ {
+ inptrs[i] = inptr_row;
+ inptr_row += ld_input_row;
+ }
+ }
+ else
+ {
+ // Either the input tensor isn't in NCHW format, or we're reading
+ // a padded section. Copy the relevant portion of the input here
+ // and allow the kernel to read this.
+ for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
+ {
+ auto inptr_col = inptr_row;
+ for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
+ {
+ rearranged_input[i][j] = *inptr_col;
+ inptr_col += ld_input_col;
+ }
+ inptr_row += ld_input_row;
+ }
+ }
+
+ {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.channel_multiplier * strategy::kernel_rows * strategy::kernel_cols));
+#endif
+ strat.kernel(
+ inptrs, outptr_array, params,
+ this->m_args.channel_multiplier,
+ activation_min, activation_max
+ );
+ }
+
+ // Progress the output pointers
+ TOutput **outptr_pos = outptr_array;
+ for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
+ {
+ outptr_pos[i] += this->m_args.channel_multiplier;
+ }
+
+ // Progress the pointer into the parameters
+ params += param_stride;
+ }
+ }
+ }
+ }
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp
new file mode 100644
index 000000000..07ce0d3b5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthwise_depthfirst_multiplier.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+template <class strategy>
+class DepthwiseDepthfirstWithMultiplierQuantized :
+ public DepthwiseCommon<typename strategy::input_type,
+ typename strategy::weight_type,
+ typename strategy::return_type>
+{
+ using Parent = DepthwiseCommon<typename strategy::input_type,
+ typename strategy::weight_type,
+ typename strategy::return_type>;
+ using TInput = typename strategy::input_type;
+ using TWeight = typename strategy::weight_type;
+ using TOutput = typename strategy::return_type;
+
+ const arm_gemm::Requantize32 m_qp;
+
+ size_t sizeof_output_buffer(unsigned int n_channels) const
+ {
+ const unsigned int vl = arm_gemm::utils::get_vector_length<typename strategy::return_type>(strategy::vl_type);
+ const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+ return sizeof(typename strategy::return_type) * rounded_channels;
+ }
+
+ public:
+ DepthwiseDepthfirstWithMultiplierQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp)
+ : Parent(args), m_qp(qp)
+ {
+ }
+
+ DepthwiseDepthfirstWithMultiplierQuantized(DepthwiseDepthfirstWithMultiplierQuantized &) = delete;
+ DepthwiseDepthfirstWithMultiplierQuantized &operator=(DepthwiseDepthfirstWithMultiplierQuantized &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+    // We produce VL<int32_t> channels at a time; for each of these blocks of
+    // channels we store a vector of biases, weights (ravelled into a
+    // dot-product friendly layout) and requantisation parameters.
+ const unsigned int iter_length =
+ arm_gemm::utils::get_vector_length<int32_t>(strategy::vl_type);
+ const unsigned int n_iters =
+ this->m_args.input_channels * arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length);
+
+ // Compute the cost of storing the weights
+ const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u);
+
+ return n_iters * iter_length * (
+ sizeof(int32_t) + // Bias
+ 4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(TWeight) + // Weights
+ 2 * sizeof(int32_t) // Requantisation parameters
+ );
+ }
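+
+  // Editorial sketch of the arithmetic above: for an int8 3x3 dot-product
+  // kernel with iter_length = 4, n_dots_per_kernel_row = ceil(3 / 4) = 1 and
+  // each iteration block occupies
+  //   4 * (4 + 4 * 1 * 3 * 1 + 8) = 96 bytes
+  // covering the bias, the ravelled weights and the two requantisation
+  // vectors respectively.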
+
+ // We'll want an optimised version of this, but for now a C++ implementation
+ // is probably sufficient.
+ void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ auto buffer = static_cast<uint8_t *>(_buffer);
+ auto biases = static_cast<const int32_t *>(_biases);
+ auto weights = static_cast<const TWeight *>(_weights);
+ auto requant_muls = m_qp.per_channel_muls;
+ auto requant_shifts = m_qp.per_channel_right_shifts;
+
+ const unsigned int iter_length =
+ arm_gemm::utils::get_vector_length<int32_t>(strategy::vl_type);
+ const unsigned int n_iters_per_input_channel =
+ arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length);
+
+ const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u);
+
+ const size_t iter_stride = iter_length * (
+ sizeof(int32_t) + // Bias
+ 4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(int8_t) + // Weights
+ 2 * sizeof(int32_t) // Requantisation parameters
+ );
+
+ ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels * this->m_args.channel_multiplier : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+
+ for (unsigned int input_channel = 0; input_channel < this->m_args.input_channels; input_channel++)
+ {
+ auto buffer_input_channel = buffer + input_channel * n_iters_per_input_channel * iter_stride;
+ auto weights_input_channel = weights + input_channel * this->m_args.channel_multiplier;
+
+ for (unsigned int iter = 0; iter < n_iters_per_input_channel; iter++)
+ {
+        // Get a pointer to the start of this portion of the buffer, then
+        // derive pointers to the bias, weight and requantisation portions of
+        // this frame.
+ auto buffer_base = buffer_input_channel + iter_stride * iter;
+ auto buffer_biases = reinterpret_cast<int32_t *>(buffer_base);
+ auto buffer_weights = buffer_base + sizeof(int32_t) * iter_length;
+ auto buffer_requant_mul = reinterpret_cast<int32_t *>(
+ buffer_weights + strategy::kernel_rows * n_dots_per_kernel_row * 4 * iter_length);
+ auto buffer_requant_shift = buffer_requant_mul + iter_length;
+ auto weights_base = weights_input_channel + iter * iter_length;
+
+ // Hence work through the data for this iteration, on a
+ // channel-by-channel basis.
+ const auto this_iter_length = std::min<unsigned int>(
+ iter_length, this->m_args.channel_multiplier - iter * iter_length
+ );
+ for (unsigned int i = 0; i < this_iter_length; i++)
+ {
+ auto weights_channel = weights_base + i;
+
+          // Read the bias value; we modify this as we read the weights.
+ auto bias_value = biases == nullptr ? 0 : *(biases++);
+ int32_t elements_sum = 0;
+
+ // Read through the kernel; for each row, marshal together as many dot
+ // product terms as are required.
+ for (unsigned int ki = 0; ki < strategy::kernel_rows; ki++)
+ {
+ auto buffer_row = buffer_weights + i*4 + ki * 4 * n_dots_per_kernel_row * iter_length;
+ auto weights_row = weights_channel + ki * ld_weight_row;
+
+ unsigned int kj = 0;
+ for (; kj < strategy::kernel_cols; kj++)
+ {
+              // Determine the dot-product group and element to which we're writing
+ const auto dot = kj / 4;
+ const auto elem = kj % 4;
+
+ // Copy the value; include in the sum
+ const auto val = weights_row[kj * ld_weight_col];
+ buffer_row[dot * 4 * iter_length + elem] = val;
+ elements_sum += val;
+ }
+ for (; kj < 4 * n_dots_per_kernel_row; kj++)
+ {
+ const auto dot = kj / 4;
+ const auto elem = kj % 4;
+ buffer_row[dot * 4 * iter_length + elem] = 0;
+ }
+
+ buffer_row += 4 * n_dots_per_kernel_row * iter_length;
+ }
+
+ // Write back the bias and offset values
+ *(buffer_biases++) =
+ bias_value - m_qp.a_offset * elements_sum +
+ strategy::kernel_rows * strategy::kernel_cols * m_qp.a_offset * m_qp.b_offset;
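+
+          // Editorial derivation: expanding the quantised accumulation
+          //   sum_k (x_k - a_offset) * (w_k - b_offset)
+          //     = sum_k x_k w_k - b_offset * sum_k x_k
+          //       - a_offset * sum_k w_k + K * a_offset * b_offset
+          // with K = kernel_rows * kernel_cols shows that the two terms
+          // depending only on the weights can be folded into the bias, as is
+          // done above; the term in sum_k x_k is handled at execution time.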
+
+ // Write out the requantisation parameters
+ *(buffer_requant_mul++) = m_qp.per_channel_requant ? *(requant_muls++) : m_qp.per_layer_mul;
+ *(buffer_requant_shift++) = m_qp.per_channel_requant ? *(requant_shifts++) : m_qp.per_layer_right_shift;
+ }
+ }
+ }
+ }
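+
+  // Editorial illustration: with iter_length = 4 and a 3x3 int8 kernel (one
+  // dot-product group per row), each 96-byte frame written above is laid out
+  // as:
+  //   bytes  0..15   four int32 biases (already adjusted for the offsets)
+  //   bytes 16..63   weights, ravelled into 4-element dot-product groups
+  //   bytes 64..79   four int32 requantisation multipliers
+  //   bytes 80..95   four int32 requantisation shifts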
+
+ size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+ {
+ const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
+ return n_threads * sizeof_output_buffer(n_output_channels);
+ }
+
+ using Parent::execute;
+ void execute(
+ const unsigned int batches,
+ const unsigned int input_height,
+ const unsigned int input_width,
+ const unsigned int input_channels,
+ const PaddingValues &padding,
+ const void *const _input,
+ const size_t ld_input_col,
+ const size_t ld_input_row,
+ const size_t ld_input_batch,
+ const void *const parameters,
+ const unsigned int output_height,
+ const unsigned int output_width,
+ void *const _output,
+ const size_t ld_output_col,
+ const size_t ld_output_row,
+ const size_t ld_output_batch,
+ void *const _working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads
+ ) const override
+ {
+ strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+ arm_gemm::profiler prof;
+#endif
+
+ auto executefn = [strat, this] (
+ const TInput *const *const inptrs,
+ TOutput *const *const outptr_array,
+ const void *const params
+ ) {
+ strat.kernel(inptrs, outptr_array, params, this->m_args.channel_multiplier, m_qp);
+ };
+
+ // Get working space for this thread
+ uint8_t *const working_space = static_cast<uint8_t *>(_working_space) + get_working_size(1, input_channels) * thread_id;
+
+ // Determine the stride across blocks of parameters
+ const unsigned int iter_length =
+ arm_gemm::utils::get_vector_length<int32_t>(strategy::vl_type);
+ const unsigned int n_iters_per_input_channel = arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length);
+ const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u);
+ const size_t param_stride = n_iters_per_input_channel * iter_length * (
+ sizeof(int32_t) + // Bias
+ 4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(int8_t) + // Weights
+ 2 * sizeof(int32_t) // Requantisation parameters
+ );
+
+ common::depthwise_multiplier_execute<strategy>(
+ executefn, m_qp.a_offset, this->m_args,
+ batches, input_height, input_width, input_channels, padding,
+ _input, ld_input_col, ld_input_row, ld_input_batch,
+ parameters, param_stride,
+ output_height, output_width,
+ _output, ld_output_col, ld_output_row, ld_output_batch,
+ working_space, thread_id, n_threads
+ );
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp
new file mode 100644
index 000000000..f97569e95
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+
+// We have two sets of quantized kernels: those which use the dot-product
+// instructions and require the biases and quantisation parameters to be
+// ravelled into the weights/parameter array, and those which use the MLAL
+// instructions and consume separate bias and quantisation parameter arrays.
+// The following code adapts these two sets of kernels to a common API,
+// allowing the same driver loop to call both.
+
+template <typename TIn, typename TWeight, typename TOut>
+using UnravelledKernFn = std::function<void(unsigned int, const TIn *const *, const TWeight *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, TOut *const *)>;
+
+template <typename TIn, typename TOut>
+using RavelledKernFn = std::function<void(const TIn *const *, TOut *const *, const void *, uint64_t, const arm_gemm::Requantize32 &)>;
+
+template <typename TIn, typename TWeight, typename TOut>
+const UnravelledKernFn<TIn, TWeight, TOut> get_unified_kernel(const UnravelledKernFn<TIn, TWeight, TOut> &f) { return f; }
+
+template <typename TIn, typename TWeight, typename TOut>
+const UnravelledKernFn<TIn, TWeight, TOut> get_unified_kernel(const RavelledKernFn<TIn, TOut> &f)
+{
+ return [f] (const unsigned int n_channels,
+ const TIn *const *const inptrs,
+ const TWeight *const weights,
+ const int32_t *, // Bias (ravelled)
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *, // Requantisation muls (ravelled)
+ const int32_t *, // Requantisation shifts (ravelled)
+ TOut *const *const outptrs) {
+ return f(inptrs, outptrs, weights, n_channels, qp);
+ };
+}
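+
+// Editorial sketch: with these adapters the driver loop can treat both kernel
+// families uniformly, e.g. (hypothetical u8q instantiation):
+//
+//   auto kernel = get_unified_kernel<uint8_t, uint8_t, uint8_t>(strat.kernel);
+//   kernel(n_channels, inptrs, weights, bias_ptr, qp, muls, shifts, outptrs);
+//
+// For a ravelled kernel the bias and requantisation arguments are ignored,
+// since those values were packed alongside the weights.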
+
+template <typename T>
+using UnravelledPackingFn = std::function<void(unsigned int, void *, const T *, size_t, size_t)>;
+
+template <typename T>
+using RavelledPackingFn = std::function<void(unsigned int, void *, const int32_t *, const T *, const arm_gemm::Requantize32 &, size_t, size_t)>;
+
+template <typename T>
+const RavelledPackingFn<T> get_unified_packer(const UnravelledPackingFn<T> &f)
+{
+ return [f] (const unsigned int n_channels,
+ void *buffer,
+ const int32_t *, // Bias
+ const T *weights,
+ const arm_gemm::Requantize32 &,
+ size_t ld_weight_col,
+ size_t ld_weight_row)
+ {
+ return f(n_channels, buffer, weights, ld_weight_col, ld_weight_row);
+ };
+}
+
+template <typename T>
+const RavelledPackingFn<T> get_unified_packer(const RavelledPackingFn<T> &f) { return f; }
+
+template <typename T>
+constexpr bool requires_unravelled_bias_and_quant_params(const UnravelledPackingFn<T> &) { return true; }
+
+template <typename T>
+constexpr bool requires_unravelled_bias_and_quant_params(const RavelledPackingFn<T> &) { return false; }
+
+template <class strategy>
+constexpr bool strategy_requires_unravelled_bias_and_quant_params(void)
+{
+ return requires_unravelled_bias_and_quant_params<typename strategy::weight_type>(strategy::pack_parameters);
+}
+
+}
+
+template <class strategy>
+class DepthwiseDepthfirstQuantized :
+ public DepthwiseCommon<typename strategy::input_type,
+ typename strategy::weight_type,
+ typename strategy::return_type>
+{
+ using TInput = typename strategy::input_type;
+ using TWeight = typename strategy::weight_type;
+ using TOutput = typename strategy::return_type;
+ using TAccum = typename strategy::bias_type;
+
+ arm_gemm::Requantize32 m_qp;
+
+ size_t sizeof_input_buffer(unsigned int n_channels) const
+ {
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
+ const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+ return sizeof(TInput) * rounded_channels;
+ }
+
+ size_t sizeof_output_buffer(unsigned int n_channels) const
+ {
+ const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
+ const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
+ return sizeof(TOutput) * rounded_channels;
+ }
+
+ size_t sizeof_bias_buffer(unsigned int n_channels) const
+ {
+ if (strategy_requires_unravelled_bias_and_quant_params<strategy>())
+ {
+ return (m_qp.bias == nullptr) ? sizeof(TAccum) * n_channels : 0;
+ }
+
+ return 0;
+ }
+
+ size_t sizeof_requant_mul_buffer(unsigned int n_channels) const
+ {
+ if (strategy_requires_unravelled_bias_and_quant_params<strategy>())
+ {
+ return m_qp.per_channel_requant ? 0 : sizeof(int32_t) * n_channels;
+ }
+
+ return 0;
+ }
+
+ size_t sizeof_requant_shift_buffer(unsigned int n_channels) const
+ {
+ if (strategy_requires_unravelled_bias_and_quant_params<strategy>())
+ {
+ return m_qp.per_channel_requant ? 0 : sizeof(int32_t) * n_channels;
+ }
+
+ return 0;
+ }
+
+ public:
+ DepthwiseDepthfirstQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp)
+ : DepthwiseCommon<TInput, TWeight, TOutput>(args), m_qp(qp)
+ {
+ }
+
+ DepthwiseDepthfirstQuantized(DepthwiseDepthfirstQuantized &) = delete;
+ DepthwiseDepthfirstQuantized &operator=(DepthwiseDepthfirstQuantized &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ return strategy::get_packed_size(this->m_args);
+ }
+
+ void pack_parameters(void *buffer, const void *const bias, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ if (strategy_requires_unravelled_bias_and_quant_params<strategy>())
+ {
+ m_qp.bias = static_cast<const int32_t *>(bias);
+ }
+
+ get_unified_packer<TWeight>(strategy::pack_parameters)(
+ this->m_args.input_channels,
+ buffer,
+ static_cast<const int32_t *>(bias),
+ reinterpret_cast<const TWeight *>(weights),
+ m_qp,
+ ld_weight_col,
+ ld_weight_row
+ );
+ }
+
+ size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+ {
+ const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
+ return n_threads * (
+ sizeof_output_buffer(n_output_channels) +
+ sizeof_input_buffer(n_channels) +
+ sizeof_bias_buffer(n_channels) +
+ sizeof_requant_mul_buffer(n_channels) +
+ sizeof_requant_shift_buffer(n_channels)
+ );
+ }
+
+ using DepthwiseCommon<typename strategy::input_type, typename strategy::weight_type, typename strategy::return_type>::execute;
+ void execute(
+ const unsigned int batches,
+ const unsigned int input_height,
+ const unsigned int input_width,
+ const unsigned int input_channels,
+ const PaddingValues &padding,
+ const void *const _input,
+ const size_t ld_input_col,
+ const size_t ld_input_row,
+ const size_t ld_input_batch,
+ const void *const parameters,
+ const unsigned int output_height,
+ const unsigned int output_width,
+ void *const _output,
+ const size_t ld_output_col,
+ const size_t ld_output_row,
+ const size_t ld_output_batch,
+ void *_working_space,
+ const unsigned int thread_id,
+ const unsigned int n_threads
+ ) const override
+ {
+ strategy strat(this->m_args.cpu_info);
+#ifdef CYCLE_PROFILING
+ arm_gemm::profiler prof;
+#endif
+ // Get a unified API for the kernel function
+ auto kernel = get_unified_kernel<TInput, TWeight, TOutput>(strat.kernel);
+
+ // Determine what portion of the work to do.
+ const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
+ const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
+ const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
+
+ // Cast input and output pointers into the right types
+ const TInput *const inptr = static_cast<const TInput *>(_input);
+ TOutput *const outptr = static_cast<TOutput *>(_output);
+
+ // Create an array for the input pointers
+ const TInput * _inptr_array[strategy::input_rows * strategy::input_cols];
+ const TInput **const inptr_array = _inptr_array;
+
+ // Create an array for the output pointers
+ TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
+ TOutput **const outptr_array = _outptr_array;
+
+ // Allocate portions of the working space
+ uint8_t *working_space = static_cast<uint8_t *>(_working_space) + get_working_size(thread_id, input_channels);
+
+ TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
+ working_space += sizeof_output_buffer(input_channels * this->m_args.channel_multiplier);
+
+ TInput *const input_buffer = reinterpret_cast<TInput *>(working_space);
+ working_space += sizeof_input_buffer(input_channels);
+
+ const int32_t *const bias_ptr = (m_qp.bias == nullptr) ? reinterpret_cast<int32_t *>(working_space)
+ : m_qp.bias;
+ working_space += sizeof_bias_buffer(input_channels * this->m_args.channel_multiplier);
+
+ const int32_t *const requant_mul_vec = !m_qp.per_channel_requant ? reinterpret_cast<int32_t *>(working_space)
+ : m_qp.per_channel_muls;
+ working_space += sizeof_requant_mul_buffer(input_channels * this->m_args.channel_multiplier);
+
+ const int32_t *const requant_shift_vec = !m_qp.per_channel_requant ? reinterpret_cast<int32_t *>(working_space)
+ : m_qp.per_channel_right_shifts;
+
+ if (strategy_requires_unravelled_bias_and_quant_params<strategy>())
+ {
+ // Initialise the bias buffer
+ if (m_qp.bias == nullptr)
+ {
+ for (unsigned int c = 0; c < input_channels * this->m_args.channel_multiplier; c++)
+ {
+ const_cast<int32_t *>(bias_ptr)[c] = 0;
+ }
+ }
+
+ // Initialise the requantisation parameters
+ if (!m_qp.per_channel_requant)
+ {
+ for (unsigned int c = 0; c < input_channels * this->m_args.channel_multiplier; c++)
+ {
+ const_cast<int32_t *>(requant_mul_vec)[c] = m_qp.per_layer_mul;
+ const_cast<int32_t *>(requant_shift_vec)[c] = m_qp.per_layer_right_shift;
+ }
+ }
+ }
+
+ // Initialise the input buffer
+ for (unsigned int c = 0; c < input_channels; c++)
+ {
+ input_buffer[c] = static_cast<TInput>(m_qp.a_offset);
+ }
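+
+    // Editorial note: the buffer holds the input zero point so that padded
+    // positions behave as zeros in the quantised domain; e.g. with a typical
+    // uint8 zero point of a_offset = 128, (x - a_offset) evaluates to 0 for
+    // every padded element.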
+
+ // For each output tile, construct the requisite set of pointers and call
+ // into the kernel.
+ for (unsigned int batch = 0; batch < batches; batch++)
+ {
+ // Get batch pointers
+ const auto inptr_batch = inptr + batch * ld_input_batch;
+ const auto outptr_batch = outptr + batch * ld_output_batch;
+
+ for (int start_out_i = start_out_height;
+ start_out_i < end_out_height;
+ start_out_i += static_cast<int>(strategy::output_rows))
+ {
+ const int end_out_i = start_out_i + strategy::output_rows;
+ const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
+ const int end_in_i = start_in_i + strategy::input_rows;
+
+ // Compute top/bottom padding
+ const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
+ const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
+ const unsigned int valid_output_rows = std::min(
+ end_out_i - start_out_i,
+ static_cast<int>(output_height) - start_out_i
+ );
+
+ // Fill the input pointer array with padding values
+ for (auto index = 0u; index < strategy::input_rows * strategy::input_cols; index++)
+ {
+ inptr_array[index] = input_buffer;
+ }
+
+ for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
+ {
+ const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left;
+ const int pad_left = -std::min(0, start_in_j);
+
+ const int end_out_j = start_out_j + strategy::output_cols;
+ const int end_in_j = start_in_j + strategy::input_cols;
+
+ const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
+ const unsigned int valid_output_cols = std::min(
+ end_out_j - start_out_j,
+ static_cast<int>(output_width) - start_out_j
+ );
+
+ // Construct the input pointer array - fill the array with pointers to
+ // the input buffer and then fill in the required values.
+ for (auto i = pad_top; i < strategy::input_rows - pad_bottom; i++)
+ {
+            // We can skip over the left padding because this tile has the
+            // same amount of left padding as the previous tile, or less, so
+            // any skipped entries already point at the padding buffer (from
+            // the initial fill above).
+ unsigned int j = pad_left;
+ const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
+ const TInput **ptrs = inptr_array + i * strategy::input_cols + j;
+ for (; j < strategy::input_cols - pad_right; j++)
+ {
+ *(ptrs++) = colptr;
+ colptr += ld_input_col;
+ }
+ for (; j < strategy::input_cols; j++)
+ {
+ *(ptrs++) = input_buffer;
+ }
+ }
+
+ // Construct the output pointer array.
+ TOutput **outptr_pos = outptr_array;
+ for (auto i = 0u; i < valid_output_rows; i++)
+ {
+ unsigned int j = 0u;
+ TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
+ for (; j < valid_output_cols; j++)
+ {
+ *(outptr_pos++) = colptr;
+ colptr += ld_output_col;
+ }
+ for (; j < strategy::output_cols; j++)
+ {
+ *(outptr_pos++) = output_buffer;
+ }
+ }
+ for (auto i = valid_output_rows; i < strategy::output_rows; i++)
+ {
+ for (auto j = 0u; j < strategy::output_cols; j++)
+ {
+ *(outptr_pos++) = output_buffer;
+ }
+ }
+
+ start_out_j += strategy::output_cols;
+
+#ifdef CYCLE_PROFILING
+ // TODO Work number
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.kernel_rows * this->m_args.kernel_cols));
+#endif
+ kernel(
+ this->m_args.input_channels,
+ inptr_array,
+ reinterpret_cast<const TWeight *>(parameters),
+ bias_ptr, m_qp, requant_mul_vec, requant_shift_vec,
+ outptr_array
+ );
+ }
+ }
+ }
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
new file mode 100644
index 000000000..fdb36fc1d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_depthfirst_generic_multiplier.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE)
+#include "kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif // defined(__ARM_FEATURE_SVE)
+#include "kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+ template <class Strategy>
+ unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+    // First pass: count the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+
+ unsigned int not_preferred_if_no_multiplier(const DepthwiseArgs &args, const Nothing &)
+ {
+ return args.channel_multiplier > 1 ? 0 : std::numeric_limits<unsigned int>::max();
+ }
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+}
+
+#if defined(__ARM_FP16_ARGS)
+
+static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = {
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ return new DepthwiseDepthfirst<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ return new DepthwiseDepthfirst<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ return new DepthwiseDepthfirst<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ return new DepthwiseDepthfirst<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ return new DepthwiseDepthfirst<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>(args);
+ },
+ },
+#endif // defined(__ARM_FEATURE_SVE)
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ return new DepthwiseDepthfirst<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ return new DepthwiseDepthfirst<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ return new DepthwiseDepthfirst<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ return new DepthwiseDepthfirst<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ return new DepthwiseDepthfirst<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_generic_output3x3_mla_depthfirst",
+ constraint(has_no_channel_multiplier),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ return new DepthwiseDepthfirstGeneric<a64_fp16_nhwc_generic_output9_mla_depthfirst, 3, 3>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ nullptr,
+ not_preferred_if_no_multiplier,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
+ return new DepthwiseDepthfirstGenericWithMultiplier<a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args);
+ },
+ },
+#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<__fp16> *depthwise_implementation_list()
+{
+ return depthwise_fp16_methods;
+}
+
+template UniqueDepthwiseCommon<__fp16> depthwise(const DepthwiseArgs &, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<__fp16>(const DepthwiseArgs &, const Nothing &);
+
+#endif // defined(__ARM_FP16_ARGS)
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
new file mode 100644
index 000000000..aea750a47
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
+#include "depthwise_depthfirst_generic_multiplier.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE)
+#include "kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
+#include "kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__ARM_FEATURE_SVE)
+#include "kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp"
+#include "kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+ template <class Strategy>
+ unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
+ {
+    // First pass: count the number of output pixels which will be computed.
+ return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
+ arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
+ arm_gemm::iceildiv(
+ (long unsigned) args.input_channels * args.channel_multiplier,
+ arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
+ );
+ }
+
+ unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
+ {
+ return std::numeric_limits<unsigned int>::max();
+ }
+
+ unsigned int not_preferred_if_no_multiplier(const DepthwiseArgs &args, const Nothing &)
+ {
+ return args.channel_multiplier > 1 ? 0 : std::numeric_limits<unsigned int>::max();
+ }
+}
+
+static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirst<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirst<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirst<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirst<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirst<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_generic_output3x3_mla_depthfirst",
+ constraint(has_no_channel_multiplier),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirstGeneric<sve_fp32_nhwc_generic_output9_mla_depthfirst, 3, 3>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
+ constraint(is_supported<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>),
+ not_preferred_if_no_multiplier,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirstWithMultiplier<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
+ constraint(is_supported<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>),
+ not_preferred_if_no_multiplier,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirstWithMultiplier<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ nullptr,
+ not_preferred_if_no_multiplier,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirstGenericWithMultiplier<sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args);
+ },
+ },
+#endif // defined(__ARM_FEATURE_SVE)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirst<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirst<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirst<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirst<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint(is_supported<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier),
+ cycle_estimate<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirst<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_generic_output3x3_mla_depthfirst",
+ constraint(has_no_channel_multiplier),
+ not_preferred,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirstGeneric<a64_fp32_nhwc_generic_output9_mla_depthfirst, 3, 3>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
+ constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>),
+ not_preferred_if_no_multiplier,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirstWithMultiplier<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
+ constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>),
+ not_preferred_if_no_multiplier,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirstWithMultiplier<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>(args);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ nullptr,
+ not_preferred_if_no_multiplier,
+ [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
+ return new DepthwiseDepthfirstGenericWithMultiplier<a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<float> *depthwise_implementation_list()
+{
+ return depthwise_fp32_methods;
+}
+
+template UniqueDepthwiseCommon<float> depthwise(const DepthwiseArgs &, const Nothing &);
+template std::vector<KernelDescription> get_compatible_kernels<float>(const DepthwiseArgs &, const Nothing &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
new file mode 100644
index 000000000..1d52b56d3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <vector>
+
+using arm_gemm::Nothing;
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+struct DepthwiseImplementation
+{
+ const DepthwiseMethod method;
+ const char *name;
+ std::function<bool(const DepthwiseArgs &, const OutputStage &)> is_supported;
+ std::function<uint64_t(const DepthwiseArgs &, const OutputStage &)> cycle_estimate;
+ std::function<DepthwiseCommon<TInput, TWeight, TOutput> *(const DepthwiseArgs &, const OutputStage &)> initialise;
+
+ bool get_is_supported(const DepthwiseArgs &args, const OutputStage &os) const
+ {
+ return (is_supported == nullptr) ? true : is_supported(args, os);
+ }
+
+ uint64_t get_cycle_estimate(const DepthwiseArgs &args, const OutputStage &os) const
+ {
+ return (cycle_estimate == nullptr) ? 0 : cycle_estimate(args, os);
+ }
+
+ DepthwiseCommon<TInput, TWeight, TOutput> *get_instance(const DepthwiseArgs &args, const OutputStage &os) const
+ {
+ return initialise(args, os);
+ }
+};
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *depthwise_implementation_list();
+
+template <typename TInput, typename TWeight = TInput, typename TOutput = TInput, class OutputStage = Nothing>
+bool find_implementation(
+ const DepthwiseArgs &args,
+ const OutputStage &os,
+ const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> * &selected
+)
+{
+ selected = nullptr;
+ uint64_t best_cycle_estimate = UINT64_MAX;
+
+ const auto *impl = depthwise_implementation_list<TInput, TWeight, TOutput, OutputStage>();
+ for (; impl->method != DepthwiseMethod::DEFAULT; impl++)
+ {
+ const bool has_cfg = (args.config != nullptr);
+ const auto &cfg = args.config;
+
+ if (
+ !impl->get_is_supported(args, os) || // Problem is unsupported
+ (has_cfg && cfg->method != DepthwiseMethod::DEFAULT && cfg->method != impl->method) ||
+ (has_cfg && cfg->filter != "" && !std::strstr(impl->name, cfg->filter.c_str()))
+ )
+ {
+ continue;
+ }
+
+ const auto cycle_estimate = impl->get_cycle_estimate(args, os);
+
+ if (cycle_estimate == 0)
+ {
+ selected = impl;
+ break;
+ }
+
+ if (selected == nullptr || cycle_estimate < best_cycle_estimate)
+ {
+ selected = impl;
+ best_cycle_estimate = cycle_estimate;
+ }
+ }
+
+ return (selected != nullptr);
+}
+
+template <typename TInput, typename TWeight, typename TOutput, class OutputStage>
+std::vector<KernelDescription> get_compatible_kernels(const DepthwiseArgs &args, const OutputStage &os)
+{
+ std::vector<KernelDescription> kerns;
+
+ // Find the default implementation so we can flag it accordingly
+ const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *default_impl;
+ find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, default_impl);
+
+ for (auto impl = depthwise_implementation_list<TInput, TWeight, TOutput, OutputStage>();
+ impl->method != DepthwiseMethod::DEFAULT; impl++)
+ {
+ if (!impl->get_is_supported(args, os))
+ {
+ continue;
+ }
+
+ kerns.emplace_back(
+ impl->method, impl->name, impl == default_impl,
+ impl->get_cycle_estimate(args, os)
+ );
+ }
+
+ return kerns;
+}
+
+template <typename TInput, typename TWeight, typename TOutput, class OutputStage>
+UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &args, const OutputStage &os)
+{
+ const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr;
+ const bool success = find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, impl);
+ return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr);
+}
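+
+// Illustrative use only -- argument construction is omitted and
+// UniqueDepthwiseCommon is assumed here to have unique_ptr-like semantics:
+//
+//   auto dwc = depthwise<float, float, float>(args, Nothing());
+//   // dwc is empty if no suitable kernel was found.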
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
new file mode 100644
index 000000000..b4814bef9
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Utilities for constructing functions which constrain which kernels are
+ * selected for a given depthwise problem.
+ *
+ * This header is expected to be included by the files which list the
+ * available kernels; to avoid multiple definitions across those translation
+ * units, everything here is wrapped in an anonymous namespace.
+ */
+
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "depthwise.hpp"
+
+namespace arm_conv
+{
+namespace depthwise
+{
+namespace
+{
+
+template <class OutputStage>
+using ConstraintFn = std::function<bool(const DepthwiseArgs &, const OutputStage &)>;
+
+using GenericConstraintFn = std::function<bool(const DepthwiseArgs &, const void *)>;
+
+GenericConstraintFn make_constraint(const GenericConstraintFn &f) __attribute__ ((unused));
+GenericConstraintFn make_constraint(const GenericConstraintFn &f)
+{
+ return f;
+}
+
+template <typename ... Fs>
+GenericConstraintFn make_constraint(const GenericConstraintFn &f, Fs ... fs)
+{
+ return [f, fs...] (const DepthwiseArgs &args, const void *os) -> bool {
+ return f(args, os) && make_constraint(fs...)(args, os);
+ };
+}
+
+template <typename OutputStage=Nothing, typename ... Fs>
+ConstraintFn<OutputStage> constraint(Fs ... fs)
+{
+ return [fs...] (const DepthwiseArgs &args, const OutputStage &os) -> bool {
+ return make_constraint(fs...)(args, &os);
+ };
+}
+
+// Some useful constraints
+template <class Strategy>
+bool is_supported(const DepthwiseArgs &args, const void *)
+{
+ return ((args.kernel_rows == Strategy::kernel_rows) &&
+ (args.kernel_cols == Strategy::kernel_cols) &&
+ (args.stride_rows == Strategy::stride_rows) &&
+ (args.stride_cols == Strategy::stride_cols));
+}
+
+bool cpu_has_dot_product(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool cpu_has_dot_product(const DepthwiseArgs &args, const void *)
+{
+ return args.cpu_info->has_dotprod();
+}
+
+bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *)
+{
+ return args.channel_multiplier == 1;
+}
+
+bool qp_has_no_left_shift(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+bool qp_has_no_left_shift(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return qp->per_channel_requant ?
+ (qp->per_channel_left_shifts == nullptr) :
+ (qp->per_layer_left_shift == 0);
+}
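+
+// Illustrative composition (the strategy name is hypothetical): a kernel
+// list entry combines several predicates, all of which must hold for the
+// kernel to be considered:
+//
+//   constraint<Requantize32>(is_supported<some_strategy>,
+//                            has_no_channel_multiplier,
+//                            qp_has_no_left_shift)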
+
+} // namespace
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
new file mode 100644
index 000000000..40370fe59
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst_quantized.hpp"
+#include "depthwise_depthfirst_generic_quantized.hpp"
+#include "depthwise_depthfirst_multiplier_quantized.hpp"
+#include "depthwise_depthfirst_generic_multiplier_quantized.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#include "kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#include "kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#include "kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+namespace
+{
+
+bool qp_weights_are_symmetric(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return qp->b_offset == 0;
+}
+
+} // namespace
+
+static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depthwise_s8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift,
+ qp_weights_are_symmetric),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstQuantized<sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstWithMultiplierQuantized<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstWithMultiplierQuantized<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+ },
+ },
+#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ has_no_channel_multiplier,
+ qp_weights_are_symmetric,
+ qp_has_no_left_shift,
+ cpu_has_dot_product),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstQuantized<a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift,
+ cpu_has_dot_product),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_nhwc_generic_output3x3_mla_depthfirst",
+ constraint<Requantize32>(has_no_channel_multiplier),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstGenericQuantized<a64_s8q_nhwc_generic_output9_mla_depthfirst, 3, 3>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_dot_product),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstWithMultiplierQuantized<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ qp_has_no_left_shift,
+ cpu_has_dot_product),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstWithMultiplierQuantized<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ nullptr,
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
+ return new DepthwiseDepthfirstGenericWithMultiplierQuantized<a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args, qp);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> *depthwise_implementation_list()
+{
+ return depthwise_s8q_methods;
+}
+
+template UniqueDepthwiseCommon<int8_t, int8_t, int8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<int8_t, int8_t, int8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
new file mode 100644
index 000000000..3e190d242
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst_quantized.hpp"
+#include "depthwise_depthfirst_generic_quantized.hpp"
+#include "depthwise_depthfirst_multiplier_quantized.hpp"
+#include "depthwise_depthfirst_generic_multiplier_quantized.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
+#include "kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstWithMultiplierQuantized<sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstWithMultiplierQuantized<sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+ },
+ },
+#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
+ cpu_has_dot_product,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_nhwc_generic_output3x3_mla_depthfirst",
+ constraint<Requantize32>(has_no_channel_multiplier),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstGenericQuantized<a64_u8q_nhwc_generic_output9_mla_depthfirst, 3, 3>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
+ cpu_has_dot_product,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstWithMultiplierQuantized<a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
+ cpu_has_dot_product,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstWithMultiplierQuantized<a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ nullptr,
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ return new DepthwiseDepthfirstGenericWithMultiplierQuantized<a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args, qp);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> *depthwise_implementation_list()
+{
+ return depthwise_u8q_methods;
+}
+
+template UniqueDepthwiseCommon<uint8_t, uint8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, uint8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
new file mode 100644
index 000000000..537a7c5e0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm_local.hpp"
+
+#include "depthwise_implementation.hpp"
+#include "depthwise_depthfirst_quantized.hpp"
+#include "depthwise_depthfirst_generic_quantized.hpp"
+#include "depthwise_depthfirst_multiplier_quantized.hpp"
+#include "depthwise_depthfirst_generic_multiplier_quantized.hpp"
+
+#include "depthwise_implementation_constraints.hpp"
+
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#include "kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+#include "kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp"
+#include "kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(__aarch64__)
+
+#include <cstdint>
+
+using arm_gemm::Requantize32;
+
+namespace arm_conv {
+namespace depthwise {
+
+static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> depthwise_u8s8u8q_methods[] = {
+#if defined(__aarch64__)
+#if defined(__ARM_FEATURE_SVE) && defined(SVE2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+#endif // defined(__ARM_FEATURE_SVE) && defined(SVE2)
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ return new DepthwiseDepthfirstQuantized<a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_nhwc_generic_output3x3_mla_depthfirst",
+ constraint<Requantize32>(has_no_channel_multiplier),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ return new DepthwiseDepthfirstGenericQuantized<a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst, 3, 3>(args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ nullptr,
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
+ return new DepthwiseDepthfirstGenericWithMultiplierQuantized<a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args, qp);
+ },
+ },
+#endif // defined(__aarch64__)
+ { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
+};
+
+template <>
+const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> *depthwise_implementation_list()
+{
+  return depthwise_u8s8u8q_methods;
+}
+
+template UniqueDepthwiseCommon<uint8_t, int8_t, uint8_t> depthwise(const DepthwiseArgs &, const Requantize32 &);
+template std::vector<KernelDescription> get_compatible_kernels<uint8_t, int8_t, uint8_t, Requantize32>(const DepthwiseArgs &, const Requantize32 &);
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp
new file mode 100644
index 000000000..6c5ef2368
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include <cstdint>
+#include <cstring>
+
+using namespace arm_gemm;
+
+size_t generic_get_packed_size(
+ const VLType vec_type,
+ const unsigned int acc_depth,
+ const unsigned int kernel_rows,
+ const unsigned int kernel_cols,
+ const unsigned int n_input_channels
+)
+{
+ const auto per_iter = acc_depth * arm_gemm::utils::get_vector_length<int32_t>(vec_type);
+ return arm_gemm::roundup((long unsigned int) n_input_channels, per_iter) * kernel_rows * kernel_cols * sizeof(int8_t);
+}
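+
+// For example, on a 128-bit NEON target (vec_type == VLType::None, so a
+// vector holds four int32s) with acc_depth == 2, per_iter == 8 channels; a
+// 3x3 kernel over 20 channels therefore packs roundup(20, 8) * 3 * 3 == 216
+// bytes.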
+
+void generic_pack(
+ const VLType vec_type,
+ const unsigned int acc_depth,
+ const unsigned int kernel_rows,
+ const unsigned int kernel_cols,
+ const unsigned int n_channels,
+ void *_outptr,
+ const void *_weights,
+ size_t ld_weight_col,
+ size_t ld_weight_row
+)
+{
+ int8_t *outptr = reinterpret_cast<int8_t *>(_outptr);
+ const int8_t *weights = reinterpret_cast<const int8_t *>(_weights);
+
+ // Get the strides
+ ld_weight_col = (ld_weight_col == 0) ? n_channels * sizeof(int8_t) : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? kernel_cols * ld_weight_col : ld_weight_row;
+
+ // Pack into per-iter chunks.
+ const auto per_iter = acc_depth * arm_gemm::utils::get_vector_length<int32_t>(vec_type);
+ for (unsigned int c = 0; c < n_channels; c += per_iter)
+ {
+ auto weight_row = weights + c;
+ const auto to_copy = std::min<unsigned int>(per_iter, n_channels - c);
+
+ for (unsigned int i = 0; i < kernel_rows; i++)
+ {
+ auto weight_col = weight_row;
+
+ for (unsigned int j = 0; j < kernel_cols; j++)
+ {
+ memcpy(outptr, weight_col, to_copy);
+ outptr += per_iter;
+ weight_col += ld_weight_col;
+ }
+
+ weight_row += ld_weight_row;
+ }
+ }
+}
+
+
+#define ADD_IMPLEMENTATION(ARCH, TYPENAME, TYPE, VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS) \
+struct interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla \
+{ \
+ static size_t get_packed_size(const DepthwiseArgs &args); \
+ static void pack_parameters( \
+ unsigned int n_channels, void *outptr, \
+ const TYPE *weights, size_t ld_weight_col, size_t ld_weight_row \
+ ); \
+}; \
+\
+size_t interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla::get_packed_size(const DepthwiseArgs &args) \
+{ \
+ return generic_get_packed_size(VLType::VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS, args.input_channels); \
+} \
+\
+void interleave_ ## ARCH ## _ ## TYPENAME ## _ ## KERN_ROWS ## x ## KERN_COLS ## _mla::pack_parameters(unsigned int n_channels, void *outptr, \
+ const TYPE *weights, size_t ld_weight_col, size_t ld_weight_row) \
+{ \
+ generic_pack(VLType::VEC_TYPE, ACC_DEPTH, KERN_ROWS, KERN_COLS, n_channels, outptr, weights, ld_weight_col, ld_weight_row); \
+}
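+
+// For example, ADD_IMPLEMENTATION(a64, s8q, int8_t, None, 2, 3, 3) expands to
+// a struct named interleave_a64_s8q_3x3_mla whose two static methods forward
+// to generic_get_packed_size and generic_pack above.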
+
+
+namespace arm_conv {
+namespace depthwise {
+
+#if defined(__ARM_FEATURE_SVE)
+
+ADD_IMPLEMENTATION(sve, s8q, int8_t, SVE, 2, 3, 3)
+ADD_IMPLEMENTATION(sve, s8q, int8_t, SVE, 2, 5, 5)
+ADD_IMPLEMENTATION(sve, u8q, uint8_t, SVE, 2, 3, 3)
+ADD_IMPLEMENTATION(sve, u8q, uint8_t, SVE, 2, 5, 5)
+
+#endif // defined(__ARM_FEATURE_SVE)
+
+ADD_IMPLEMENTATION(a64, s8q, int8_t, None, 2, 3, 3)
+ADD_IMPLEMENTATION(a64, s8q, int8_t, None, 2, 5, 5)
+ADD_IMPLEMENTATION(a64, u8q, uint8_t, None, 2, 3, 3)
+ADD_IMPLEMENTATION(a64, u8q, uint8_t, None, 2, 5, 5)
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
new file mode 100644
index 000000000..3d3447bf3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_a64_s8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_a64_s8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+  // We store seven 16-byte vectors for every four channels (one vector of
+  // int32s), with the number of channel groups rounded up to a multiple of 4.
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels,
+ get_vector_length<int32_t>(arm_gemm::VLType::None)), 4lu
+ );
+ return n * 7 * get_vector_length<int8_t>(arm_gemm::VLType::None);
+}
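+
+// For example, with 10 input channels on a 128-bit NEON target (a vector
+// holds four int32s): iceildiv(10, 4) == 3, rounded up to a multiple of 4
+// gives n == 4, so the packed buffer occupies 4 * 7 * 16 == 448 bytes.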
+
+void interleave_a64_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ "movi v0.16b, #0x0\n"
+ "cmp %x[ld_weight_col], XZR\n"
+ "movi v31.16b, #0x1\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "movi v16.4s, #0x9\n"
+ "mov x19, #0x3\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "mul x19, %x[ld_weight_col], x19\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x19, NE\n"
+ "add x24, %x[weights], %x[ld_weight_row]\n"
+ "add x23, x24, %x[ld_weight_row]\n"
+ "add x22, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "lsr x20, %x[n_channels], #0x2\n"
+ "mov x21, #0x0\n"
+ "add x19, %x[qp], %[offsetof_input_offset]\n"
+ "ld1r { v30.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_weights_offset]\n"
+ "ld1r { v29.4s }, [x19]\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
+ "add x19, %x[qp], %[offsetof_per_layer_mul]\n"
+ "ld1r { v28.4s }, [x19]\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
+ "add x19, %x[qp], %[offsetof_per_layer_right_shift]\n"
+ "ld1r { v27.4s }, [x19]\n"
+ "cbz x20, 4f\n"
+ "1:" // Loop
+ "movi v26.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q26, [%x[bias], x21]\n"
+ "2:" // Loop: Skip bias load
+ "movi v25.4s, #0x0\n"
+ "ldr s24, [%x[weights], #0x0]\n"
+ "ldr s23, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 v23.16b, v23.16b, v0.16b\n"
+ "ldr s21, [%x[weights], x22]\n"
+ "add %x[weights], %x[weights], #0x4\n"
+ "zip1 v21.16b, v24.16b, v21.16b\n"
+ "ldr s22, [x24, #0x0]\n"
+ "ldr s20, [x24, %x[ld_weight_col]]\n"
+ "zip1 v21.16b, v21.16b, v23.16b\n"
+ "ldr s18, [x24, x22]\n"
+ ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n"
+ "add x24, x24, #0x4\n"
+ "zip1 v20.16b, v20.16b, v0.16b\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s17, [x23, %x[ld_weight_col]]\n"
+ "zip1 v18.16b, v22.16b, v18.16b\n"
+ "ldr s16, [x23, x22]\n"
+ "zip1 v18.16b, v18.16b, v20.16b\n"
+ "add x23, x23, #0x4\n"
+ ".inst 0x4e9297f9 // sdot v25.4s, v31.16b, v18.16b\n"
+ "zip1 v17.16b, v17.16b, v0.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ ".inst 0x4e9097f9 // sdot v25.4s, v31.16b, v16.16b\n"
+ "mls v26.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "str q26, [%x[outptr], #0x0]\n"
+ "str q21, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ldr q28, [%x[rq_mul_perchannel], x21]\n"
+ "ldr q27, [%x[rq_shift_perchannel], x21]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "str q28, [%x[outptr], #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "str q27, [%x[outptr], #0x10]\n"
+ "subs x20, x20, #0x1\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "bgt 1b\n"
+ "tst %x[n_channels], #0x3\n"
+ "beq 13f\n"
+ "4:" // Oddments
+ "movi v26.4s, #0x0\n"
+ "cbz %x[bias], 7f\n"
+ "add %x[bias], %x[bias], x21\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v26.d }[0], [%x[bias]], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v26.s }[2], [%x[bias]], #0x4\n"
+ "b 6f\n"
+ "5:" // Oddments: Load bias: Bit 1: Unset
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v26.s }[0], [%x[bias]], #0x4\n"
+ "6:" // Oddments: Load bias: Bit 1: End
+
+ "7:" // Oddments: Skip bias load
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v24.h }[0], [%x[weights]]\n"
+ "ld1 { v22.h }[0], [x24]\n"
+ "add x20, %x[weights], %x[ld_weight_col]\n"
+ "ld1 { v19.h }[0], [x23]\n"
+ "add x19, %x[weights], x22\n"
+ "ld1 { v23.h }[0], [x20]\n"
+ "add %x[weights], %x[weights], #0x2\n"
+ "ld1 { v21.h }[0], [x19]\n"
+ "add x20, x24, %x[ld_weight_col]\n"
+ "add x19, x24, x22\n"
+ "ld1 { v20.h }[0], [x20]\n"
+ "ld1 { v18.h }[0], [x19]\n"
+ "add x24, x24, #0x2\n"
+ "add x19, x23, %x[ld_weight_col]\n"
+ "ld1 { v17.h }[0], [x19]\n"
+ "add x19, x23, x22\n"
+ "ld1 { v16.h }[0], [x19]\n"
+ "add x23, x23, #0x2\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v24.b }[2], [%x[weights]]\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "add x20, %x[weights], %x[ld_weight_col]\n"
+ "ld1 { v19.b }[2], [x23]\n"
+ "add x19, %x[weights], x22\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[2], [x19]\n"
+ "add x20, x24, %x[ld_weight_col]\n"
+ "add x19, x24, x22\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "ld1 { v18.b }[2], [x19]\n"
+ "add x20, x23, %x[ld_weight_col]\n"
+ "add x19, x23, x22\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x19]\n"
+ "b 9f\n"
+ "8:" // Oddments: Load weights: Bit 1: Unset
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v24.b }[0], [%x[weights]]\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "add x20, %x[weights], %x[ld_weight_col]\n"
+ "ld1 { v19.b }[0], [x23]\n"
+ "add x19, %x[weights], x22\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[0], [x19]\n"
+ "add x20, x24, %x[ld_weight_col]\n"
+ "add x19, x24, x22\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "ld1 { v18.b }[0], [x19]\n"
+ "add x20, x23, %x[ld_weight_col]\n"
+ "add x19, x23, x22\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x19]\n"
+ "9:" // Oddments: Load weights: Bit 1: End
+ "zip1 v21.16b, v24.16b, v21.16b\n"
+ "zip1 v23.16b, v23.16b, v0.16b\n"
+ "zip1 v18.16b, v22.16b, v18.16b\n"
+ "zip1 v20.16b, v20.16b, v0.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v17.16b, v17.16b, v0.16b\n"
+ "zip1 v21.16b, v21.16b, v23.16b\n"
+ "zip1 v18.16b, v18.16b, v20.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x4e9597f9 // sdot v25.4s, v31.16b, v21.16b\n"
+ ".inst 0x4e9297f9 // sdot v25.4s, v31.16b, v18.16b\n"
+ ".inst 0x4e9097f9 // sdot v25.4s, v31.16b, v16.16b\n"
+ "mls v26.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "str q26, [%x[outptr], #0x0]\n"
+ "str q21, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 12f\n"
+ "add x20, %x[rq_mul_perchannel], x21\n"
+ "add x19, %x[rq_shift_perchannel], x21\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v28.d }[0], [x20], #0x8\n"
+ "ld1 { v27.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v28.s }[2], [x20], #0x4\n"
+ "ld1 { v27.s }[2], [x19], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
+
+ "12:" // Oddments: Quantisation parameters: Store
+ "str q28, [%x[outptr], #0x0]\n"
+ "str q27, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "13:" // End
+
+ : [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
new file mode 100644
index 000000000..a725dcab5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__aarch64__)
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_a64_u8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_a64_u8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+  // We store seven 16-byte vectors for every four channels (one vector of
+  // int32s), with the number of channel groups rounded up to a multiple of 4.
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels,
+ get_vector_length<int32_t>(arm_gemm::VLType::None)), 4lu
+ );
+ return n * 7 * get_vector_length<uint8_t>(arm_gemm::VLType::None);
+}
+
+void interleave_a64_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ "movi v0.16b, #0x0\n"
+ "cmp %x[ld_weight_col], XZR\n"
+ "movi v31.16b, #0x1\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "movi v16.4s, #0x9\n"
+ "mov x19, #0x3\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "mul x19, %x[ld_weight_col], x19\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x19, NE\n"
+ "add x24, %x[weights], %x[ld_weight_row]\n"
+ "add x23, x24, %x[ld_weight_row]\n"
+ "add x22, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "lsr x20, %x[n_channels], #0x2\n"
+ "mov x21, #0x0\n"
+ "add x19, %x[qp], %[offsetof_input_offset]\n"
+ "ld1r { v30.4s }, [x19]\n"
+ "add x19, %x[qp], %[offsetof_weights_offset]\n"
+ "ld1r { v29.4s }, [x19]\n"
+ "mul v29.4s, v29.4s, v30.4s\n"
+ "add x19, %x[qp], %[offsetof_per_layer_mul]\n"
+ "ld1r { v28.4s }, [x19]\n"
+ "mul v29.4s, v29.4s, v16.4s\n"
+ "add x19, %x[qp], %[offsetof_per_layer_right_shift]\n"
+ "ld1r { v27.4s }, [x19]\n"
+ "cbz x20, 4f\n"
+ "1:" // Loop
+ "movi v26.4s, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q26, [%x[bias], x21]\n"
+ "2:" // Loop: Skip bias load
+ "movi v25.4s, #0x0\n"
+ "ldr s24, [%x[weights], #0x0]\n"
+ "ldr s23, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 v23.16b, v23.16b, v0.16b\n"
+ "ldr s21, [%x[weights], x22]\n"
+ "add %x[weights], %x[weights], #0x4\n"
+ "zip1 v21.16b, v24.16b, v21.16b\n"
+ "ldr s22, [x24, #0x0]\n"
+ "ldr s20, [x24, %x[ld_weight_col]]\n"
+ "zip1 v21.16b, v21.16b, v23.16b\n"
+ "ldr s18, [x24, x22]\n"
+ ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n"
+ "add x24, x24, #0x4\n"
+ "zip1 v20.16b, v20.16b, v0.16b\n"
+ "ldr s19, [x23, #0x0]\n"
+ "ldr s17, [x23, %x[ld_weight_col]]\n"
+ "zip1 v18.16b, v22.16b, v18.16b\n"
+ "ldr s16, [x23, x22]\n"
+ "zip1 v18.16b, v18.16b, v20.16b\n"
+ "add x23, x23, #0x4\n"
+ ".inst 0x6e9297f9 // udot v25.4s, v31.16b, v18.16b\n"
+ "zip1 v17.16b, v17.16b, v0.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ ".inst 0x6e9097f9 // udot v25.4s, v31.16b, v16.16b\n"
+ "mls v26.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "str q26, [%x[outptr], #0x0]\n"
+ "str q21, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ldr q28, [%x[rq_mul_perchannel], x21]\n"
+ "ldr q27, [%x[rq_shift_perchannel], x21]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "str q28, [%x[outptr], #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "str q27, [%x[outptr], #0x10]\n"
+ "subs x20, x20, #0x1\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "bgt 1b\n"
+ "tst %x[n_channels], #0x3\n"
+ "beq 13f\n"
+ "4:" // Oddments
+ "movi v26.4s, #0x0\n"
+ "cbz %x[bias], 7f\n"
+ "add %x[bias], %x[bias], x21\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ld1 { v26.d }[0], [%x[bias]], #0x8\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v26.s }[2], [%x[bias]], #0x4\n"
+ "b 6f\n"
+ "5:" // Oddments: Load bias: Bit 1: Unset
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v26.s }[0], [%x[bias]], #0x4\n"
+ "6:" // Oddments: Load bias: Bit 1: End
+
+ "7:" // Oddments: Skip bias load
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v24.h }[0], [%x[weights]]\n"
+ "ld1 { v22.h }[0], [x24]\n"
+ "add x20, %x[weights], %x[ld_weight_col]\n"
+ "ld1 { v19.h }[0], [x23]\n"
+ "add x19, %x[weights], x22\n"
+ "ld1 { v23.h }[0], [x20]\n"
+ "add %x[weights], %x[weights], #0x2\n"
+ "ld1 { v21.h }[0], [x19]\n"
+ "add x20, x24, %x[ld_weight_col]\n"
+ "add x19, x24, x22\n"
+ "ld1 { v20.h }[0], [x20]\n"
+ "ld1 { v18.h }[0], [x19]\n"
+ "add x24, x24, #0x2\n"
+ "add x19, x23, %x[ld_weight_col]\n"
+ "ld1 { v17.h }[0], [x19]\n"
+ "add x19, x23, x22\n"
+ "ld1 { v16.h }[0], [x19]\n"
+ "add x23, x23, #0x2\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v24.b }[2], [%x[weights]]\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "add x20, %x[weights], %x[ld_weight_col]\n"
+ "ld1 { v19.b }[2], [x23]\n"
+ "add x19, %x[weights], x22\n"
+ "ld1 { v23.b }[2], [x20]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[2], [x19]\n"
+ "add x20, x24, %x[ld_weight_col]\n"
+ "add x19, x24, x22\n"
+ "ld1 { v20.b }[2], [x20]\n"
+ "ld1 { v18.b }[2], [x19]\n"
+ "add x20, x23, %x[ld_weight_col]\n"
+ "add x19, x23, x22\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x19]\n"
+ "b 9f\n"
+ "8:" // Oddments: Load weights: Bit 1: Unset
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v24.b }[0], [%x[weights]]\n"
+ "ld1 { v22.b }[0], [x24]\n"
+ "add x20, %x[weights], %x[ld_weight_col]\n"
+ "ld1 { v19.b }[0], [x23]\n"
+ "add x19, %x[weights], x22\n"
+ "ld1 { v23.b }[0], [x20]\n"
+ "add %x[weights], %x[weights], #0x1\n"
+ "ld1 { v21.b }[0], [x19]\n"
+ "add x20, x24, %x[ld_weight_col]\n"
+ "add x19, x24, x22\n"
+ "ld1 { v20.b }[0], [x20]\n"
+ "ld1 { v18.b }[0], [x19]\n"
+ "add x20, x23, %x[ld_weight_col]\n"
+ "add x19, x23, x22\n"
+ "ld1 { v17.b }[0], [x20]\n"
+ "ld1 { v16.b }[0], [x19]\n"
+ "9:" // Oddments: Load weights: Bit 1: End
+ "zip1 v21.16b, v24.16b, v21.16b\n"
+ "zip1 v23.16b, v23.16b, v0.16b\n"
+ "zip1 v18.16b, v22.16b, v18.16b\n"
+ "zip1 v20.16b, v20.16b, v0.16b\n"
+ "zip1 v16.16b, v19.16b, v16.16b\n"
+ "zip1 v17.16b, v17.16b, v0.16b\n"
+ "zip1 v21.16b, v21.16b, v23.16b\n"
+ "zip1 v18.16b, v18.16b, v20.16b\n"
+ "zip1 v16.16b, v16.16b, v17.16b\n"
+ "movi v25.4s, #0x0\n"
+ ".inst 0x6e9597f9 // udot v25.4s, v31.16b, v21.16b\n"
+ ".inst 0x6e9297f9 // udot v25.4s, v31.16b, v18.16b\n"
+ ".inst 0x6e9097f9 // udot v25.4s, v31.16b, v16.16b\n"
+ "mls v26.4s, v25.4s, v30.4s\n"
+ "add v26.4s, v26.4s, v29.4s\n"
+ "str q26, [%x[outptr], #0x0]\n"
+ "str q21, [%x[outptr], #0x10]\n"
+ "str q18, [%x[outptr], #0x20]\n"
+ "str q16, [%x[outptr], #0x30]\n"
+ "add %x[outptr], %x[outptr], #0x40\n"
+ "cbz %x[rq_mul_perchannel], 12f\n"
+ "add x20, %x[rq_mul_perchannel], x21\n"
+ "add x19, %x[rq_shift_perchannel], x21\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v28.d }[0], [x20], #0x8\n"
+ "ld1 { v27.d }[0], [x19], #0x8\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v28.s }[2], [x20], #0x4\n"
+ "ld1 { v27.s }[2], [x19], #0x4\n"
+ "b 11f\n"
+ "10:" // Oddments: Quantisation parameters: Load quant params: Bit 1: Unset
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "11:" // Oddments: Quantisation parameters: Load quant params: Bit 1: End
+
+ "12:" // Oddments: Quantisation parameters: Store
+ "str q28, [%x[outptr], #0x0]\n"
+ "str q27, [%x[outptr], #0x10]\n"
+ "add %x[outptr], %x[outptr], #0x20\n"
+ "13:" // End
+
+ : [bias] "+&r" (bias), [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24"
+ );
+}
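+
+// Minimal usage sketch (illustrative only): callers size the buffer with
+// get_packed_size() and pack bias, weights and quantisation parameters in
+// one pass. `args`, `bias`, `weights` and `qp` are assumed to be prepared
+// elsewhere; zero strides select the dense defaults (cf. the csel sequences
+// in these interleave routines).
+//
+//   std::vector<uint8_t> packed(interleave_a64_u8q_3x3_dot::get_packed_size(args));
+//   interleave_a64_u8q_3x3_dot::pack_parameters(
+//       args.input_channels, packed.data(), bias, weights, qp,
+//       0 /* ld_weight_col */, 0 /* ld_weight_row */);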
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
new file mode 100644
index 000000000..41f0495ac
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+namespace arm_conv {
+namespace depthwise {
+
+#if defined(__ARM_FEATURE_SVE)
+
+class interleave_sve_u8q_3x3_dot
+{
+ public:
+ static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_sve_s8q_3x3_dot
+{
+ public:
+ static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_sve_u8q_3x3_mla
+{
+ public:
+ static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_sve_s8q_3x3_mla
+{
+ public:
+ static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_sve_u8q_5x5_mla
+{
+ public:
+ static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_sve_s8q_5x5_mla
+{
+ public:
+ static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+#endif // defined(__ARM_FEATURE_SVE)
+
+class interleave_a64_u8q_3x3_dot
+{
+ public:
+ static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_a64_s8q_3x3_dot
+{
+ public:
+ static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_a64_u8q_3x3_mla
+{
+ public:
+ static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_a64_s8q_3x3_mla
+{
+ public:
+ static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_a64_u8q_5x5_mla
+{
+ public:
+ static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+class interleave_a64_s8q_5x5_mla
+{
+ public:
+ static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
new file mode 100644
index 000000000..ea0c35b7c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_s8q_3x3_dot.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_sve_s8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_sve_s8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+ // We store 7 vectors for every <vector_of_ints> of channels.
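+ // For example, a 256-bit SVE implementation has 8 int32 (32 int8) lanes
+ // per vector, so 64 input channels give iceildiv(64, 8) = 8 groups,
+ // roundup(8, 4) = 8, and a packed size of 8 * 7 * 32 = 1792 bytes
+ // (illustrative arithmetic only).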
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels,
+ get_vector_length<int32_t>(arm_gemm::VLType::SVE)), 4lu
+ );
+ return n * 7 * get_vector_length<int8_t>(arm_gemm::VLType::SVE);
+}
+
+void interleave_sve_s8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const int8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ "mov z30.b, #0x0\n"
+ "ptrue p2.b\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
+ "mov z28.b, #0x1\n"
+ "cmp %x[ld_weight_col], XZR\n"
+ "mov z16.s, #0x9\n"
+ "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "mul z27.s, p2/M, z27.s, z29.s\n"
+ "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+ "mov x19, #0x3\n"
+ "mul z27.s, p2/M, z27.s, z16.s\n"
+ "ld1rw { z25.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+ "mul x19, %x[ld_weight_col], x19\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x19, NE\n"
+ "add x22, %x[weights], %x[ld_weight_row]\n"
+ "add x21, x22, %x[ld_weight_row]\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "mov x20, #0x0\n"
+ "pfalse p8.b\n"
+ "cbz %x[bias], 1f\n"
+ "ptrue p8.s\n"
+ "1:" // No bias
+
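+ // p8 remains all-false when no bias pointer was supplied, so the
+ // predicated-zeroing ld1w of the bias in the loop yields a zero vector
+ // rather than touching memory.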
+ "2:" // Loop
+ "mov z24.s, #0x0\n"
+ "cntp x19, p2, p1.s\n"
+ "and p0.b, p2/Z, p8.b, p1.b\n"
+ "ld1w { z23.s }, p0/Z, [%x[bias], x20, LSL #2]\n"
+ "whilelt p0.b, XZR, x19\n"
+ "ld1b { z17.b }, p0/Z, [%x[weights]]\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 z18.b, z16.b, z30.b\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], x23]\n"
+ "add %x[weights], %x[weights], x19\n"
+ "zip1 z16.b, z17.b, z16.b\n"
+ "ld1b { z22.b }, p0/Z, [x22]\n"
+ "ld1b { z17.b }, p0/Z, [x22, %x[ld_weight_col]]\n"
+ "zip1 z21.b, z16.b, z18.b\n"
+ "ld1b { z16.b }, p0/Z, [x22, x23]\n"
+ "sdot z24.s, z28.b, z21.b\n"
+ "add x22, x22, x19\n"
+ "zip1 z18.b, z17.b, z30.b\n"
+ "ld1b { z20.b }, p0/Z, [x21]\n"
+ "ld1b { z19.b }, p0/Z, [x21, %x[ld_weight_col]]\n"
+ "zip1 z17.b, z22.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x23]\n"
+ "zip1 z18.b, z17.b, z18.b\n"
+ "add x21, x21, x19\n"
+ "zip1 z17.b, z19.b, z30.b\n"
+ "sdot z24.s, z28.b, z18.b\n"
+ "zip1 z16.b, z20.b, z16.b\n"
+ "zip1 z16.b, z16.b, z17.b\n"
+ "sdot z24.s, z28.b, z16.b\n"
+ "mls z23.s, p2/M, z24.s, z29.s\n"
+ "add z23.s, z23.s, z27.s\n"
+ "st1w { z23.s }, p2, [%x[outptr]]\n"
+ "st1b { z21.b }, p2, [%x[outptr], #1, MUL VL]\n"
+ "st1b { z18.b }, p2, [%x[outptr], #2, MUL VL]\n"
+ "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #4\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ld1w { z26.s }, p1/Z, [%x[rq_mul_perchannel], x20, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [%x[rq_shift_perchannel], x20, LSL #2]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "st1w { z26.s }, p2, [%x[outptr]]\n"
+ "incw x20\n"
+ "st1w { z25.s }, p2, [%x[outptr], #1, MUL VL]\n"
+ "whilelt p1.s, x20, %x[n_channels]\n"
+ "addvl %x[outptr], %x[outptr], #2\n"
+ "b.any 2b\n"
+ : [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "p0", "p1", "p2", "p8", "x19", "x20", "x21", "x22", "x23", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
new file mode 100644
index 000000000..edd32a43f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/sve_u8q_3x3_dot.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if defined(__ARM_FEATURE_SVE)
+
+#include "arm_gemm.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+
+struct interleave_sve_u8q_3x3_dot
+{
+ static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row);
+};
+
+size_t interleave_sve_u8q_3x3_dot::get_packed_size(const DepthwiseArgs &args)
+{
+ // We store 7 vectors for every <vector_of_ints> of channels.
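+ // (Identical layout to the s8q variant above; only the weight signedness,
+ // and hence udot rather than sdot, differs.)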
+ const unsigned int n = arm_gemm::roundup(
+ arm_gemm::iceildiv((long unsigned int) args.input_channels,
+ get_vector_length<int32_t>(arm_gemm::VLType::SVE)), 4lu
+ );
+ return n * 7 * get_vector_length<uint8_t>(arm_gemm::VLType::SVE);
+}
+
+void interleave_sve_u8q_3x3_dot::pack_parameters(unsigned int n_channels, void *outptr, const int32_t *bias, const uint8_t *weights, const arm_gemm::Requantize32 &qp, size_t ld_weight_col, size_t ld_weight_row)
+{
+ __asm__ __volatile__(
+ "mov z30.b, #0x0\n"
+ "ptrue p2.b\n"
+ "ld1rw { z29.s }, p2/Z, [%x[qp], %[offsetof_input_offset]]\n"
+ "mov z28.b, #0x1\n"
+ "cmp %x[ld_weight_col], XZR\n"
+ "mov z16.s, #0x9\n"
+ "ld1rw { z27.s }, p2/Z, [%x[qp], %[offsetof_weights_offset]]\n"
+ "csel %x[ld_weight_col], %x[ld_weight_col], %x[n_channels], NE\n"
+ "mul z27.s, p2/M, z27.s, z29.s\n"
+ "ld1rw { z26.s }, p2/Z, [%x[qp], %[offsetof_per_layer_mul]]\n"
+ "mov x19, #0x3\n"
+ "mul z27.s, p2/M, z27.s, z16.s\n"
+ "ld1rw { z25.s }, p2/Z, [%x[qp], %[offsetof_per_layer_right_shift]]\n"
+ "mul x19, %x[ld_weight_col], x19\n"
+ "cmp %x[ld_weight_row], XZR\n"
+ "add x23, %x[ld_weight_col], %x[ld_weight_col]\n"
+ "csel %x[ld_weight_row], %x[ld_weight_row], x19, NE\n"
+ "add x22, %x[weights], %x[ld_weight_row]\n"
+ "add x21, x22, %x[ld_weight_row]\n"
+ "whilelt p1.s, XZR, %x[n_channels]\n"
+ "mov x20, #0x0\n"
+ "pfalse p8.b\n"
+ "cbz %x[bias], 1f\n"
+ "ptrue p8.s\n"
+ "1:" // No bias
+
+ "2:" // Loop
+ "mov z24.s, #0x0\n"
+ "cntp x19, p2, p1.s\n"
+ "and p0.b, p2/Z, p8.b, p1.b\n"
+ "ld1w { z23.s }, p0/Z, [%x[bias], x20, LSL #2]\n"
+ "whilelt p0.b, XZR, x19\n"
+ "ld1b { z17.b }, p0/Z, [%x[weights]]\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], %x[ld_weight_col]]\n"
+ "zip1 z18.b, z16.b, z30.b\n"
+ "ld1b { z16.b }, p0/Z, [%x[weights], x23]\n"
+ "add %x[weights], %x[weights], x19\n"
+ "zip1 z16.b, z17.b, z16.b\n"
+ "ld1b { z22.b }, p0/Z, [x22]\n"
+ "ld1b { z17.b }, p0/Z, [x22, %x[ld_weight_col]]\n"
+ "zip1 z21.b, z16.b, z18.b\n"
+ "ld1b { z16.b }, p0/Z, [x22, x23]\n"
+ "udot z24.s, z28.b, z21.b\n"
+ "add x22, x22, x19\n"
+ "zip1 z18.b, z17.b, z30.b\n"
+ "ld1b { z20.b }, p0/Z, [x21]\n"
+ "ld1b { z19.b }, p0/Z, [x21, %x[ld_weight_col]]\n"
+ "zip1 z17.b, z22.b, z16.b\n"
+ "ld1b { z16.b }, p0/Z, [x21, x23]\n"
+ "zip1 z18.b, z17.b, z18.b\n"
+ "add x21, x21, x19\n"
+ "zip1 z17.b, z19.b, z30.b\n"
+ "udot z24.s, z28.b, z18.b\n"
+ "zip1 z16.b, z20.b, z16.b\n"
+ "zip1 z16.b, z16.b, z17.b\n"
+ "udot z24.s, z28.b, z16.b\n"
+ "mls z23.s, p2/M, z24.s, z29.s\n"
+ "add z23.s, z23.s, z27.s\n"
+ "st1w { z23.s }, p2, [%x[outptr]]\n"
+ "st1b { z21.b }, p2, [%x[outptr], #1, MUL VL]\n"
+ "st1b { z18.b }, p2, [%x[outptr], #2, MUL VL]\n"
+ "st1b { z16.b }, p2, [%x[outptr], #3, MUL VL]\n"
+ "addvl %x[outptr], %x[outptr], #4\n"
+ "cbz %x[rq_mul_perchannel], 3f\n"
+ "ld1w { z26.s }, p1/Z, [%x[rq_mul_perchannel], x20, LSL #2]\n"
+ "ld1w { z25.s }, p1/Z, [%x[rq_shift_perchannel], x20, LSL #2]\n"
+ "3:" // Loop: Quantisation parameters: Store
+ "st1w { z26.s }, p2, [%x[outptr]]\n"
+ "incw x20\n"
+ "st1w { z25.s }, p2, [%x[outptr], #1, MUL VL]\n"
+ "whilelt p1.s, x20, %x[n_channels]\n"
+ "addvl %x[outptr], %x[outptr], #2\n"
+ "b.any 2b\n"
+ : [ld_weight_col] "+&r" (ld_weight_col), [ld_weight_row] "+&r" (ld_weight_row), [outptr] "+&r" (outptr), [weights] "+&r" (weights)
+ : [bias] "r" (bias), [n_channels] "r" (n_channels), [offsetof_input_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_per_layer_mul] "I" (offsetof(arm_gemm::Requantize32, per_layer_mul)), [offsetof_per_layer_right_shift] "I" (offsetof(arm_gemm::Requantize32, per_layer_right_shift)), [offsetof_weights_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [qp] "r" (&qp), [rq_mul_perchannel] "r" (qp.per_channel_muls), [rq_shift_perchannel] "r" (qp.per_channel_right_shifts)
+ : "cc", "memory", "p0", "p1", "p2", "p8", "x19", "x20", "x21", "x22", "x23", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FEATURE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 000000000..bb43d5701
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
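+ // The 4x4 input tile follows from kernel + (output - 1) * stride
+ // = 3 + (2 - 1) * 1 = 4 in each dimension.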
+ constexpr static unsigned int input_rows = 4;
+ constexpr static unsigned int input_cols = 4;
+
+ indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+
+ a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 000000000..99f46015a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,528 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
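+
+ // The assembly reads this struct through the offsetof() operands below;
+ // tile_i and tile_j live here so the tile loop can spill and reload its
+ // counters across iterations.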
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x17, #0x0\n"
+ "mov x16, #0x0\n"
+ "1:" // Tile loop
+ "str x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x2\n"
+ "str x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x15, #0x2\n"
+ "ldr x14, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x13, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x22, #0x0\n"
+ "ldr x12, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x17, x23\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x16, x13, x19\n" // offset += tile_j * ld_input_col
+ "ldr x11, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
+ "ldr x10, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x12, x12, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1r { v18.8h }, [x24]\n"
+ "add x9, x12, x23, LSL #1\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "add x28, x9, x23, LSL #1\n"
+ "lsl x13, x13, #0x1\n"
+ "add x27, x28, x23, LSL #1\n"
+ "add x26, x13, x13\n"
+ "add x25, x26, x13\n"
+ "mul x19, x17, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x16, x11, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x15\n" // offset *= output_tile_size
+ "add x10, x10, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x24, x10, x20, LSL #1\n"
+ "lsl x11, x11, #0x1\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x3\n"
+ "cbz x19, 4f\n"
+ "ldr q16, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x14, #0x20]\n"
+ "ldr q2, [x14, #0x30]\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q4, [x14, #0x50]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "add x14, x14, #0xa0\n"
+ "ldr q9, [x9, x13]\n"
+ "ld1 { v10.8h }, [x12]\n"
+ "ldr q11, [x12, x25]\n"
+ "ldr q12, [x9, x26]\n"
+ "ldr q13, [x28, x13]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+ "add x22, x22, #0x10\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "add x21, x21, #0x10\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x27]\n"
+ "cmp x21, x19, LSL #4\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x28, x26]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x27, x25]\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr q16, [x14, #0x0]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x12, x13]\n"
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x12, x26]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x9]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x9, x25]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x14, #0x50]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x28]\n"
+ "ldr q1, [x14, #0x20]\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q0, [x14, #0x10]\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr q2, [x14, #0x30]\n"
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "fmla v30.8h, v7.8h, v10.8h\n"
+ "ldr q10, [x28, x25]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "ldr q13, [x28, x13]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x27, x13]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x27, x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "ldr q3, [x14, #0x40]\n"
+ "ldr q5, [x14, #0x60]\n"
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x9, x13]\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x12]\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x12, x25]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "ldr q8, [x14, #0x90]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "ldr q12, [x9, x26]\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "ldr q7, [x14, #0x80]\n"
+ "add x14, x14, #0xa0\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "st1 { v31.8h }, [x10]\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q30, [x10, x11]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "st1 { v29.8h }, [x24]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "add x10, x10, #0x10\n"
+ "str q28, [x24, x11]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x27]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x28, x26]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x27, x25]\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x12, x13]\n"
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x12, x26]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x9]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x9, x25]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x28]\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "fmla v30.8h, v7.8h, v10.8h\n"
+ "ldr q10, [x28, x25]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x27, x13]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x27, x26]\n"
+ "add x27, x27, #0x10\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "st1 { v31.8h }, [x10]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "str q30, [x10, x11]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "add x10, x10, #0x10\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "st1 { v29.8h }, [x24]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q28, [x24, x11]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 31f\n"
+ "ldr q16, [x14, #0x0]\n"
+ "ldr q0, [x14, #0x10]\n"
+ "add x23, x9, x13\n"
+ "ldr q1, [x14, #0x20]\n"
+ "add x22, x12, XZR\n"
+ "ldr q2, [x14, #0x30]\n"
+ "add x21, x12, x25\n"
+ "ldr q3, [x14, #0x40]\n"
+ "add x20, x9, x26\n"
+ "ldr q4, [x14, #0x50]\n"
+ "add x19, x28, x13\n"
+ "ldr q5, [x14, #0x60]\n"
+ "ldr q6, [x14, #0x70]\n"
+ "ldr q7, [x14, #0x80]\n"
+ "ldr q8, [x14, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr s9, [x23], #0x4\n"
+ "ldr s10, [x22], #0x4\n"
+ "ldr s11, [x21], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.h }[2], [x23]\n"
+ "ld1 { v10.h }[2], [x22]\n"
+ "ld1 { v11.h }[2], [x21]\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+ "ldr h9, [x23, #0x0]\n"
+ "ldr h10, [x22, #0x0]\n"
+ "ldr h11, [x21, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
+ "ldr h13, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "add x19, x27, XZR\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "add x19, x27, x25\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "add x19, x12, x13\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "add x19, x12, x26\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "add x19, x28, x26\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "add x19, x9, XZR\n"
+ "fmla v30.8h, v7.8h, v10.8h\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "add x19, x9, x25\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "add x19, x28, XZR\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "add x19, x28, x25\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "add x19, x27, x13\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "add x19, x27, x26\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "mov x19, x10\n"
+ "st1 { v31.s }[0], [x19], x11\n"
+ "add x10, x10, #0x4\n"
+ "st1 { v30.s }[0], [x19]\n"
+ "mov x19, x24\n"
+ "st1 { v29.s }[0], [x19], x11\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "mov x20, x10\n"
+ "st1 { v31.h }[2], [x20], x11\n"
+ "mov x19, x24\n"
+ "st1 { v30.h }[2], [x20]\n"
+ "st1 { v29.h }[2], [x19], x11\n"
+ "st1 { v28.h }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x20, x10\n"
+ "st1 { v31.h }[0], [x20], x11\n"
+ "mov x19, x24\n"
+ "st1 { v30.h }[0], [x20]\n"
+ "st1 { v29.h }[0], [x19], x11\n"
+ "st1 { v28.h }[0], [x19]\n"
+ "30:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "31:" // Tile loop: End
+ "ldr x17, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x17, #0x1\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x16, x16, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x16, x19\n"
+ "csel x16, x16, XZR, LT\n"
+ "csel x17, x17, x21, LT\n"
+ "cmp x17, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 000000000..af83238d2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,515 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[16];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
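+ // input_ptrs holds the 4x4 input patch in row-major order; reorder it
+ // into the sequence the kernel consumes, e.g. inptrs[0] is element
+ // (1, 1) and inptrs[1] is (0, 0) (cf. the "Load inputs" labels in the
+ // assembly below).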
+ inptrs[0] = input_ptrs[5];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[3];
+ inptrs[3] = input_ptrs[6];
+ inptrs[4] = input_ptrs[9];
+ inptrs[5] = input_ptrs[12];
+ inptrs[6] = input_ptrs[15];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[2];
+ inptrs[9] = input_ptrs[10];
+ inptrs[10] = input_ptrs[4];
+ inptrs[11] = input_ptrs[7];
+ inptrs[12] = input_ptrs[8];
+ inptrs[13] = input_ptrs[11];
+ inptrs[14] = input_ptrs[13];
+ inptrs[15] = input_ptrs[14];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x21, #0x0]\n"
+ "mov x11, #0x10\n" // cntb _, ALL, #1
+ "ldp x10, x9, [x21, #0x10]\n"
+ "sub x28, XZR, x11\n"
+ "lsr x27, %x[n_channels], #0x3\n"
+ "cbz x27, 3f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldr x22, [x16, #0x20]\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr q10, [x25, x14]\n"
+ "ldr q11, [x24, x14]\n"
+ "ldr q12, [x23, x14]\n"
+ "ldr q13, [x22, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+ "ldr x20, [x16, #0x30]\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "ldr x19, [x16, #0x38]\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr x25, [x16, #0x48]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "ldr x24, [x16, #0x50]\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr q12, [x19, x14]\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldr q16, [x15, #0x0]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q9, [x22, x14]\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "ldr x22, [x16, #0x20]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v30.8h, v7.8h, v10.8h\n"
+ "ldr q10, [x21, x14]\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "ldr q13, [x22, x11]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x20, x14]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "ldr q12, [x19, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "ldr q9, [x26, x11]\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x25, x11]\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x24, x11]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "ldr q8, [x15, #0x90]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "ldr q12, [x23, x11]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "ldr q7, [x15, #0x80]\n"
+ "cmp x11, x27, LSL #4\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "str q31, [x13, x28]\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "add x15, x15, #0xa0\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q30, [x12, x28]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "str q29, [x10, x28]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q28, [x9, x28]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+ "ldr x20, [x16, #0x30]\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "ldr x19, [x16, #0x38]\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x21, x14]\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr x25, [x16, #0x48]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "ldr x24, [x16, #0x50]\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr q12, [x19, x14]\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "ldr q9, [x22, x14]\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "fmla v30.8h, v7.8h, v10.8h\n"
+ "ldr q10, [x21, x14]\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x19, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "str q31, [x13, x28]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "str q30, [x12, x28]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q29, [x10, x28]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q28, [x9, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 30f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x28, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "add x13, x13, x28\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x12, x12, x28\n"
+ "ldr q3, [x15, #0x40]\n"
+ "add x10, x10, x28\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x9, x9, x28\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x26, [x16, #0x0]\n"
+ "ldr x25, [x16, #0x8]\n"
+ "add x26, x26, x14\n"
+ "ldr x24, [x16, #0x10]\n"
+ "ldr x23, [x16, #0x18]\n"
+ "add x25, x25, x14\n"
+ "ldr x22, [x16, #0x20]\n"
+ "add x24, x24, x14\n"
+ "add x23, x23, x14\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v13.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v13.h }[2], [x22], #0x2\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: Unset
+ "ld1 { v9.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v13.h }[0], [x22], #0x2\n"
+ "5:" // Oddments: Load inputs (1, 1), (0, 0), (0, 3), (1, 2), (2, 1): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.8h, v4.8h, v9.8h\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x21, x21, x14\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v3.8h, v9.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v1.8h, v9.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v31.8h, v5.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v9.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v9.h }[2], [x21], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v9.h }[0], [x21], #0x2\n"
+ "7:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr x20, [x16, #0x30]\n"
+ "fmla v31.8h, v7.8h, v13.8h\n"
+ "add x20, x20, x14\n"
+ "fmla v30.8h, v6.8h, v13.8h\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "9:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ldr x19, [x16, #0x38]\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v12.h }[2], [x19], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v12.h }[0], [x19], #0x2\n"
+ "11:" // Oddments: Load input (0, 1): Bit 1: End
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v9.h }[2], [x26], #0x2\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v9.h }[0], [x26], #0x2\n"
+ "13:" // Oddments: Load input (0, 2): Bit 1: End
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr x25, [x16, #0x48]\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 2): Bit 1: Unset
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "15:" // Oddments: Load input (2, 2): Bit 1: End
+ "fmla v31.8h, v8.8h, v10.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "fmla v30.8h, v7.8h, v10.8h\n"
+ "add x24, x24, x14\n"
+ "fmla v29.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "17:" // Oddments: Load input (1, 0): Bit 1: End
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "19:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v28.8h, v2.8h, v12.8h\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "21:" // Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v9.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "add x21, x21, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v10.h }[0], [x21], #0x2\n"
+ "23:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v28.8h, v5.8h, v10.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "25:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v29.8h, v7.8h, v11.8h\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v28.8h, v6.8h, v11.8h\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v12.h }[2], [x19], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v12.h }[0], [x19], #0x2\n"
+ "27:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "st1 { v31.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x10], #0x4\n"
+ "st1 { v28.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "st1 { v31.h }[2], [x13], #0x2\n"
+ "st1 { v30.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x10], #0x2\n"
+ "st1 { v28.h }[2], [x9], #0x2\n"
+ "b 29f\n"
+ "28:" // Oddments: Store: Bit 1: Unset
+ "st1 { v31.h }[0], [x13], #0x2\n"
+ "st1 { v30.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x10], #0x2\n"
+ "st1 { v28.h }[0], [x9], #0x2\n"
+ "29:" // Oddments: Store: Bit 1: End
+
+ "30:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
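
[Editor's note on the "Oddments" pattern above] The tail blocks handle leftover channels without a scalar loop: the code tests individual bits of n_channels with tbz and falls through to progressively narrower loads (a 4-byte ld1 covering two halves when bit 1 is set, then a single half-word when bit 0 is set), with the stores at the end mirroring the same bit tests. A minimal scalar sketch of that predicated tail load, assuming an AArch64 toolchain where __fp16 is available (as in these files); load_fp16_tail is a hypothetical helper, not part of the library:

#include <cstring>

// Scalar equivalent of the tbz-driven tail loads above. Bit 1 of
// n_channels selects a 4-byte load (two __fp16 lanes 0..1), bit 0 a
// final 2-byte load into the next lane -- lane 2 if bit 1 was set,
// lane 0 otherwise, exactly as the ld1 { ... }[0]/[2] forms do.
static inline void load_fp16_tail(const __fp16 *src, __fp16 *lanes, unsigned int n_channels)
{
    if (n_channels & 2)   // fall-through path of "tbz %x[n_channels], #1"
    {
        std::memcpy(lanes, src, 2 * sizeof(__fp16));
        src   += 2;
        lanes += 2;
    }
    if (n_channels & 1)   // fall-through path of "tbz %x[n_channels], #0"
    {
        lanes[0] = src[0];
    }
}

Keeping the tail branch-structured rather than looped lets the generated assembly stay branch-predictable and reuse the same vector registers as the main channel loop.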
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
new file mode 100644
index 000000000..90db8703b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 3;
+ constexpr static unsigned int output_cols = 3;
+
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+
+ a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
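
[Editor's note] The constexpr members of this descriptor encode the usual depthfirst tile geometry: with a 3x3 kernel at stride 1 producing a 3x3 output tile, each input patch dimension must be (output - 1) * stride + kernel = 5, which is where input_rows/input_cols = 5 come from. A compile-time sketch of that relation (input_extent is illustrative only, not a library function):

// Tile-geometry invariant relating the constants in the struct above.
constexpr unsigned int input_extent(unsigned int output, unsigned int stride, unsigned int kernel)
{
    return (output - 1) * stride + kernel;
}

static_assert(input_extent(3, 1, 3) == 5, "3x3 output, stride 1, 3x3 kernel -> 5x5 input patch");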
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 000000000..3bdd544a5
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,829 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x7, #0x0\n"
+ "mov x8, #0x0\n"
+ "1:" // Tile loop
+ "str x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x3\n"
+ "str x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x25, #0x3\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x23, #0x0\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x7, x22\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x8, x16, x19\n" // offset += tile_j * ld_input_col
+ "ldr x14, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+ "ldr x13, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x15, x15, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1r { v18.8h }, [x24]\n"
+ "add x12, x15, x22, LSL #1\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "add x11, x12, x22, LSL #1\n"
+ "lsl x16, x16, #0x1\n"
+ "add x10, x11, x22, LSL #1\n"
+ "add x9, x10, x22, LSL #1\n"
+ "add x28, x16, x16\n"
+ "add x27, x28, x16\n"
+ "add x26, x27, x16\n"
+ "mul x19, x7, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x8, x14, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x25\n" // offset *= output_tile_size
+ "add x13, x13, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x25, x13, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "lsl x14, x14, #0x1\n"
+ "add x22, x14, x14\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x3\n"
+ "cbz x19, 4f\n"
+ "ldr q16, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x17, #0x20]\n"
+ "ldr q2, [x17, #0x30]\n"
+ "ldr q3, [x17, #0x40]\n"
+ "ldr q4, [x17, #0x50]\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "add x17, x17, #0xa0\n"
+ "ldr q9, [x11, x28]\n"
+ "ld1 { v10.8h }, [x15]\n"
+ "ldr q11, [x15, x26]\n"
+ "ld1 { v12.8h }, [x9]\n"
+ "ldr q13, [x12, x28]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "add x23, x23, #0x10\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "add x21, x21, #0x10\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "cmp x21, x19, LSL #4\n"
+ "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+ "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "ldr q16, [x17, #0x0]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x11, x27]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x11, x16]\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x9, x26]\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v26.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x15, x16]\n"
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x15, x27]\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "fmla v30.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x12]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x12, x26]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x10]\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x10, x28]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x10, x26]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x9, x16]\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x12, x16]\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmla v26.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x9, x27]\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x12, x27]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x10, x16]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v5.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v24.8h, v8.8h, v13.8h\n"
+ "ld1 { v10.8h }, [x15]\n"
+ "fmla v23.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x10, x27]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x11]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "ldr q1, [x17, #0x20]\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x11, x26]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "ldr q9, [x11, x28]\n"
+ "fmla v26.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v5.8h, v13.8h\n"
+ "fmla v23.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x9, x28]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "ldr q4, [x17, #0x50]\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x17, #0x40]\n"
+ "fmla v25.8h, v0.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x9]\n"
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "ldr q0, [x17, #0x10]\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "ldr q5, [x17, #0x60]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x15, x26]\n"
+ "fmla v25.8h, v8.8h, v13.8h\n"
+ "ldr q2, [x17, #0x30]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "ldr q7, [x17, #0x80]\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "ldr q8, [x17, #0x90]\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x12, x28]\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "ldr q6, [x17, #0x70]\n"
+ "add x17, x17, #0xa0\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "st1 { v31.8h }, [x13]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "str q30, [x13, x14]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q29, [x13, x22]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "st1 { v28.8h }, [x25]\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "fmin v27.8h, v27.8h, v17.8h\n"
+ "str q27, [x25, x14]\n"
+ "fmin v26.8h, v26.8h, v17.8h\n"
+ "fmin v25.8h, v25.8h, v17.8h\n"
+ "str q26, [x25, x22]\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
+ "add x25, x25, #0x10\n"
+ "fmax v23.8h, v23.8h, v18.8h\n"
+ "st1 { v25.8h }, [x24]\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "str q24, [x24, x14]\n"
+ "fmin v23.8h, v23.8h, v17.8h\n"
+ "str q23, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+ "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x11, x27]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x11, x16]\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x9, x26]\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v26.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x15, x16]\n"
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x15, x27]\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "fmla v30.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x12]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x12, x26]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x10]\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x10, x28]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x10, x26]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x9, x16]\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x12, x16]\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmla v26.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x9, x27]\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x12, x27]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x10, x16]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v5.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v24.8h, v8.8h, v13.8h\n"
+ "fmla v23.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x10, x27]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ld1 { v12.8h }, [x11]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x11, x26]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "fmla v26.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v5.8h, v13.8h\n"
+ "fmla v23.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x9, x28]\n"
+ "add x9, x9, #0x10\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v0.8h, v12.8h\n"
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v25.8h, v8.8h, v13.8h\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "st1 { v31.8h }, [x13]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "str q30, [x13, x14]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q29, [x13, x22]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "add x13, x13, #0x10\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "st1 { v28.8h }, [x25]\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "fmin v27.8h, v27.8h, v17.8h\n"
+ "str q27, [x25, x14]\n"
+ "fmin v26.8h, v26.8h, v17.8h\n"
+ "fmin v25.8h, v25.8h, v17.8h\n"
+ "str q26, [x25, x22]\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
+ "add x25, x25, #0x10\n"
+ "fmax v23.8h, v23.8h, v18.8h\n"
+ "st1 { v25.8h }, [x24]\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "str q24, [x24, x14]\n"
+ "fmin v23.8h, v23.8h, v17.8h\n"
+ "str q23, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 49f\n"
+ "ldr q16, [x17, #0x0]\n"
+ "ldr q0, [x17, #0x10]\n"
+ "add x23, x11, x28\n"
+ "ldr q1, [x17, #0x20]\n"
+ "add x22, x15, XZR\n"
+ "ldr q2, [x17, #0x30]\n"
+ "add x21, x15, x26\n"
+ "ldr q3, [x17, #0x40]\n"
+ "add x20, x9, XZR\n"
+ "ldr q4, [x17, #0x50]\n"
+ "add x19, x12, x28\n"
+ "ldr q5, [x17, #0x60]\n"
+ "ldr q6, [x17, #0x70]\n"
+ "ldr q7, [x17, #0x80]\n"
+ "ldr q8, [x17, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr s9, [x23], #0x4\n"
+ "ldr s10, [x22], #0x4\n"
+ "ldr s11, [x21], #0x4\n"
+ "ldr s12, [x20], #0x4\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.h }[2], [x23]\n"
+ "ld1 { v10.h }[2], [x22]\n"
+ "ld1 { v11.h }[2], [x21]\n"
+ "ld1 { v12.h }[2], [x20]\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+ "ldr h9, [x23, #0x0]\n"
+ "ldr h10, [x22, #0x0]\n"
+ "ldr h11, [x21, #0x0]\n"
+ "ldr h12, [x20, #0x0]\n"
+ "ldr h13, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x19, x9, x26\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+ "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v26.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "add x19, x11, x16\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "add x19, x15, x16\n"
+ "fmla v30.8h, v6.8h, v11.8h\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "add x19, x15, x27\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "add x19, x11, x27\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "add x19, x12, XZR\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "add x19, x12, x26\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "add x19, x10, XZR\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "add x19, x10, x28\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "add x19, x10, x26\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "fmla v26.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "add x19, x9, x16\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v25.8h, v7.8h, v13.8h\n"
+ "add x19, x12, x16\n"
+ "fmla v24.8h, v6.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "add x19, x12, x27\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v30.8h, v5.8h, v11.8h\n"
+ "add x19, x9, x27\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v24.8h, v8.8h, v13.8h\n"
+ "add x19, x10, x16\n"
+ "fmla v23.8h, v7.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "add x19, x15, x28\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "add x19, x10, x27\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "add x19, x11, XZR\n"
+ "fmla v26.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v5.8h, v13.8h\n"
+ "fmla v23.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "add x19, x11, x26\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "add x19, x9, x28\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v25.8h, v8.8h, v13.8h\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "fmin v27.8h, v27.8h, v17.8h\n"
+ "fmin v26.8h, v26.8h, v17.8h\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
+ "fmax v23.8h, v23.8h, v18.8h\n"
+ "fmin v25.8h, v25.8h, v17.8h\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "fmin v23.8h, v23.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "mov x19, x13\n"
+ "st1 { v31.s }[0], [x19], x14\n"
+ "add x13, x13, #0x4\n"
+ "st1 { v30.s }[0], [x19], x14\n"
+ "mov x20, x25\n"
+ "st1 { v29.s }[0], [x19]\n"
+ "st1 { v28.s }[0], [x20], x14\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v27.s }[0], [x20], x14\n"
+ "mov x19, x24\n"
+ "st1 { v26.s }[0], [x20]\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v25.s }[0], [x19], x14\n"
+ "st1 { v24.s }[0], [x19], x14\n"
+ "st1 { v23.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "mov x21, x13\n"
+ "st1 { v31.h }[2], [x21], x14\n"
+ "mov x20, x25\n"
+ "st1 { v30.h }[2], [x21], x14\n"
+ "st1 { v28.h }[2], [x20], x14\n"
+ "mov x19, x24\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "st1 { v27.h }[2], [x20], x14\n"
+ "st1 { v26.h }[2], [x20]\n"
+ "st1 { v25.h }[2], [x19], x14\n"
+ "st1 { v24.h }[2], [x19], x14\n"
+ "st1 { v23.h }[2], [x19]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x21, x13\n"
+ "st1 { v31.h }[0], [x21], x14\n"
+ "mov x20, x25\n"
+ "mov x19, x24\n"
+ "st1 { v30.h }[0], [x21], x14\n"
+ "st1 { v28.h }[0], [x20], x14\n"
+ "st1 { v29.h }[0], [x21]\n"
+ "st1 { v27.h }[0], [x20], x14\n"
+ "st1 { v26.h }[0], [x20]\n"
+ "st1 { v25.h }[0], [x19], x14\n"
+ "st1 { v24.h }[0], [x19], x14\n"
+ "st1 { v23.h }[0], [x19]\n"
+ "48:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "49:" // Tile loop: End
+ "ldr x7, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x7, #0x1\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x8, x8, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x8, x19\n"
+ "csel x8, x8, XZR, LT\n"
+ "csel x7, x7, x21, LT\n"
+ "cmp x7, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
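
[Editor's note] Stripped of the vector arithmetic, the assembly above is a row-major walk over output tiles: label "1:" rebuilds the input and output base pointers for the current (tile_i, tile_j) from the row/column strides (see the "offset = tile_i * ld_input_row" comments), and label "49:" advances tile_j, using csel to wrap to the next tile row once it reaches n_tile_cols. A C++ sketch of just that traversal, assuming the 3x3 scaling hard-coded by "mov x26, #0x3"; do_tile is a hypothetical stand-in for the per-tile compute:

#include <cstdint>

// Placeholder for the body of one 3x3 output tile (labels 2:/3: above).
static void do_tile(const __fp16 *, __fp16 *) {}

// Tile traversal mirroring labels "1:" and "49:". Strides are in
// elements, as in the kernel; offsets are scaled by the tile size (3)
// before being applied, matching the "offset *=" comments.
static void walk_tiles(unsigned int n_tile_rows, unsigned int n_tile_cols,
                       const __fp16 *inptr, int64_t ld_input_row, int64_t ld_input_col,
                       __fp16 *outptr, int64_t ld_output_row, int64_t ld_output_col)
{
    for (uint64_t tile_i = 0; tile_i < n_tile_rows; tile_i++)
    {
        for (uint64_t tile_j = 0; tile_j < n_tile_cols; tile_j++)
        {
            const __fp16 *in  = inptr  + 3 * (tile_i * ld_input_row  + tile_j * ld_input_col);
            __fp16       *out = outptr + 3 * (tile_i * ld_output_row + tile_j * ld_output_col);
            do_tile(in, out);
        }
    }
}

Recomputing the base pointers per tile (rather than incrementally) is what lets the kernel spill tile_i/tile_j into the Args struct and reload them at "49:" without carrying extra live registers through the channel loop.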
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 000000000..ed47c308c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,907 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[4];
+ inptrs[3] = input_ptrs[20];
+ inptrs[4] = input_ptrs[7];
+ inptrs[5] = input_ptrs[24];
+ inptrs[6] = input_ptrs[11];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[3];
+ inptrs[9] = input_ptrs[13];
+ inptrs[10] = input_ptrs[5];
+ inptrs[11] = input_ptrs[9];
+ inptrs[12] = input_ptrs[15];
+ inptrs[13] = input_ptrs[17];
+ inptrs[14] = input_ptrs[19];
+ inptrs[15] = input_ptrs[21];
+ inptrs[16] = input_ptrs[6];
+ inptrs[17] = input_ptrs[8];
+ inptrs[18] = input_ptrs[23];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[18];
+ inptrs[22] = input_ptrs[10];
+ inptrs[23] = input_ptrs[14];
+ inptrs[24] = input_ptrs[22];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "mov x13, #0x10\n" // cntb _, ALL, #1
+ "sub x12, XZR, x13\n"
+ "lsr x11, %x[n_channels], #0x3\n"
+ "cbz x11, 3f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x13, x11, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "ldp x28, x27, [x16, #0x10]\n"
+ "ldr x26, [x16, #0x20]\n"
+ "ldr q9, [x10, x14]\n"
+ "ldr q10, [x9, x14]\n"
+ "ldr q11, [x28, x14]\n"
+ "ldr q12, [x27, x14]\n"
+ "ldr q13, [x26, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "add x12, x12, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr x23, [x16, #0x38]\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "ldr x10, [x16, #0x40]\n"
+ "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "ldr x9, [x16, #0x48]\n"
+ "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+ "ldr x28, [x16, #0x50]\n"
+ "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "ldr x27, [x16, #0x58]\n"
+ "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+ "ldr x26, [x16, #0x60]\n"
+ "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x14]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x25, x14]\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla v26.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x23, x14]\n"
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x10, x14]\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v30.8h, v6.8h, v11.8h\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "ldr q16, [x15, #0x0]\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x28, x14]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x27, x14]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x26, x14]\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x23, x14]\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x10, x14]\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v26.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x28, x14]\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x9, x14]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v5.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "fmla v24.8h, v8.8h, v13.8h\n"
+ "ldr x26, [x16, #0x20]\n"
+ "fmla v23.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x25, x14]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x23, x14]\n"
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "fmla v26.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v5.8h, v13.8h\n"
+ "fmla v23.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldp x28, x27, [x16, #0x10]\n"
+ "fmla v25.8h, v0.8h, v12.8h\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "ldr q9, [x10, x13]\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "ldr q10, [x9, x13]\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x28, x13]\n"
+ "fmla v25.8h, v8.8h, v13.8h\n"
+ "ldr q12, [x27, x13]\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x26, x13]\n"
+ "add x13, x13, #0x10\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "ldr q4, [x15, #0x50]\n"
+ "cmp x13, x11, LSL #4\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q31, [x22, x12]\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "ldr q7, [x15, #0x80]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q30, [x21, x12]\n"
+ "fmin v27.8h, v27.8h, v17.8h\n"
+ "str q29, [x20, x12]\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "str q28, [x19, x12]\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
+ "str q27, [x22, x12]\n"
+ "fmin v26.8h, v26.8h, v17.8h\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmin v25.8h, v25.8h, v17.8h\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "str q26, [x21, x12]\n"
+ "fmax v23.8h, v23.8h, v18.8h\n"
+ "str q25, [x20, x12]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin v23.8h, v23.8h, v17.8h\n"
+ "str q24, [x19, x12]\n"
+ "str q23, [x22, x12]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "add x12, x12, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr x23, [x16, #0x38]\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "ldr x10, [x16, #0x40]\n"
+ "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "ldr x9, [x16, #0x48]\n"
+ "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+ "ldr x28, [x16, #0x50]\n"
+ "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "ldr x27, [x16, #0x58]\n"
+ "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+ "ldr x26, [x16, #0x60]\n"
+ "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x14]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x25, x14]\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "ldr x21, [x17, #0x8]\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla v26.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x23, x14]\n"
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "ldr q12, [x10, x14]\n"
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v30.8h, v6.8h, v11.8h\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x28, x14]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "ldr q13, [x27, x14]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x26, x14]\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x23, x14]\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x10, x14]\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v26.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "fmla v25.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v6.8h, v13.8h\n"
+ "ldr q13, [x28, x14]\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x9, x14]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "fmla v30.8h, v5.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "fmla v24.8h, v8.8h, v13.8h\n"
+ "fmla v23.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x25, x14]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x23, x14]\n"
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "fmla v26.8h, v7.8h, v13.8h\n"
+ "fmla v24.8h, v5.8h, v13.8h\n"
+ "fmla v23.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x10, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v25.8h, v0.8h, v12.8h\n"
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "fmla v25.8h, v8.8h, v13.8h\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "str q31, [x22, x12]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q30, [x21, x12]\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q29, [x20, x12]\n"
+ "fmin v27.8h, v27.8h, v17.8h\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "str q28, [x19, x12]\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
+ "str q27, [x22, x12]\n"
+ "fmin v26.8h, v26.8h, v17.8h\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin v25.8h, v25.8h, v17.8h\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "str q26, [x21, x12]\n"
+ "fmax v23.8h, v23.8h, v18.8h\n"
+ "str q25, [x20, x12]\n"
+ "str q24, [x19, x12]\n"
+ "fmin v23.8h, v23.8h, v17.8h\n"
+ "str q23, [x22, x12]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 48f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x12, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x10, [x16, #0x0]\n"
+ "add x10, x10, x14\n"
+ "ldr x9, [x16, #0x8]\n"
+ "ldr x28, [x16, #0x10]\n"
+ "add x9, x9, x14\n"
+ "ldr x27, [x16, #0x18]\n"
+ "ldr x26, [x16, #0x20]\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[0], [x10], #0x4\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v11.s }[0], [x28], #0x4\n"
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "ld1 { v13.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.h }[2], [x10], #0x2\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "ld1 { v11.h }[2], [x28], #0x2\n"
+ "ld1 { v12.h }[2], [x27], #0x2\n"
+ "ld1 { v13.h }[2], [x26], #0x2\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: Unset
+ "ld1 { v9.h }[0], [x10], #0x2\n"
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "ld1 { v11.h }[0], [x28], #0x2\n"
+ "ld1 { v12.h }[0], [x27], #0x2\n"
+ "ld1 { v13.h }[0], [x26], #0x2\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 4), (4, 0), (1, 2): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "add x25, x25, x14\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v5.8h, v9.8h\n"
+ "mov v27.16b, v16.16b\n fmla v27.8h, v4.8h, v9.8h\n"
+ "mov v26.16b, v16.16b\n fmla v26.8h, v3.8h, v9.8h\n"
+ "mov v25.16b, v16.16b\n fmla v25.8h, v2.8h, v9.8h\n"
+ "mov v24.16b, v16.16b\n fmla v24.8h, v1.8h, v9.8h\n"
+ "mov v23.16b, v16.16b\n fmla v23.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v25.8h, v6.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v13.8h\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v13.8h\n"
+ "fmla v27.8h, v1.8h, v13.8h\n"
+ "fmla v26.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v12.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v12.h }[2], [x25], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v12.h }[0], [x25], #0x2\n"
+ "7:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v23.8h, v8.8h, v12.8h\n"
+ "ldr x24, [x16, #0x30]\n"
+ "add x24, x24, x14\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "9:" // Oddments: Load input (2, 1): Bit 1: End
+ "fmla v31.8h, v7.8h, v11.8h\n"
+ "ldr x23, [x16, #0x38]\n"
+ "fmla v30.8h, v6.8h, v11.8h\n"
+ "add x23, x23, x14\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v27.8h, v3.8h, v11.8h\n"
+ "fmla v25.8h, v1.8h, v11.8h\n"
+ "fmla v24.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v13.h }[2], [x23], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v13.h }[0], [x23], #0x2\n"
+ "11:" // Oddments: Load input (0, 1): Bit 1: End
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "ldr x10, [x16, #0x40]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "add x10, x10, x14\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v12.h }[2], [x10], #0x2\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 3): Bit 1: Unset
+ "ld1 { v12.h }[0], [x10], #0x2\n"
+ "13:" // Oddments: Load input (0, 3): Bit 1: End
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr x9, [x16, #0x48]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "add x9, x9, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "15:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v30.8h, v8.8h, v10.8h\n"
+ "ldr x28, [x16, #0x50]\n"
+ "fmla v29.8h, v7.8h, v10.8h\n"
+ "add x28, x28, x14\n"
+ "fmla v27.8h, v5.8h, v10.8h\n"
+ "fmla v26.8h, v4.8h, v10.8h\n"
+ "fmla v24.8h, v2.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.s }[0], [x28], #0x4\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.h }[2], [x28], #0x2\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v11.h }[0], [x28], #0x2\n"
+ "17:" // Oddments: Load input (1, 0): Bit 1: End
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr x27, [x16, #0x58]\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[2], [x27], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v13.h }[0], [x27], #0x2\n"
+ "19:" // Oddments: Load input (1, 4): Bit 1: End
+ "fmla v29.8h, v5.8h, v13.8h\n"
+ "ldr x26, [x16, #0x60]\n"
+ "fmla v26.8h, v2.8h, v13.8h\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v12.h }[2], [x26], #0x2\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v12.h }[0], [x26], #0x2\n"
+ "21:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v25.8h, v3.8h, v12.8h\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "23:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v28.8h, v8.8h, v10.8h\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v27.8h, v7.8h, v10.8h\n"
+ "add x24, x24, x14\n"
+ "fmla v26.8h, v6.8h, v10.8h\n"
+ "fmla v25.8h, v5.8h, v10.8h\n"
+ "fmla v24.8h, v4.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "25:" // Oddments: Load input (3, 4): Bit 1: End
+ "fmla v26.8h, v8.8h, v11.8h\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v23.8h, v5.8h, v11.8h\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v13.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v13.h }[2], [x23], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.h }[0], [x23], #0x2\n"
+ "27:" // Oddments: Load input (4, 1): Bit 1: End
+ "fmla v25.8h, v7.8h, v13.8h\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v24.8h, v6.8h, v13.8h\n"
+ "add x10, x10, x14\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v12.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v12.h }[2], [x10], #0x2\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (1, 1): Bit 1: Unset
+ "ld1 { v12.h }[0], [x10], #0x2\n"
+ "29:" // Oddments: Load input (1, 1): Bit 1: End
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "add x9, x9, x14\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v27.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x9], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v11.h }[0], [x9], #0x2\n"
+ "31:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v30.8h, v5.8h, v11.8h\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "add x28, x28, x14\n"
+ "fmla v27.8h, v2.8h, v11.8h\n"
+ "fmla v26.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v13.s }[0], [x28], #0x4\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v13.h }[2], [x28], #0x2\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v13.h }[0], [x28], #0x2\n"
+ "33:" // Oddments: Load input (4, 3): Bit 1: End
+ "fmla v24.8h, v8.8h, v13.8h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v23.8h, v7.8h, v13.8h\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v12.h }[2], [x27], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v12.h }[0], [x27], #0x2\n"
+ "35:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v27.8h, v6.8h, v12.8h\n"
+ "add x26, x26, x14\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "fmla v24.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v11.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v11.h }[2], [x26], #0x2\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v11.h }[0], [x26], #0x2\n"
+ "37:" // Oddments: Load input (0, 2): Bit 1: End
+ "fmla v31.8h, v2.8h, v11.8h\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v30.8h, v1.8h, v11.8h\n"
+ "add x25, x25, x14\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v13.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v13.h }[2], [x25], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v13.h }[0], [x25], #0x2\n"
+ "39:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v27.8h, v8.8h, v13.8h\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v26.8h, v7.8h, v13.8h\n"
+ "add x24, x24, x14\n"
+ "fmla v24.8h, v5.8h, v13.8h\n"
+ "fmla v23.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v12.h }[2], [x24], #0x2\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v12.h }[0], [x24], #0x2\n"
+ "41:" // Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v12.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "add x23, x23, x14\n"
+ "fmla v25.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v11.h }[2], [x23], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x23], #0x2\n"
+ "43:" // Oddments: Load input (2, 4): Bit 1: End
+ "fmla v29.8h, v8.8h, v11.8h\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v26.8h, v5.8h, v11.8h\n"
+ "add x10, x10, x14\n"
+ "fmla v23.8h, v2.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v13.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v13.h }[2], [x10], #0x2\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v13.h }[0], [x10], #0x2\n"
+ "45:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v25.8h, v8.8h, v13.8h\n"
+ "fmla v24.8h, v7.8h, v13.8h\n"
+ "fmla v23.8h, v6.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmax v27.8h, v27.8h, v18.8h\n"
+ "fmax v26.8h, v26.8h, v18.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "fmin v27.8h, v27.8h, v17.8h\n"
+ "fmin v26.8h, v26.8h, v17.8h\n"
+ "fmax v25.8h, v25.8h, v18.8h\n"
+ "fmax v24.8h, v24.8h, v18.8h\n"
+ "fmax v23.8h, v23.8h, v18.8h\n"
+ "fmin v25.8h, v25.8h, v17.8h\n"
+ "fmin v24.8h, v24.8h, v17.8h\n"
+ "fmin v23.8h, v23.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ldr x22, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "add x22, x22, x12\n"
+ "ldr x20, [x17, #0x10]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.s }[0], [x22]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.s }[0], [x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.s }[0], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.s }[0], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.s }[0], [x19]\n"
+ "add x12, x12, #0x4\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ldr x22, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "add x22, x22, x12\n"
+ "ldr x20, [x17, #0x10]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.h }[2], [x22]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.h }[2], [x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.h }[2], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.h }[2], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.h }[2], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.h }[2], [x19]\n"
+ "st1 { v23.h }[2], [x22]\n"
+ "b 47f\n"
+ "46:" // Oddments: Store: Bit 1: Unset
+ "ldr x22, [x17, #0x0]\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x17, #0x8]\n"
+ "ldr x20, [x17, #0x10]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.h }[0], [x22]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.h }[0], [x20]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.h }[0], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.h }[0], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.h }[0], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.h }[0], [x19]\n"
+ "st1 { v23.h }[0], [x22]\n"
+ "47:" // Oddments: Store: Bit 1: End
+
+ "48:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
new file mode 100644
index 000000000..df5328724
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
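+// Each kernel exposes two entry points: the "indirect" variant consumes
+// arrays of pre-computed input and output pointers, while the "direct"
+// variant walks an n_tile_rows x n_tile_cols grid from base pointers
+// using the supplied row and column strides.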
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
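+ // VLType::None marks this as a fixed-length NEON kernel rather than a
+ // vector-length-agnostic SVE one.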
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 4;
+ constexpr static unsigned int output_cols = 4;
+
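+ // For a 3x3 window at stride 1, a 4x4 output tile reads a
+ // (4 + 3 - 1) x (4 + 3 - 1) = 6x6 input patch.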
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+
+ a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 000000000..bf1846919
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,1233 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
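+ // Gather the arguments into one struct so the inline assembly below can
+ // reach every field through a single base register, via compile-time
+ // offsetof() values passed as "I" constraints.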
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ const float activation_min,
+ const float activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
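+ // Prologue sketch: in plain C, with illustrative names only (not part
+ // of the kernel), the per-tile base addresses computed below are
+ //
+ //   in  = inptr  + 4 * (tile_i * ld_input_row  + tile_j * ld_input_col);
+ //   out = outptr + 4 * (tile_i * ld_output_row + tile_j * ld_output_col);
+ //
+ // in units of __fp16 elements; the factor 4 is the output tile size at
+ // stride 1, and the "LSL #1" in the assembly scales elements to bytes.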
+ __asm__ __volatile__(
+ "mov x4, #0x0\n"
+ "mov x26, #0x0\n"
+ "1:" // Tile loop
+ "str x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x25, #0x4\n"
+ "str x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x24, #0x4\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x23, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x22, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x7, #0x0\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x4, x22\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x26, x6, x19\n" // offset += tile_j * ld_input_col
+ "ldr x17, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x25\n" // offset *= kernel_stride * output_size
+ "ldr x16, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x8, x8, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1r { v15.8h }, [x23]\n"
+ "add x15, x8, x22, LSL #1\n"
+ "ld1r { v14.8h }, [x21]\n"
+ "add x14, x15, x22, LSL #1\n"
+ "lsl x6, x6, #0x1\n"
+ "add x13, x14, x22, LSL #1\n"
+ "add x12, x13, x22, LSL #1\n"
+ "add x11, x12, x22, LSL #1\n"
+ "add x10, x6, x6\n"
+ "add x9, x10, x6\n"
+ "add x28, x9, x6\n"
+ "add x27, x28, x6\n"
+ "mul x19, x4, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x26, x17, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x24\n" // offset *= output_tile_size
+ "add x16, x16, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x26, x16, x20, LSL #1\n"
+ "add x25, x26, x20, LSL #1\n"
+ "add x24, x25, x20, LSL #1\n"
+ "lsl x17, x17, #0x1\n"
+ "add x23, x17, x17\n"
+ "add x22, x23, x17\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x3\n"
+ "cbz x19, 4f\n"
+ "ldr q13, [x5, #0x0]\n"
+ "ldr q0, [x5, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x5, #0x20]\n"
+ "ldr q2, [x5, #0x30]\n"
+ "ldr q3, [x5, #0x40]\n"
+ "ldr q4, [x5, #0x50]\n"
+ "ldr q5, [x5, #0x60]\n"
+ "ldr q6, [x5, #0x70]\n"
+ "ldr q7, [x5, #0x80]\n"
+ "ldr q8, [x5, #0x90]\n"
+ "add x5, x5, #0xa0\n"
+ "ldr q9, [x14, x10]\n"
+ "ld1 { v10.8h }, [x8]\n"
+ "ldr q11, [x8, x27]\n"
+ "ldr q12, [x14, x9]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "add x7, x7, #0x10\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "add x21, x21, #0x10\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "cmp x21, x19, LSL #4\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x13, x10]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x11]\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x11, x27]\n"
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x8, x6]\n"
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x13, x9]\n"
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x8, x28]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "ldr q13, [x5, #0x0]\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x15, x27]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x12]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v21.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v18.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x15, x10]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x15, x9]\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x12, x27]\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x14, x6]\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x11, x6]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v19.8h, v7.8h, v11.8h\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x11, x28]\n"
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x8, x10]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "fmla v16.8h, v7.8h, v11.8h\n"
+ "ldr q11, [x13, x6]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x8, x9]\n"
+ "add x8, x8, #0x10\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x14]\n"
+ "fmla v27.8h, v7.8h, v11.8h\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v4.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x13, x28]\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x14, x27]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "ldr q9, [x14, x10]\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x13]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v24.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "fmla v20.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x12, x10]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v20.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x13, x27]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v27.8h, v6.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x11, x10]\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v8.8h, v11.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x12, x9]\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v5.8h, v12.8h\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x11, x9]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v19.8h, v8.8h, v10.8h\n"
+ "fmla v18.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x15, x6]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v6.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x12, x6]\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x12, x28]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "ldr q0, [x5, #0x10]\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x5, #0x30]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x8, x27]\n"
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "ldr q1, [x5, #0x20]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q6, [x5, #0x70]\n"
+ "fmla v19.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x14, x9]\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "ldr q3, [x5, #0x40]\n"
+ "fmla v20.8h, v7.8h, v10.8h\n"
+ "ldr q7, [x5, #0x80]\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
+ "ldr q5, [x5, #0x60]\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x8]\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "ldr q4, [x5, #0x50]\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "ldr q8, [x5, #0x90]\n"
+ "add x5, x5, #0xa0\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "st1 { v31.8h }, [x16]\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "str q30, [x16, x17]\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "str q29, [x16, x23]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "str q28, [x16, x22]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "st1 { v27.8h }, [x26]\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "str q26, [x26, x17]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q25, [x26, x23]\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "str q24, [x26, x22]\n"
+ "add x26, x26, #0x10\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "st1 { v23.8h }, [x25]\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q22, [x25, x17]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "str q21, [x25, x23]\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "str q20, [x25, x22]\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "add x25, x25, #0x10\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "st1 { v19.8h }, [x24]\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "str q18, [x24, x17]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "str q17, [x24, x23]\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q16, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x13, x10]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x11]\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x11, x27]\n"
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x8, x6]\n"
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x13, x9]\n"
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x8, x28]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x15, x27]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x12]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v21.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v18.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x15, x10]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x15, x9]\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x12, x27]\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x14, x6]\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x11, x6]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v19.8h, v7.8h, v11.8h\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x11, x28]\n"
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x8, x10]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "fmla v16.8h, v7.8h, v11.8h\n"
+ "ldr q11, [x13, x6]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x8, x9]\n"
+ "add x8, x8, #0x10\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x14]\n"
+ "fmla v27.8h, v7.8h, v11.8h\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v4.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x13, x28]\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x14, x27]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x13]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "fmla v24.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "fmla v20.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x12, x10]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v20.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x13, x27]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v27.8h, v6.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x11, x10]\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v8.8h, v11.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x12, x9]\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v5.8h, v12.8h\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x11, x9]\n"
+ "add x11, x11, #0x10\n"
+ "fmla v19.8h, v8.8h, v10.8h\n"
+ "fmla v18.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x15, x6]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v6.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x15, x28]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x12, x6]\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x12, x28]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "fmla v19.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v20.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "st1 { v31.8h }, [x16]\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "str q30, [x16, x17]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "str q29, [x16, x23]\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "str q28, [x16, x22]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "add x16, x16, #0x10\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "st1 { v27.8h }, [x26]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "str q26, [x26, x17]\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q25, [x26, x23]\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "str q24, [x26, x22]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "add x26, x26, #0x10\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "st1 { v23.8h }, [x25]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "str q22, [x25, x17]\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q21, [x25, x23]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "str q20, [x25, x22]\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "add x25, x25, #0x10\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "st1 { v19.8h }, [x24]\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "str q18, [x24, x17]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "str q17, [x24, x23]\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q16, [x24, x22]\n"
+ "add x24, x24, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 73f\n"
+ "ldr q13, [x5, #0x0]\n"
+ "ldr q0, [x5, #0x10]\n"
+ "add x22, x14, x10\n"
+ "ldr q1, [x5, #0x20]\n"
+ "add x21, x8, XZR\n"
+ "ldr q2, [x5, #0x30]\n"
+ "add x20, x8, x27\n"
+ "ldr q3, [x5, #0x40]\n"
+ "add x19, x14, x9\n"
+ "ldr q4, [x5, #0x50]\n"
+ "ldr q5, [x5, #0x60]\n"
+ "ldr q6, [x5, #0x70]\n"
+ "ldr q7, [x5, #0x80]\n"
+ "ldr q8, [x5, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr s9, [x22], #0x4\n"
+ "ldr s10, [x21], #0x4\n"
+ "ldr s11, [x20], #0x4\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.h }[2], [x22]\n"
+ "ld1 { v10.h }[2], [x21]\n"
+ "ld1 { v11.h }[2], [x20]\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+ "ldr h9, [x22, #0x0]\n"
+ "ldr h10, [x21, #0x0]\n"
+ "ldr h11, [x20, #0x0]\n"
+ "ldr h12, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x19, x11, XZR\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+ "add x19, x11, x27\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+ "add x19, x13, x10\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "add x19, x8, x6\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (0, 1): Bit 1: End
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "add x19, x8, x28\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (0, 4): Bit 1: End
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "add x19, x13, x9\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "add x19, x15, XZR\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v21.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v18.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (1, 0): Bit 1: End
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "add x19, x15, x27\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (1, 5): Bit 1: End
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "add x19, x12, XZR\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "add x19, x15, x10\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "add x19, x12, x27\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "add x19, x15, x9\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "add x19, x11, x6\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+ "fmla v19.8h, v7.8h, v11.8h\n"
+ "add x19, x14, x6\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "add x19, x11, x28\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "add x19, x14, x28\n"
+ "fmla v16.8h, v7.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "add x19, x8, x10\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (0, 2): Bit 1: End
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "add x19, x13, x6\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v27.8h, v7.8h, v11.8h\n"
+ "add x19, x8, x9\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v4.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (0, 3): Bit 1: End
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "add x19, x14, XZR\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "add x19, x13, x28\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "add x19, x14, x27\n"
+ "fmla v24.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "fmla v20.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 50f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 50f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "50:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "add x19, x13, XZR\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v20.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v27.8h, v6.8h, v10.8h\n"
+ "add x19, x12, x10\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 54f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 54f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "54:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v23.8h, v8.8h, v11.8h\n"
+ "add x19, x13, x27\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "add x19, x11, x10\n"
+ "fmla v20.8h, v5.8h, v12.8h\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 58f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 58f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+ "fmla v19.8h, v8.8h, v10.8h\n"
+ "add x19, x12, x9\n"
+ "fmla v18.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 59f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "60:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "add x19, x11, x9\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v6.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 61f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 62f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 62f\n"
+ "61:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "62:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "add x19, x15, x6\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 63f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 64f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 64f\n"
+ "63:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "64:" // Tile loop: Oddments: Load inputs: (1, 1): Bit 1: End
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "add x19, x15, x28\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 65f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 66f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 66f\n"
+ "65:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "66:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "add x19, x12, x6\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 67f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 68f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 68f\n"
+ "67:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "68:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "add x19, x12, x28\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "fmla v19.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 69f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 70f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 70f\n"
+ "69:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "70:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v20.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "tbz %x[n_channels], #1, 71f\n"
+ "mov x19, x16\n"
+ "st1 { v31.s }[0], [x19], x17\n"
+ "add x16, x16, #0x4\n"
+ "st1 { v30.s }[0], [x19], x17\n"
+ "mov x21, x26\n"
+ "st1 { v29.s }[0], [x19], x17\n"
+ "st1 { v27.s }[0], [x21], x17\n"
+ "add x26, x26, #0x4\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "mov x20, x25\n"
+ "st1 { v26.s }[0], [x21], x17\n"
+ "add x25, x25, #0x4\n"
+ "st1 { v25.s }[0], [x21], x17\n"
+ "mov x19, x24\n"
+ "st1 { v24.s }[0], [x21]\n"
+ "add x24, x24, #0x4\n"
+ "st1 { v23.s }[0], [x20], x17\n"
+ "st1 { v22.s }[0], [x20], x17\n"
+ "st1 { v21.s }[0], [x20], x17\n"
+ "st1 { v20.s }[0], [x20]\n"
+ "st1 { v19.s }[0], [x19], x17\n"
+ "st1 { v18.s }[0], [x19], x17\n"
+ "st1 { v17.s }[0], [x19], x17\n"
+ "st1 { v16.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 72f\n"
+ "mov x22, x16\n"
+ "st1 { v31.h }[2], [x22], x17\n"
+ "mov x21, x26\n"
+ "st1 { v30.h }[2], [x22], x17\n"
+ "st1 { v27.h }[2], [x21], x17\n"
+ "mov x20, x25\n"
+ "st1 { v29.h }[2], [x22], x17\n"
+ "mov x19, x24\n"
+ "st1 { v28.h }[2], [x22]\n"
+ "st1 { v26.h }[2], [x21], x17\n"
+ "st1 { v25.h }[2], [x21], x17\n"
+ "st1 { v24.h }[2], [x21]\n"
+ "st1 { v23.h }[2], [x20], x17\n"
+ "st1 { v22.h }[2], [x20], x17\n"
+ "st1 { v21.h }[2], [x20], x17\n"
+ "st1 { v20.h }[2], [x20]\n"
+ "st1 { v19.h }[2], [x19], x17\n"
+ "st1 { v18.h }[2], [x19], x17\n"
+ "st1 { v17.h }[2], [x19], x17\n"
+ "st1 { v16.h }[2], [x19]\n"
+ "b 72f\n"
+ "71:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x22, x16\n"
+ "st1 { v31.h }[0], [x22], x17\n"
+ "mov x21, x26\n"
+ "mov x20, x25\n"
+ "st1 { v30.h }[0], [x22], x17\n"
+ "st1 { v27.h }[0], [x21], x17\n"
+ "mov x19, x24\n"
+ "st1 { v29.h }[0], [x22], x17\n"
+ "st1 { v28.h }[0], [x22]\n"
+ "st1 { v26.h }[0], [x21], x17\n"
+ "st1 { v25.h }[0], [x21], x17\n"
+ "st1 { v24.h }[0], [x21]\n"
+ "st1 { v23.h }[0], [x20], x17\n"
+ "st1 { v22.h }[0], [x20], x17\n"
+ "st1 { v21.h }[0], [x20], x17\n"
+ "st1 { v20.h }[0], [x20]\n"
+ "st1 { v19.h }[0], [x19], x17\n"
+ "st1 { v18.h }[0], [x19], x17\n"
+ "st1 { v17.h }[0], [x19], x17\n"
+ "st1 { v16.h }[0], [x19]\n"
+ "72:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "73:" // Tile loop: End
+ "ldr x4, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x4, #0x1\n"
+ "ldr x26, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x26, x26, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x26, x19\n"
+ "csel x26, x26, XZR, LT\n"
+ "csel x4, x4, x21, LT\n"
+ "cmp x4, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 000000000..40c019a36
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1399 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[14];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[5];
+ inptrs[3] = input_ptrs[15];
+ inptrs[4] = input_ptrs[30];
+ inptrs[5] = input_ptrs[35];
+ inptrs[6] = input_ptrs[20];
+ inptrs[7] = input_ptrs[1];
+ inptrs[8] = input_ptrs[4];
+ inptrs[9] = input_ptrs[21];
+ inptrs[10] = input_ptrs[6];
+ inptrs[11] = input_ptrs[11];
+ inptrs[12] = input_ptrs[24];
+ inptrs[13] = input_ptrs[8];
+ inptrs[14] = input_ptrs[29];
+ inptrs[15] = input_ptrs[9];
+ inptrs[16] = input_ptrs[31];
+ inptrs[17] = input_ptrs[13];
+ inptrs[18] = input_ptrs[34];
+ inptrs[19] = input_ptrs[16];
+ inptrs[20] = input_ptrs[2];
+ inptrs[21] = input_ptrs[19];
+ inptrs[22] = input_ptrs[3];
+ inptrs[23] = input_ptrs[12];
+ inptrs[24] = input_ptrs[22];
+ inptrs[25] = input_ptrs[17];
+ inptrs[26] = input_ptrs[18];
+ inptrs[27] = input_ptrs[26];
+ inptrs[28] = input_ptrs[23];
+ inptrs[29] = input_ptrs[32];
+ inptrs[30] = input_ptrs[27];
+ inptrs[31] = input_ptrs[33];
+ inptrs[32] = input_ptrs[7];
+ inptrs[33] = input_ptrs[10];
+ inptrs[34] = input_ptrs[25];
+ inptrs[35] = input_ptrs[28];
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
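+  // v15/v14 are broadcast with the activation min/max clamps; label 1 is the
+  // main loop over eight fp16 channels at a time, label 2 handles the final
+  // full vector, and label 3 mops up any remaining channels.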
+ __asm__ __volatile__(
+ "ldr x17, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v15.8h }, [x20]\n"
+ "ld1r { v14.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "mov x13, #0x10\n" // cntb _, ALL, #1
+ "sub x12, XZR, x13\n"
+ "lsr x11, %x[n_channels], #0x3\n"
+ "cbz x11, 3f\n"
+ "ldr q13, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x13, x11, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "ldp x28, x27, [x16, #0x10]\n"
+ "ldr q9, [x10, x14]\n"
+ "ldr q10, [x9, x14]\n"
+ "ldr q11, [x28, x14]\n"
+ "ldr q12, [x27, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x26, [x16, #0x20]\n"
+ "add x12, x12, #0x10\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "ldr x23, [x16, #0x38]\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "ldr x10, [x16, #0x40]\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "ldr x9, [x16, #0x48]\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x28, [x16, #0x50]\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "ldr x27, [x16, #0x58]\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x24, x14]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x26, x14]\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "ldr x26, [x16, #0x60]\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "ldr x21, [x17, #0x8]\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x9, x14]\n"
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x28, x14]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "ldr q13, [x15, #0x0]\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v21.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v18.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x14]\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v19.8h, v7.8h, v11.8h\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x28, x14]\n"
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "ldr x28, [x16, #0xd0]\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x26, x14]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "ldr x26, [x16, #0xe0]\n"
+ "fmla v16.8h, v7.8h, v11.8h\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x23, x14]\n"
+ "fmla v27.8h, v7.8h, v11.8h\n"
+ "ldr x23, [x16, #0xf8]\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v4.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x9, x14]\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x28, x14]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "ldr x28, [x16, #0x110]\n"
+ "fmla v24.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "fmla v20.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x27, x14]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "ldr x27, [x16, #0x118]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v20.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x26, x14]\n"
+ "fmla v27.8h, v6.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v8.8h, v11.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v5.8h, v12.8h\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v19.8h, v8.8h, v10.8h\n"
+ "fmla v18.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x10, x14]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v6.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x9, x14]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "ldp x10, x9, [x16, #0x0]\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "ldr q9, [x10, x13]\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x28, x14]\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x27, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "ldp x28, x27, [x16, #0x10]\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x28, x13]\n"
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "ldr q6, [x15, #0x70]\n"
+ "fmla v19.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "ldr q12, [x27, x13]\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "ldr q3, [x15, #0x40]\n"
+ "fmla v20.8h, v7.8h, v10.8h\n"
+ "ldr q7, [x15, #0x80]\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
+ "ldr q5, [x15, #0x60]\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "ldr q10, [x9, x13]\n"
+ "add x13, x13, #0x10\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "ldr q4, [x15, #0x50]\n"
+ "cmp x13, x11, LSL #4\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "str q31, [x22, x12]\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "str q30, [x21, x12]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "str q29, [x20, x12]\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "str q28, [x19, x12]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q27, [x22, x12]\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "str q26, [x21, x12]\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q25, [x20, x12]\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "str q24, [x19, x12]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "str q23, [x22, x12]\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "str q22, [x21, x12]\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "str q21, [x20, x12]\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "ldr x19, [x17, #0x58]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "ldr x22, [x17, #0x60]\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "str q20, [x19, x12]\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "str q19, [x22, x12]\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "str q18, [x21, x12]\n"
+ "ldr x20, [x17, #0x70]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "ldr x19, [x17, #0x78]\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q17, [x20, x12]\n"
+ "str q16, [x19, x12]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x26, [x16, #0x20]\n"
+ "add x12, x12, #0x10\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "ldr x24, [x16, #0x30]\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "ldr x23, [x16, #0x38]\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "ldr x10, [x16, #0x40]\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "ldr x9, [x16, #0x48]\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "ldr x28, [x16, #0x50]\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "ldr x27, [x16, #0x58]\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x24, x14]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x26, x14]\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "ldr x26, [x16, #0x60]\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "ldr x22, [x17, #0x0]\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "ldr x21, [x17, #0x8]\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "ldr x20, [x17, #0x10]\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "ldr x19, [x17, #0x18]\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x9, x14]\n"
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x28, x14]\n"
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v21.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v18.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x9, x14]\n"
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x27, x14]\n"
+ "fmla v19.8h, v7.8h, v11.8h\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "ldr q11, [x28, x14]\n"
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "ldr x28, [x16, #0xd0]\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "fmla v27.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x26, x14]\n"
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "ldr x26, [x16, #0xe0]\n"
+ "fmla v16.8h, v7.8h, v11.8h\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x23, x14]\n"
+ "fmla v27.8h, v7.8h, v11.8h\n"
+ "ldr x23, [x16, #0xf8]\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v4.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x10, x14]\n"
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr q12, [x9, x14]\n"
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x28, x14]\n"
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "ldr x28, [x16, #0x110]\n"
+ "fmla v24.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "fmla v20.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x27, x14]\n"
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "ldr x27, [x16, #0x118]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "fmla v20.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x26, x14]\n"
+ "fmla v27.8h, v6.8h, v10.8h\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x25, x14]\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v23.8h, v8.8h, v11.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x24, x14]\n"
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "fmla v20.8h, v5.8h, v12.8h\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x23, x14]\n"
+ "fmla v19.8h, v8.8h, v10.8h\n"
+ "fmla v18.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "ldr q10, [x10, x14]\n"
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "fmla v20.8h, v6.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x9, x14]\n"
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "ldr q12, [x28, x14]\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "ldr q10, [x27, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "fmla v19.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v20.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "str q31, [x22, x12]\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "ldr x22, [x17, #0x20]\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "str q30, [x21, x12]\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "str q29, [x20, x12]\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "ldr x21, [x17, #0x28]\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "ldr x20, [x17, #0x30]\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "str q28, [x19, x12]\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "ldr x19, [x17, #0x38]\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "str q27, [x22, x12]\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "str q26, [x21, x12]\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "str q25, [x20, x12]\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "ldr x22, [x17, #0x40]\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "ldr x21, [x17, #0x48]\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "ldr x20, [x17, #0x50]\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "str q24, [x19, x12]\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "str q23, [x22, x12]\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "str q22, [x21, x12]\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "ldr x19, [x17, #0x58]\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "ldr x22, [x17, #0x60]\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "ldr x21, [x17, #0x68]\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "str q21, [x20, x12]\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "str q20, [x19, x12]\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "str q19, [x22, x12]\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "ldr x20, [x17, #0x70]\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "ldr x19, [x17, #0x78]\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "str q18, [x21, x12]\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "str q17, [x20, x12]\n"
+ "str q16, [x19, x12]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 72f\n"
+ "ldr q13, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x12, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x10, [x16, #0x0]\n"
+ "add x10, x10, x14\n"
+ "ldr x9, [x16, #0x8]\n"
+ "ldr x28, [x16, #0x10]\n"
+ "add x9, x9, x14\n"
+ "ldr x27, [x16, #0x18]\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[0], [x10], #0x4\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "ld1 { v11.s }[0], [x28], #0x4\n"
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.h }[2], [x10], #0x2\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "ld1 { v11.h }[2], [x28], #0x2\n"
+ "ld1 { v12.h }[2], [x27], #0x2\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: Unset
+ "ld1 { v9.h }[0], [x10], #0x2\n"
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "ld1 { v11.h }[0], [x28], #0x2\n"
+ "ld1 { v12.h }[0], [x27], #0x2\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 5), (2, 3): Bit 1: End
+ "mov v31.16b, v13.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x26, [x16, #0x20]\n"
+ "add x26, x26, x14\n"
+ "mov v30.16b, v13.16b\n fmla v30.8h, v7.8h, v9.8h\n"
+ "mov v29.16b, v13.16b\n fmla v29.8h, v6.8h, v9.8h\n"
+ "mov v27.16b, v13.16b\n fmla v27.8h, v5.8h, v9.8h\n"
+ "mov v26.16b, v13.16b\n fmla v26.8h, v4.8h, v9.8h\n"
+ "mov v25.16b, v13.16b\n fmla v25.8h, v3.8h, v9.8h\n"
+ "mov v23.16b, v13.16b\n fmla v23.8h, v2.8h, v9.8h\n"
+ "mov v22.16b, v13.16b\n fmla v22.8h, v1.8h, v9.8h\n"
+ "mov v21.16b, v13.16b\n fmla v21.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "mov v28.16b, v13.16b\n fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v30.8h, v8.8h, v12.8h\n"
+ "fmla v29.8h, v7.8h, v12.8h\n"
+ "fmla v26.8h, v5.8h, v12.8h\n"
+ "fmla v28.8h, v6.8h, v12.8h\n"
+ "fmla v25.8h, v4.8h, v12.8h\n"
+ "mov v24.16b, v13.16b\n fmla v24.8h, v3.8h, v12.8h\n"
+ "fmla v22.8h, v2.8h, v12.8h\n"
+ "fmla v21.8h, v1.8h, v12.8h\n"
+ "mov v20.16b, v13.16b\n fmla v20.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v10.h }[2], [x26], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (5, 0): Bit 1: Unset
+ "ld1 { v10.h }[0], [x26], #0x2\n"
+ "7:" // Oddments: Load input (5, 0): Bit 1: End
+ "mov v19.16b, v13.16b\n fmla v19.8h, v6.8h, v10.8h\n"
+ "ldr x25, [x16, #0x28]\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v11.h }[2], [x25], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (5, 5): Bit 1: Unset
+ "ld1 { v11.h }[0], [x25], #0x2\n"
+ "9:" // Oddments: Load input (5, 5): Bit 1: End
+ "mov v16.16b, v13.16b\n fmla v16.8h, v8.8h, v11.8h\n"
+ "ldr x24, [x16, #0x30]\n"
+ "add x24, x24, x14\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[2], [x24], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v9.h }[0], [x24], #0x2\n"
+ "11:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v27.8h, v8.8h, v9.8h\n"
+ "ldr x23, [x16, #0x38]\n"
+ "fmla v26.8h, v7.8h, v9.8h\n"
+ "add x23, x23, x14\n"
+ "fmla v25.8h, v6.8h, v9.8h\n"
+ "fmla v23.8h, v5.8h, v9.8h\n"
+ "fmla v22.8h, v4.8h, v9.8h\n"
+ "fmla v21.8h, v3.8h, v9.8h\n"
+ "fmla v19.8h, v2.8h, v9.8h\n"
+ "mov v18.16b, v13.16b\n fmla v18.8h, v1.8h, v9.8h\n"
+ "mov v17.16b, v13.16b\n fmla v17.8h, v0.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (0, 1): Bit 1: Unset
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "13:" // Oddments: Load input (0, 1): Bit 1: End
+ "fmla v31.8h, v1.8h, v12.8h\n"
+ "ldr x10, [x16, #0x40]\n"
+ "fmla v30.8h, v0.8h, v12.8h\n"
+ "add x10, x10, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v11.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v11.h }[2], [x10], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (0, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x10], #0x2\n"
+ "15:" // Oddments: Load input (0, 4): Bit 1: End
+ "fmla v29.8h, v2.8h, v11.8h\n"
+ "ldr x9, [x16, #0x48]\n"
+ "fmla v28.8h, v1.8h, v11.8h\n"
+ "add x9, x9, x14\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "17:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v26.8h, v8.8h, v10.8h\n"
+ "ldr x28, [x16, #0x50]\n"
+ "fmla v25.8h, v7.8h, v10.8h\n"
+ "add x28, x28, x14\n"
+ "fmla v24.8h, v6.8h, v10.8h\n"
+ "fmla v22.8h, v5.8h, v10.8h\n"
+ "fmla v21.8h, v4.8h, v10.8h\n"
+ "fmla v20.8h, v3.8h, v10.8h\n"
+ "fmla v18.8h, v2.8h, v10.8h\n"
+ "fmla v17.8h, v1.8h, v10.8h\n"
+ "fmla v16.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v9.s }[0], [x28], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v9.h }[2], [x28], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (1, 0): Bit 1: Unset
+ "ld1 { v9.h }[0], [x28], #0x2\n"
+ "19:" // Oddments: Load input (1, 0): Bit 1: End
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ldr x27, [x16, #0x58]\n"
+ "fmla v27.8h, v0.8h, v9.8h\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v12.h }[2], [x27], #0x2\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (1, 5): Bit 1: Unset
+ "ld1 { v12.h }[0], [x27], #0x2\n"
+ "21:" // Oddments: Load input (1, 5): Bit 1: End
+ "fmla v28.8h, v5.8h, v12.8h\n"
+ "ldr x26, [x16, #0x60]\n"
+ "fmla v24.8h, v2.8h, v12.8h\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v11.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v11.h }[2], [x26], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v11.h }[0], [x26], #0x2\n"
+ "23:" // Oddments: Load input (4, 0): Bit 1: End
+ "fmla v23.8h, v6.8h, v11.8h\n"
+ "ldr x25, [x16, #0x68]\n"
+ "fmla v19.8h, v3.8h, v11.8h\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (1, 2): Bit 1: Unset
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "25:" // Oddments: Load input (1, 2): Bit 1: End
+ "fmla v31.8h, v5.8h, v10.8h\n"
+ "ldr x24, [x16, #0x70]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "add x24, x24, x14\n"
+ "fmla v29.8h, v3.8h, v10.8h\n"
+ "fmla v27.8h, v2.8h, v10.8h\n"
+ "fmla v26.8h, v1.8h, v10.8h\n"
+ "fmla v25.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 5): Bit 1: Unset
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "27:" // Oddments: Load input (4, 5): Bit 1: End
+ "fmla v20.8h, v8.8h, v11.8h\n"
+ "ldr x23, [x16, #0x78]\n"
+ "fmla v16.8h, v5.8h, v11.8h\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "29:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr x10, [x16, #0x80]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "add x10, x10, x14\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v26.8h, v2.8h, v12.8h\n"
+ "fmla v25.8h, v1.8h, v12.8h\n"
+ "fmla v24.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v11.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v11.h }[2], [x10], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (5, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x10], #0x2\n"
+ "31:" // Oddments: Load input (5, 1): Bit 1: End
+ "fmla v19.8h, v7.8h, v11.8h\n"
+ "ldr x9, [x16, #0x88]\n"
+ "fmla v18.8h, v6.8h, v11.8h\n"
+ "add x9, x9, x14\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v10.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v10.h }[2], [x9], #0x2\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v10.h }[0], [x9], #0x2\n"
+ "33:" // Oddments: Load input (2, 1): Bit 1: End
+ "fmla v31.8h, v7.8h, v10.8h\n"
+ "ldr x28, [x16, #0x90]\n"
+ "fmla v30.8h, v6.8h, v10.8h\n"
+ "add x28, x28, x14\n"
+ "fmla v27.8h, v4.8h, v10.8h\n"
+ "fmla v26.8h, v3.8h, v10.8h\n"
+ "fmla v23.8h, v1.8h, v10.8h\n"
+ "fmla v22.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v11.s }[0], [x28], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v11.h }[2], [x28], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (5, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x28], #0x2\n"
+ "35:" // Oddments: Load input (5, 4): Bit 1: End
+ "fmla v17.8h, v8.8h, v11.8h\n"
+ "ldr x27, [x16, #0x98]\n"
+ "fmla v16.8h, v7.8h, v11.8h\n"
+ "add x27, x27, x14\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v12.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v12.h }[2], [x27], #0x2\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v12.h }[0], [x27], #0x2\n"
+ "37:" // Oddments: Load input (2, 4): Bit 1: End
+ "fmla v29.8h, v8.8h, v12.8h\n"
+ "ldr x26, [x16, #0xa0]\n"
+ "fmla v28.8h, v7.8h, v12.8h\n"
+ "add x26, x26, x14\n"
+ "fmla v25.8h, v5.8h, v12.8h\n"
+ "fmla v24.8h, v4.8h, v12.8h\n"
+ "fmla v21.8h, v2.8h, v12.8h\n"
+ "fmla v20.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v10.h }[2], [x26], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (0, 2): Bit 1: Unset
+ "ld1 { v10.h }[0], [x26], #0x2\n"
+ "39:" // Oddments: Load input (0, 2): Bit 1: End
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr x25, [x16, #0xa8]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "add x25, x25, x14\n"
+ "fmla v29.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v11.h }[2], [x25], #0x2\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x25], #0x2\n"
+ "41:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v27.8h, v7.8h, v11.8h\n"
+ "ldr x24, [x16, #0xb0]\n"
+ "fmla v26.8h, v6.8h, v11.8h\n"
+ "add x24, x24, x14\n"
+ "fmla v23.8h, v4.8h, v11.8h\n"
+ "fmla v22.8h, v3.8h, v11.8h\n"
+ "fmla v19.8h, v1.8h, v11.8h\n"
+ "fmla v18.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v12.h }[2], [x24], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (0, 3): Bit 1: Unset
+ "ld1 { v12.h }[0], [x24], #0x2\n"
+ "43:" // Oddments: Load input (0, 3): Bit 1: End
+ "fmla v30.8h, v2.8h, v12.8h\n"
+ "ldr x23, [x16, #0xb8]\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "add x23, x23, x14\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v10.h }[2], [x23], #0x2\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v10.h }[0], [x23], #0x2\n"
+ "45:" // Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v10.8h\n"
+ "ldr x10, [x16, #0xc0]\n"
+ "fmla v27.8h, v3.8h, v10.8h\n"
+ "add x10, x10, x14\n"
+ "fmla v23.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[2], [x10], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x10], #0x2\n"
+ "47:" // Oddments: Load input (3, 4): Bit 1: End
+ "fmla v25.8h, v8.8h, v11.8h\n"
+ "ldr x9, [x16, #0xc8]\n"
+ "fmla v24.8h, v7.8h, v11.8h\n"
+ "add x9, x9, x14\n"
+ "fmla v21.8h, v5.8h, v11.8h\n"
+ "fmla v20.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v2.8h, v11.8h\n"
+ "fmla v16.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 49f\n"
+ "ld1 { v12.h }[2], [x9], #0x2\n"
+ "b 49f\n"
+ "48:" // Oddments: Load input (2, 5): Bit 1: Unset
+ "ld1 { v12.h }[0], [x9], #0x2\n"
+ "49:" // Oddments: Load input (2, 5): Bit 1: End
+ "fmla v28.8h, v8.8h, v12.8h\n"
+ "ldr x28, [x16, #0xd0]\n"
+ "fmla v24.8h, v5.8h, v12.8h\n"
+ "add x28, x28, x14\n"
+ "fmla v20.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v10.s }[0], [x28], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v10.h }[2], [x28], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v10.h }[0], [x28], #0x2\n"
+ "51:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v27.8h, v6.8h, v10.8h\n"
+ "ldr x27, [x16, #0xd8]\n"
+ "fmla v23.8h, v3.8h, v10.8h\n"
+ "add x27, x27, x14\n"
+ "fmla v19.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 53f\n"
+ "ld1 { v11.h }[2], [x27], #0x2\n"
+ "b 53f\n"
+ "52:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v11.h }[0], [x27], #0x2\n"
+ "53:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v23.8h, v8.8h, v11.8h\n"
+ "ldr x26, [x16, #0xe0]\n"
+ "fmla v22.8h, v7.8h, v11.8h\n"
+ "add x26, x26, x14\n"
+ "fmla v21.8h, v6.8h, v11.8h\n"
+ "fmla v19.8h, v5.8h, v11.8h\n"
+ "fmla v18.8h, v4.8h, v11.8h\n"
+ "fmla v17.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[2], [x26], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (3, 5): Bit 1: Unset
+ "ld1 { v12.h }[0], [x26], #0x2\n"
+ "55:" // Oddments: Load input (3, 5): Bit 1: End
+ "fmla v24.8h, v8.8h, v12.8h\n"
+ "ldr x25, [x16, #0xe8]\n"
+ "fmla v20.8h, v5.8h, v12.8h\n"
+ "add x25, x25, x14\n"
+ "fmla v16.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 57f\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "b 57f\n"
+ "56:" // Oddments: Load input (5, 2): Bit 1: Unset
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "57:" // Oddments: Load input (5, 2): Bit 1: End
+ "fmla v19.8h, v8.8h, v10.8h\n"
+ "ldr x24, [x16, #0xf0]\n"
+ "fmla v18.8h, v7.8h, v10.8h\n"
+ "add x24, x24, x14\n"
+ "fmla v17.8h, v6.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 58f\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "59:" // Oddments: Load input (4, 3): Bit 1: End
+ "fmla v22.8h, v8.8h, v11.8h\n"
+ "ldr x23, [x16, #0xf8]\n"
+ "fmla v21.8h, v7.8h, v11.8h\n"
+ "add x23, x23, x14\n"
+ "fmla v20.8h, v6.8h, v11.8h\n"
+ "fmla v18.8h, v5.8h, v11.8h\n"
+ "fmla v17.8h, v4.8h, v11.8h\n"
+ "fmla v16.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 60f\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 61f\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "b 61f\n"
+ "60:" // Oddments: Load input (5, 3): Bit 1: Unset
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "61:" // Oddments: Load input (5, 3): Bit 1: End
+ "fmla v18.8h, v8.8h, v12.8h\n"
+ "ldr x10, [x16, #0x100]\n"
+ "fmla v17.8h, v7.8h, v12.8h\n"
+ "add x10, x10, x14\n"
+ "fmla v16.8h, v6.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 62f\n"
+ "ld1 { v10.s }[0], [x10], #0x4\n"
+ "tbz %x[n_channels], #0, 63f\n"
+ "ld1 { v10.h }[2], [x10], #0x2\n"
+ "b 63f\n"
+ "62:" // Oddments: Load input (1, 1): Bit 1: Unset
+ "ld1 { v10.h }[0], [x10], #0x2\n"
+ "63:" // Oddments: Load input (1, 1): Bit 1: End
+ "fmla v31.8h, v4.8h, v10.8h\n"
+ "ldr x9, [x16, #0x108]\n"
+ "fmla v30.8h, v3.8h, v10.8h\n"
+ "add x9, x9, x14\n"
+ "fmla v27.8h, v1.8h, v10.8h\n"
+ "fmla v26.8h, v0.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 64f\n"
+ "ld1 { v11.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 65f\n"
+ "ld1 { v11.h }[2], [x9], #0x2\n"
+ "b 65f\n"
+ "64:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x9], #0x2\n"
+ "65:" // Oddments: Load input (1, 4): Bit 1: End
+ "fmla v29.8h, v5.8h, v11.8h\n"
+ "ldr x28, [x16, #0x110]\n"
+ "fmla v28.8h, v4.8h, v11.8h\n"
+ "add x28, x28, x14\n"
+ "fmla v25.8h, v2.8h, v11.8h\n"
+ "fmla v24.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 66f\n"
+ "ld1 { v12.s }[0], [x28], #0x4\n"
+ "tbz %x[n_channels], #0, 67f\n"
+ "ld1 { v12.h }[2], [x28], #0x2\n"
+ "b 67f\n"
+ "66:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v12.h }[0], [x28], #0x2\n"
+ "67:" // Oddments: Load input (4, 1): Bit 1: End
+ "fmla v23.8h, v7.8h, v12.8h\n"
+ "ldr x27, [x16, #0x118]\n"
+ "fmla v22.8h, v6.8h, v12.8h\n"
+ "add x27, x27, x14\n"
+ "fmla v19.8h, v4.8h, v12.8h\n"
+ "fmla v18.8h, v3.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 68f\n"
+ "ld1 { v10.s }[0], [x27], #0x4\n"
+ "tbz %x[n_channels], #0, 69f\n"
+ "ld1 { v10.h }[2], [x27], #0x2\n"
+ "b 69f\n"
+ "68:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v10.h }[0], [x27], #0x2\n"
+ "69:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v21.8h, v8.8h, v10.8h\n"
+ "fmla v20.8h, v7.8h, v10.8h\n"
+ "fmla v17.8h, v5.8h, v10.8h\n"
+ "fmla v16.8h, v4.8h, v10.8h\n"
+ "fmax v31.8h, v31.8h, v15.8h\n"
+ "fmax v30.8h, v30.8h, v15.8h\n"
+ "fmax v29.8h, v29.8h, v15.8h\n"
+ "fmin v31.8h, v31.8h, v14.8h\n"
+ "fmin v30.8h, v30.8h, v14.8h\n"
+ "fmin v29.8h, v29.8h, v14.8h\n"
+ "fmax v28.8h, v28.8h, v15.8h\n"
+ "fmax v27.8h, v27.8h, v15.8h\n"
+ "fmax v26.8h, v26.8h, v15.8h\n"
+ "fmin v28.8h, v28.8h, v14.8h\n"
+ "fmin v27.8h, v27.8h, v14.8h\n"
+ "fmin v26.8h, v26.8h, v14.8h\n"
+ "fmax v25.8h, v25.8h, v15.8h\n"
+ "fmax v24.8h, v24.8h, v15.8h\n"
+ "fmax v23.8h, v23.8h, v15.8h\n"
+ "fmin v25.8h, v25.8h, v14.8h\n"
+ "fmin v24.8h, v24.8h, v14.8h\n"
+ "fmin v23.8h, v23.8h, v14.8h\n"
+ "fmax v22.8h, v22.8h, v15.8h\n"
+ "fmax v21.8h, v21.8h, v15.8h\n"
+ "fmax v20.8h, v20.8h, v15.8h\n"
+ "fmin v22.8h, v22.8h, v14.8h\n"
+ "fmin v21.8h, v21.8h, v14.8h\n"
+ "fmin v20.8h, v20.8h, v14.8h\n"
+ "fmax v19.8h, v19.8h, v15.8h\n"
+ "fmax v18.8h, v18.8h, v15.8h\n"
+ "fmax v17.8h, v17.8h, v15.8h\n"
+ "fmin v19.8h, v19.8h, v14.8h\n"
+ "fmin v18.8h, v18.8h, v14.8h\n"
+ "fmin v17.8h, v17.8h, v14.8h\n"
+ "fmax v16.8h, v16.8h, v15.8h\n"
+ "fmin v16.8h, v16.8h, v14.8h\n"
+ "tbz %x[n_channels], #1, 70f\n"
+ "ldr x22, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "add x22, x22, x12\n"
+ "ldr x20, [x17, #0x10]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.s }[0], [x22]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.s }[0], [x21]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.s }[0], [x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.s }[0], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.s }[0], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.s }[0], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.s }[0], [x19]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.s }[0], [x22]\n"
+ "ldr x20, [x17, #0x50]\n"
+ "add x20, x20, x12\n"
+ "st1 { v22.s }[0], [x21]\n"
+ "ldr x19, [x17, #0x58]\n"
+ "add x19, x19, x12\n"
+ "st1 { v21.s }[0], [x20]\n"
+ "ldr x22, [x17, #0x60]\n"
+ "add x22, x22, x12\n"
+ "st1 { v20.s }[0], [x19]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.s }[0], [x22]\n"
+ "ldr x20, [x17, #0x70]\n"
+ "add x20, x20, x12\n"
+ "st1 { v18.s }[0], [x21]\n"
+ "ldr x19, [x17, #0x78]\n"
+ "add x19, x19, x12\n"
+ "st1 { v17.s }[0], [x20]\n"
+ "add x12, x12, #0x4\n"
+ "st1 { v16.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 71f\n"
+ "ldr x22, [x17, #0x0]\n"
+ "ldr x21, [x17, #0x8]\n"
+ "add x22, x22, x12\n"
+ "ldr x20, [x17, #0x10]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.h }[2], [x22]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.h }[2], [x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.h }[2], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.h }[2], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.h }[2], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.h }[2], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.h }[2], [x19]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.h }[2], [x22]\n"
+ "ldr x20, [x17, #0x50]\n"
+ "add x20, x20, x12\n"
+ "st1 { v22.h }[2], [x21]\n"
+ "ldr x19, [x17, #0x58]\n"
+ "add x19, x19, x12\n"
+ "st1 { v21.h }[2], [x20]\n"
+ "ldr x22, [x17, #0x60]\n"
+ "add x22, x22, x12\n"
+ "st1 { v20.h }[2], [x19]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.h }[2], [x22]\n"
+ "ldr x20, [x17, #0x70]\n"
+ "add x20, x20, x12\n"
+ "st1 { v18.h }[2], [x21]\n"
+ "ldr x19, [x17, #0x78]\n"
+ "add x19, x19, x12\n"
+ "st1 { v17.h }[2], [x20]\n"
+ "st1 { v16.h }[2], [x19]\n"
+ "b 71f\n"
+ "70:" // Oddments: Store: Bit 1: Unset
+ "ldr x22, [x17, #0x0]\n"
+ "add x22, x22, x12\n"
+ "ldr x21, [x17, #0x8]\n"
+ "ldr x20, [x17, #0x10]\n"
+ "add x21, x21, x12\n"
+ "st1 { v31.h }[0], [x22]\n"
+ "ldr x19, [x17, #0x18]\n"
+ "add x20, x20, x12\n"
+ "st1 { v30.h }[0], [x21]\n"
+ "add x19, x19, x12\n"
+ "st1 { v29.h }[0], [x20]\n"
+ "ldr x22, [x17, #0x20]\n"
+ "add x22, x22, x12\n"
+ "st1 { v28.h }[0], [x19]\n"
+ "ldr x21, [x17, #0x28]\n"
+ "add x21, x21, x12\n"
+ "st1 { v27.h }[0], [x22]\n"
+ "ldr x20, [x17, #0x30]\n"
+ "add x20, x20, x12\n"
+ "st1 { v26.h }[0], [x21]\n"
+ "ldr x19, [x17, #0x38]\n"
+ "add x19, x19, x12\n"
+ "st1 { v25.h }[0], [x20]\n"
+ "ldr x22, [x17, #0x40]\n"
+ "add x22, x22, x12\n"
+ "st1 { v24.h }[0], [x19]\n"
+ "ldr x21, [x17, #0x48]\n"
+ "add x21, x21, x12\n"
+ "st1 { v23.h }[0], [x22]\n"
+ "ldr x20, [x17, #0x50]\n"
+ "add x20, x20, x12\n"
+ "st1 { v22.h }[0], [x21]\n"
+ "ldr x19, [x17, #0x58]\n"
+ "add x19, x19, x12\n"
+ "st1 { v21.h }[0], [x20]\n"
+ "ldr x22, [x17, #0x60]\n"
+ "add x22, x22, x12\n"
+ "st1 { v20.h }[0], [x19]\n"
+ "ldr x21, [x17, #0x68]\n"
+ "add x21, x21, x12\n"
+ "st1 { v19.h }[0], [x22]\n"
+ "ldr x20, [x17, #0x70]\n"
+ "add x20, x20, x12\n"
+ "st1 { v18.h }[0], [x21]\n"
+ "ldr x19, [x17, #0x78]\n"
+ "add x19, x19, x12\n"
+ "st1 { v17.h }[0], [x20]\n"
+ "st1 { v16.h }[0], [x19]\n"
+ "71:" // Oddments: Store: Bit 1: End
+
+ "72:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 000000000..ca367cc1a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
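+
+// Each kernel exposes two entry points: the "indirect" form takes a
+// pre-gathered array of input pointers (as in generic_indirect.cpp above),
+// while the "direct" form walks a dense NHWC tensor via the row/column
+// strides it is passed.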
+
+struct a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
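+  // input span = (output - 1) * stride + kernel = (2 - 1) * 2 + 3 = 5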
+ constexpr static unsigned int input_rows = 5;
+ constexpr static unsigned int input_cols = 5;
+
+ indirect_kern_type indirect_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+
+ a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 000000000..32a6fb964
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,616 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
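+  // Each pass stores tile_i/tile_j back into Args, then rebuilds the input
+  // and output base addresses as tile_i * row_stride + tile_j * col_stride,
+  // scaled by kernel_stride * output_size (4) for inputs and by the output
+  // tile size (2) for outputs, as the inline offset comments note.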
+ __asm__ __volatile__(
+ "mov x6, #0x0\n"
+ "mov x27, #0x0\n"
+ "1:" // Tile loop
+ "str x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x4\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x25, #0x2\n"
+ "ldr x7, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x8, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x22, #0x0\n"
+ "ldr x17, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x6, x23\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x27, x8, x19\n" // offset += tile_j * ld_input_col
+ "ldr x16, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+ "ldr x15, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x17, x17, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1r { v19.8h }, [x24]\n"
+ "add x14, x17, x23, LSL #1\n"
+ "ld1r { v18.8h }, [x21]\n"
+ "add x13, x14, x23, LSL #1\n"
+ "lsl x8, x8, #0x1\n"
+ "add x12, x13, x23, LSL #1\n"
+ "add x11, x12, x23, LSL #1\n"
+ "add x10, x8, x8\n"
+ "add x9, x10, x8\n"
+ "add x28, x9, x8\n"
+ "mul x19, x6, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x27, x16, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x25\n" // offset *= output_tile_size
+ "add x15, x15, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x27, x15, x20, LSL #1\n"
+ "lsl x16, x16, #0x1\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x3\n"
+ "cbz x19, 4f\n"
+ "ldr q17, [x7, #0x0]\n"
+ "ldr q0, [x7, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x7, #0x20]\n"
+ "ldr q2, [x7, #0x30]\n"
+ "ldr q3, [x7, #0x40]\n"
+ "ldr q4, [x7, #0x50]\n"
+ "ldr q5, [x7, #0x60]\n"
+ "ldr q6, [x7, #0x70]\n"
+ "ldr q7, [x7, #0x80]\n"
+ "ldr q8, [x7, #0x90]\n"
+ "add x7, x7, #0xa0\n"
+ "ldr q9, [x13, x10]\n"
+ "ld1 { v10.8h }, [x17]\n"
+ "ldr q11, [x17, x8]\n"
+ "ldr q12, [x17, x9]\n"
+ "ldr q13, [x17, x28]\n"
+ "ld1 { v14.8h }, [x14]\n"
+ "ldr q15, [x14, x8]\n"
+ "ldr q16, [x17, x10]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+ "add x22, x22, #0x10\n"
+ "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "add x17, x17, #0x10\n"
+ "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr q17, [x7, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ld1 { v10.8h }, [x17]\n"
+ "cmp x21, x19, LSL #4\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x14, x9]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x14, x10]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v3.8h, v14.8h\n"
+ "ld1 { v14.8h }, [x12]\n"
+ "fmla v30.8h, v0.8h, v16.8h\n"
+ "fmla v31.8h, v4.8h, v15.8h\n"
+ "ld1 { v15.8h }, [x13]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ldr q14, [x12, x28]\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x12, x8]\n"
+ "fmla v31.8h, v2.8h, v16.8h\n"
+ "ldr q16, [x13, x8]\n"
+ "fmla v29.8h, v0.8h, v15.8h\n"
+ "ldr q0, [x7, #0x10]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x13, x9]\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x13, x28]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "ldr q13, [x12, x9]\n"
+ "ldr q9, [x13, x10]\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "ld1 { v15.8h }, [x11]\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x11, x8]\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr q4, [x7, #0x50]\n"
+ "fmla v31.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x12, x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.8h, v6.8h, v15.8h\n"
+ "ldr q15, [x11, x10]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x17, x9]\n"
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "ldr q1, [x7, #0x20]\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "ldr q13, [x17, x28]\n"
+ "fmla v28.8h, v5.8h, v14.8h\n"
+ "ldr q14, [x11, x9]\n"
+ "fmax v30.8h, v30.8h, v19.8h\n"
+ "fmin v31.8h, v31.8h, v18.8h\n"
+ "st1 { v31.8h }, [x15]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "ldr q11, [x11, x28]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v30.8h, v30.8h, v18.8h\n"
+ "ldr q2, [x7, #0x30]\n"
+ "ldr q5, [x7, #0x60]\n"
+ "fmla v28.8h, v3.8h, v16.8h\n"
+ "ldr q16, [x17, x10]\n"
+ "fmla v29.8h, v8.8h, v15.8h\n"
+ "str q30, [x15, x16]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v28.8h, v7.8h, v14.8h\n"
+ "ld1 { v14.8h }, [x14]\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "ldr q3, [x7, #0x40]\n"
+ "ldr q7, [x7, #0x80]\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "st1 { v29.8h }, [x27]\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "ldr q15, [x14, x8]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x17, x8]\n"
+ "ldr q6, [x7, #0x70]\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "ldr q8, [x7, #0x90]\n"
+ "add x7, x7, #0xa0\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "str q28, [x27, x16]\n"
+ "add x27, x27, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x17, x17, #0x10\n"
+ "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x14, x28]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x14, x9]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x14, x10]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v31.8h, v3.8h, v14.8h\n"
+ "ld1 { v14.8h }, [x12]\n"
+ "fmla v30.8h, v0.8h, v16.8h\n"
+ "fmla v31.8h, v4.8h, v15.8h\n"
+ "ld1 { v15.8h }, [x13]\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x12, x8]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ldr q14, [x12, x28]\n"
+ "fmla v31.8h, v2.8h, v16.8h\n"
+ "ldr q16, [x13, x8]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x13, x9]\n"
+ "fmla v29.8h, v0.8h, v15.8h\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "ldr q13, [x12, x9]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x13, x28]\n"
+ "add x13, x13, #0x10\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "ld1 { v15.8h }, [x11]\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x11, x8]\n"
+ "fmla v31.8h, v7.8h, v16.8h\n"
+ "ldr q16, [x12, x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.8h, v6.8h, v15.8h\n"
+ "ldr q15, [x11, x10]\n"
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "fmax v30.8h, v30.8h, v19.8h\n"
+ "fmla v28.8h, v5.8h, v14.8h\n"
+ "ldr q14, [x11, x9]\n"
+ "fmin v31.8h, v31.8h, v18.8h\n"
+ "st1 { v31.8h }, [x15]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "ldr q11, [x11, x28]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v30.8h, v30.8h, v18.8h\n"
+ "str q30, [x15, x16]\n"
+ "fmla v28.8h, v3.8h, v16.8h\n"
+ "add x15, x15, #0x10\n"
+ "fmla v29.8h, v8.8h, v15.8h\n"
+ "fmla v28.8h, v7.8h, v14.8h\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "st1 { v29.8h }, [x27]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "str q28, [x27, x16]\n"
+ "add x27, x27, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 43f\n"
+ "ldr q17, [x7, #0x0]\n"
+ "ldr q0, [x7, #0x10]\n"
+ "add x26, x13, x10\n"
+ "ldr q1, [x7, #0x20]\n"
+ "add x25, x17, XZR\n"
+ "ldr q2, [x7, #0x30]\n"
+ "add x24, x17, x8\n"
+ "ldr q3, [x7, #0x40]\n"
+ "add x23, x17, x9\n"
+ "ldr q4, [x7, #0x50]\n"
+ "add x22, x17, x28\n"
+ "ldr q5, [x7, #0x60]\n"
+ "add x21, x14, XZR\n"
+ "ldr q6, [x7, #0x70]\n"
+ "add x20, x14, x8\n"
+ "ldr q7, [x7, #0x80]\n"
+ "add x19, x17, x10\n"
+ "ldr q8, [x7, #0x90]\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr s9, [x26], #0x4\n"
+ "ldr s10, [x25], #0x4\n"
+ "ldr s11, [x24], #0x4\n"
+ "ldr s12, [x23], #0x4\n"
+ "ldr s13, [x22], #0x4\n"
+ "ldr s14, [x21], #0x4\n"
+ "ldr s15, [x20], #0x4\n"
+ "ldr s16, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v9.h }[2], [x26]\n"
+ "ld1 { v10.h }[2], [x25]\n"
+ "ld1 { v11.h }[2], [x24]\n"
+ "ld1 { v12.h }[2], [x23]\n"
+ "ld1 { v13.h }[2], [x22]\n"
+ "ld1 { v14.h }[2], [x21]\n"
+ "ld1 { v15.h }[2], [x20]\n"
+ "ld1 { v16.h }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+ "ldr h9, [x26, #0x0]\n"
+ "ldr h10, [x25, #0x0]\n"
+ "ldr h11, [x24, #0x0]\n"
+ "ldr h12, [x23, #0x0]\n"
+ "ldr h13, [x22, #0x0]\n"
+ "ldr h14, [x21, #0x0]\n"
+ "ldr h15, [x20, #0x0]\n"
+ "ldr h16, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+ "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "add x19, x14, x9\n"
+ "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v31.8h, v3.8h, v14.8h\n"
+ "fmla v30.8h, v0.8h, v16.8h\n"
+ "fmla v31.8h, v4.8h, v15.8h\n"
+ "fmla v31.8h, v2.8h, v16.8h\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "add x19, x14, x28\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "add x19, x14, x10\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (1, 2): Bit 1: End
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "add x19, x12, XZR\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s14, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v14.h }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr h14, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "add x19, x13, XZR\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s15, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v15.h }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: Unset
+ "ldr h15, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "add x19, x12, x8\n"
+ "fmla v29.8h, v0.8h, v15.8h\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "add x19, x13, x8\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s16, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v16.h }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr h16, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v31.8h, v7.8h, v16.8h\n"
+ "add x19, x12, x9\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "add x19, x13, x9\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "add x19, x12, x28\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr s14, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v14.h }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr h14, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v28.8h, v5.8h, v14.8h\n"
+ "add x19, x11, XZR\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s15, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v15.h }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr h15, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v29.8h, v6.8h, v15.8h\n"
+ "add x19, x13, x28\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "add x19, x11, x8\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "add x19, x12, x10\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr s16, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v16.h }[2], [x19]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr h16, [x19, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "add x19, x11, x9\n"
+ "fmla v28.8h, v3.8h, v16.8h\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s14, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v14.h }[2], [x19]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr h14, [x19, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v28.8h, v7.8h, v14.8h\n"
+ "add x19, x11, x10\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr s15, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v15.h }[2], [x19]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr h15, [x19, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v29.8h, v8.8h, v15.8h\n"
+ "add x19, x11, x28\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "fmax v30.8h, v30.8h, v19.8h\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "fmin v31.8h, v31.8h, v18.8h\n"
+ "fmin v30.8h, v30.8h, v18.8h\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "mov x19, x15\n"
+ "st1 { v31.s }[0], [x19], x16\n"
+ "add x15, x15, #0x4\n"
+ "st1 { v30.s }[0], [x19]\n"
+ "mov x19, x27\n"
+ "st1 { v29.s }[0], [x19], x16\n"
+ "add x27, x27, #0x4\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "mov x20, x15\n"
+ "st1 { v31.h }[2], [x20], x16\n"
+ "mov x19, x27\n"
+ "st1 { v30.h }[2], [x20]\n"
+ "st1 { v29.h }[2], [x19], x16\n"
+ "st1 { v28.h }[2], [x19]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x20, x15\n"
+ "st1 { v31.h }[0], [x20], x16\n"
+ "mov x19, x27\n"
+ "st1 { v30.h }[0], [x20]\n"
+ "st1 { v29.h }[0], [x19], x16\n"
+ "st1 { v28.h }[0], [x19]\n"
+ "42:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "43:" // Tile loop: End
+ "ldr x6, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x6, #0x1\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x27, x27, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x19\n"
+ "csel x27, x27, XZR, LT\n"
+ "csel x6, x6, x21, LT\n"
+ "cmp x6, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
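
The tile-loop epilogue above (label 43 onward) walks the output tiles in row-major order: tile_j advances along the row and wraps back to zero via the csel pair, at which point tile_i moves to the next row, and the loop repeats while tile_i < n_tile_rows. A minimal C++ sketch of that traversal, illustrative only, with process_tile as a hypothetical stand-in for the per-tile assembly body between labels 1 and 43:

    #include <cstdint>

    void process_tile(uint64_t tile_i, uint64_t tile_j);  // hypothetical stand-in for the asm tile body

    // Illustrative only: the traversal implied by the csel/cmp/blt epilogue above.
    void run_tiles(uint64_t n_tile_rows, uint64_t n_tile_cols)
    {
        for (uint64_t tile_i = 0; tile_i < n_tile_rows; ++tile_i)
        {
            for (uint64_t tile_j = 0; tile_j < n_tile_cols; ++tile_j)
            {
                process_tile(tile_i, tile_j);
            }
        }
    }
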
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 000000000..f071e2197
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[25];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[12];
+ inptrs[1] = input_ptrs[0];
+ inptrs[2] = input_ptrs[1];
+ inptrs[3] = input_ptrs[3];
+ inptrs[4] = input_ptrs[4];
+ inptrs[5] = input_ptrs[5];
+ inptrs[6] = input_ptrs[6];
+ inptrs[7] = input_ptrs[2];
+ inptrs[8] = input_ptrs[8];
+ inptrs[9] = input_ptrs[9];
+ inptrs[10] = input_ptrs[7];
+ inptrs[11] = input_ptrs[15];
+ inptrs[12] = input_ptrs[10];
+ inptrs[13] = input_ptrs[16];
+ inptrs[14] = input_ptrs[11];
+ inptrs[15] = input_ptrs[18];
+ inptrs[16] = input_ptrs[13];
+ inptrs[17] = input_ptrs[19];
+ inptrs[18] = input_ptrs[20];
+ inptrs[19] = input_ptrs[14];
+ inptrs[20] = input_ptrs[21];
+ inptrs[21] = input_ptrs[17];
+ inptrs[22] = input_ptrs[23];
+ inptrs[23] = input_ptrs[22];
+ inptrs[24] = input_ptrs[24];
+
+ }
+ };
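+  // Note: input_ptrs indexes the input patch row-major (index = row * 5 + col
+  // for the 5x5 patch this stride-2 3x3 kernel reads), so input_ptrs[12] is
+  // the centre point (2, 2). The constructor permutes the pointers into the
+  // order the assembly consumes them; compare the "(2, 2), (0, 0), (0, 1),
+  // ..." load labels in the oddments section below.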
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v19.8h }, [x20]\n"
+ "ld1r { v18.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x21, #0x0]\n"
+ "mov x11, #0x10\n" // cntb _, ALL, #1
+ "ldp x10, x9, [x21, #0x10]\n"
+ "sub x28, XZR, x11\n"
+ "lsr x27, %x[n_channels], #0x3\n"
+ "cbz x27, 3f\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr q10, [x25, x14]\n"
+ "ldr q11, [x24, x14]\n"
+ "ldr q12, [x23, x14]\n"
+ "ldr q13, [x22, x14]\n"
+ "ldr q14, [x21, x14]\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "ldr q15, [x20, x14]\n"
+ "ldr q16, [x19, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+ "ldr x25, [x16, #0x48]\n"
+ "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x25, x14]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x24, x14]\n"
+ "fmla v31.8h, v3.8h, v14.8h\n"
+ "ldr q14, [x23, x14]\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v30.8h, v0.8h, v16.8h\n"
+ "ldr x26, [x16, #0x80]\n"
+ "fmla v31.8h, v4.8h, v15.8h\n"
+ "ldr q15, [x22, x14]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr x24, [x16, #0x90]\n"
+ "fmla v31.8h, v2.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "fmla v29.8h, v0.8h, v15.8h\n"
+ "ldr q14, [x25, x14]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x26, x14]\n"
+ "ldr x23, [x16, #0x98]\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x23, x14]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "ldr q13, [x19, x14]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "ldr q15, [x24, x14]\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x22, x14]\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v31.8h, v7.8h, v16.8h\n"
+ "fmla v29.8h, v6.8h, v15.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "ldr q15, [x19, x14]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "ldr q1, [x15, #0x20]\n"
+ "fmax v30.8h, v30.8h, v19.8h\n"
+ "ldr q4, [x15, #0x50]\n"
+ "fmla v28.8h, v5.8h, v14.8h\n"
+ "ldr q14, [x20, x14]\n"
+ "fmin v31.8h, v31.8h, v18.8h\n"
+ "str q31, [x13, x28]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmin v30.8h, v30.8h, v18.8h\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "fmla v28.8h, v3.8h, v16.8h\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "fmla v29.8h, v8.8h, v15.8h\n"
+ "ldr q9, [x26, x11]\n"
+ "ldr q10, [x25, x11]\n"
+ "fmla v28.8h, v7.8h, v14.8h\n"
+ "ldr q12, [x23, x11]\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "ldr q13, [x22, x11]\n"
+ "ldr q14, [x21, x11]\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "str q30, [x12, x28]\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "ldr q2, [x15, #0x30]\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "ldr q11, [x24, x11]\n"
+ "ldr q15, [x20, x11]\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "ldr q16, [x19, x11]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "str q29, [x10, x28]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "str q28, [x9, x28]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "add x15, x15, #0xa0\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+ "ldr x25, [x16, #0x48]\n"
+ "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x25, x14]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x24, x14]\n"
+ "fmla v31.8h, v3.8h, v14.8h\n"
+ "ldr q14, [x23, x14]\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v30.8h, v0.8h, v16.8h\n"
+ "ldr x26, [x16, #0x80]\n"
+ "fmla v31.8h, v4.8h, v15.8h\n"
+ "ldr q15, [x22, x14]\n"
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr x24, [x16, #0x90]\n"
+ "fmla v31.8h, v2.8h, v16.8h\n"
+ "ldr q16, [x20, x14]\n"
+ "fmla v29.8h, v0.8h, v15.8h\n"
+ "ldr q14, [x25, x14]\n"
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr q12, [x26, x14]\n"
+ "ldr x23, [x16, #0x98]\n"
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "ldr q11, [x23, x14]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "ldr q13, [x19, x14]\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "ldr q15, [x24, x14]\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x22, x14]\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v31.8h, v7.8h, v16.8h\n"
+ "fmla v29.8h, v6.8h, v15.8h\n"
+ "ldr q16, [x21, x14]\n"
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "ldr q15, [x19, x14]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "fmax v30.8h, v30.8h, v19.8h\n"
+ "fmla v28.8h, v5.8h, v14.8h\n"
+ "ldr q14, [x20, x14]\n"
+ "fmin v31.8h, v31.8h, v18.8h\n"
+ "str q31, [x13, x28]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "ldr q11, [x26, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmin v30.8h, v30.8h, v18.8h\n"
+ "str q30, [x12, x28]\n"
+ "fmla v28.8h, v3.8h, v16.8h\n"
+ "fmla v29.8h, v8.8h, v15.8h\n"
+ "fmla v28.8h, v7.8h, v14.8h\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "str q29, [x10, x28]\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "str q28, [x9, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 42f\n"
+ "ldr q17, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x28, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "add x13, x13, x28\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x12, x12, x28\n"
+ "ldr q3, [x15, #0x40]\n"
+ "add x10, x10, x28\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x9, x9, x28\n"
+ "ldr q5, [x15, #0x60]\n"
+ "ldr q6, [x15, #0x70]\n"
+ "ldr q7, [x15, #0x80]\n"
+ "ldr q8, [x15, #0x90]\n"
+ "ldr x26, [x16, #0x0]\n"
+ "ldr x25, [x16, #0x8]\n"
+ "ldr x24, [x16, #0x10]\n"
+ "add x26, x26, x14\n"
+ "ldr x23, [x16, #0x18]\n"
+ "add x25, x25, x14\n"
+ "ldr x22, [x16, #0x20]\n"
+ "add x24, x24, x14\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x23, x23, x14\n"
+ "ldr x20, [x16, #0x30]\n"
+ "add x22, x22, x14\n"
+ "ldr x19, [x16, #0x38]\n"
+ "add x21, x21, x14\n"
+ "add x20, x20, x14\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "ld1 { v10.s }[0], [x25], #0x4\n"
+ "ld1 { v11.s }[0], [x24], #0x4\n"
+ "ld1 { v12.s }[0], [x23], #0x4\n"
+ "ld1 { v13.s }[0], [x22], #0x4\n"
+ "ld1 { v14.s }[0], [x21], #0x4\n"
+ "ld1 { v15.s }[0], [x20], #0x4\n"
+ "ld1 { v16.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v9.h }[2], [x26], #0x2\n"
+ "ld1 { v10.h }[2], [x25], #0x2\n"
+ "ld1 { v11.h }[2], [x24], #0x2\n"
+ "ld1 { v12.h }[2], [x23], #0x2\n"
+ "ld1 { v13.h }[2], [x22], #0x2\n"
+ "ld1 { v14.h }[2], [x21], #0x2\n"
+ "ld1 { v15.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x19], #0x2\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: Unset
+ "ld1 { v9.h }[0], [x26], #0x2\n"
+ "ld1 { v10.h }[0], [x25], #0x2\n"
+ "ld1 { v11.h }[0], [x24], #0x2\n"
+ "ld1 { v12.h }[0], [x23], #0x2\n"
+ "ld1 { v13.h }[0], [x22], #0x2\n"
+ "ld1 { v14.h }[0], [x21], #0x2\n"
+ "ld1 { v15.h }[0], [x20], #0x2\n"
+ "ld1 { v16.h }[0], [x19], #0x2\n"
+ "5:" // Oddments: Load inputs (2, 2), (0, 0), (0, 1), (0, 3), (0, 4), (1, 0), (1, 1), (0, 2): Bit 1: End
+ "mov v31.16b, v17.16b\n fmla v31.8h, v8.8h, v9.8h\n"
+ "ldr x26, [x16, #0x40]\n"
+ "add x26, x26, x14\n"
+ "mov v30.16b, v17.16b\n fmla v30.8h, v6.8h, v9.8h\n"
+ "mov v29.16b, v17.16b\n fmla v29.8h, v2.8h, v9.8h\n"
+ "mov v28.16b, v17.16b\n fmla v28.8h, v0.8h, v9.8h\n"
+ "fmla v31.8h, v0.8h, v10.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "fmla v30.8h, v2.8h, v13.8h\n"
+ "fmla v31.8h, v3.8h, v14.8h\n"
+ "fmla v30.8h, v0.8h, v16.8h\n"
+ "fmla v31.8h, v4.8h, v15.8h\n"
+ "fmla v31.8h, v2.8h, v16.8h\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v11.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v11.h }[2], [x26], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v11.h }[0], [x26], #0x2\n"
+ "7:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v30.8h, v4.8h, v11.8h\n"
+ "ldr x25, [x16, #0x48]\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v12.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v12.h }[2], [x25], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v12.h }[0], [x25], #0x2\n"
+ "9:" // Oddments: Load input (1, 4): Bit 1: End
+ "fmla v30.8h, v5.8h, v12.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "add x24, x24, x14\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v13.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v13.h }[2], [x24], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (1, 2): Bit 1: Unset
+ "ld1 { v13.h }[0], [x24], #0x2\n"
+ "11:" // Oddments: Load input (1, 2): Bit 1: End
+ "fmla v31.8h, v5.8h, v13.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v14.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v14.h }[2], [x23], #0x2\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v14.h }[0], [x23], #0x2\n"
+ "13:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v29.8h, v3.8h, v14.8h\n"
+ "ldr x22, [x16, #0x60]\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v15.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v15.h }[2], [x22], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 0): Bit 1: Unset
+ "ld1 { v15.h }[0], [x22], #0x2\n"
+ "15:" // Oddments: Load input (2, 0): Bit 1: End
+ "fmla v31.8h, v6.8h, v15.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v29.8h, v0.8h, v15.8h\n"
+ "add x21, x21, x14\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v11.h }[2], [x21], #0x2\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x21], #0x2\n"
+ "17:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v29.8h, v4.8h, v11.8h\n"
+ "ldr x20, [x16, #0x70]\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v16.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v16.h }[0], [x20], #0x2\n"
+ "19:" // Oddments: Load input (2, 1): Bit 1: End
+ "fmla v31.8h, v7.8h, v16.8h\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v29.8h, v1.8h, v16.8h\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v13.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v13.h }[2], [x19], #0x2\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v13.h }[0], [x19], #0x2\n"
+ "21:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v28.8h, v4.8h, v13.8h\n"
+ "ldr x26, [x16, #0x80]\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v12.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v12.h }[2], [x26], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v12.h }[0], [x26], #0x2\n"
+ "23:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v30.8h, v7.8h, v12.8h\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v14.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v14.h }[2], [x25], #0x2\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v14.h }[0], [x25], #0x2\n"
+ "25:" // Oddments: Load input (3, 4): Bit 1: End
+ "fmla v28.8h, v5.8h, v14.8h\n"
+ "ldr x24, [x16, #0x90]\n"
+ "add x24, x24, x14\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v15.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v15.h }[2], [x24], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v15.h }[0], [x24], #0x2\n"
+ "27:" // Oddments: Load input (4, 0): Bit 1: End
+ "fmla v29.8h, v6.8h, v15.8h\n"
+ "ldr x23, [x16, #0x98]\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v11.h }[2], [x23], #0x2\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x23], #0x2\n"
+ "29:" // Oddments: Load input (2, 4): Bit 1: End
+ "fmla v30.8h, v8.8h, v11.8h\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v13.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v13.h }[2], [x22], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.h }[0], [x22], #0x2\n"
+ "31:" // Oddments: Load input (4, 1): Bit 1: End
+ "fmla v29.8h, v7.8h, v13.8h\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "add x21, x21, x14\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v16.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v16.h }[2], [x21], #0x2\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v16.h }[0], [x21], #0x2\n"
+ "33:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v29.8h, v5.8h, v16.8h\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v28.8h, v3.8h, v16.8h\n"
+ "add x20, x20, x14\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v14.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v14.h }[2], [x20], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v14.h }[0], [x20], #0x2\n"
+ "35:" // Oddments: Load input (4, 3): Bit 1: End
+ "fmla v28.8h, v7.8h, v14.8h\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v15.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v15.h }[2], [x19], #0x2\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v15.h }[0], [x19], #0x2\n"
+ "37:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v29.8h, v8.8h, v15.8h\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v28.8h, v6.8h, v15.8h\n"
+ "add x26, x26, x14\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v11.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v11.h }[2], [x26], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v11.h }[0], [x26], #0x2\n"
+ "39:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v28.8h, v8.8h, v11.8h\n"
+ "fmax v31.8h, v31.8h, v19.8h\n"
+ "fmax v30.8h, v30.8h, v19.8h\n"
+ "fmax v29.8h, v29.8h, v19.8h\n"
+ "fmin v31.8h, v31.8h, v18.8h\n"
+ "fmin v30.8h, v30.8h, v18.8h\n"
+ "fmin v29.8h, v29.8h, v18.8h\n"
+ "fmax v28.8h, v28.8h, v19.8h\n"
+ "fmin v28.8h, v28.8h, v18.8h\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "st1 { v31.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x10], #0x4\n"
+ "st1 { v28.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "st1 { v31.h }[2], [x13], #0x2\n"
+ "st1 { v30.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x10], #0x2\n"
+ "st1 { v28.h }[2], [x9], #0x2\n"
+ "b 41f\n"
+ "40:" // Oddments: Store: Bit 1: Unset
+ "st1 { v31.h }[0], [x13], #0x2\n"
+ "st1 { v30.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x10], #0x2\n"
+ "st1 { v28.h }[0], [x9], #0x2\n"
+ "41:" // Oddments: Store: Bit 1: End
+
+ "42:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
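
This indirect variant receives its input as an array of per-point pointers: the 25 entries of inptrs cover the 5x5 patch that a 3x3 kernel with stride 2 and a 2x2 output tile must read, since each spatial dimension needs (output - 1) * stride + kernel input points. A small illustrative check of that relation (not part of the library):

    #include <cstddef>

    // Input extent required along one dimension by a depthwise kernel.
    constexpr std::size_t patch_extent(std::size_t out, std::size_t stride, std::size_t kernel)
    {
        return (out - 1) * stride + kernel;
    }

    static_assert(patch_extent(2, 2, 3) == 5, "3x3/s2 with a 2x2 tile reads a 5x5 patch");
    static_assert(patch_extent(2, 2, 3) * patch_extent(2, 2, 3) == 25, "hence 25 input pointers");
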
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 000000000..53d2a3a8e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
+ typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ constexpr static unsigned int output_rows = 2;
+ constexpr static unsigned int output_cols = 2;
+
+ constexpr static unsigned int input_rows = 6;
+ constexpr static unsigned int input_cols = 6;
+
+ indirect_kern_type indirect_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ direct_kern_type direct_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+
+ a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
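
The input_rows/input_cols fields of the descriptor above follow the same geometry: a 5x5 kernel at stride 1 producing a 2x2 tile consumes a 6x6 input patch. As an illustrative check (not library code):

    static_assert((2 - 1) * 1 + 5 == 6, "5x5/s1 with a 2x2 output tile reads a 6x6 patch");
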
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
new file mode 100644
index 000000000..ec5f97ab6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_direct.cpp
@@ -0,0 +1,973 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ const uint64_t n_tile_rows, n_tile_cols;
+ const __fp16 *inptr;
+ const uint64_t ld_input_row;
+ const uint64_t ld_input_col;
+ __fp16 *outptr;
+ const uint64_t ld_output_row;
+ const uint64_t ld_output_col;
+ const void *params;
+ const __fp16 min, max;
+
+ uint64_t tile_i = 0, tile_j = 0;
+
+ Args(
+ const unsigned int n_tile_rows,
+ const unsigned int n_tile_cols,
+ const __fp16 *inptr,
+ int64_t ld_input_row,
+ int64_t ld_input_col,
+ __fp16 *outptr,
+ int64_t ld_output_row,
+ int64_t ld_output_col,
+ const void *params,
+      const __fp16 activation_min,
+      const __fp16 activation_max
+ ) : n_tile_rows(n_tile_rows), n_tile_cols(n_tile_cols), inptr(inptr),
+ ld_input_row(ld_input_row), ld_input_col(ld_input_col), outptr(outptr),
+ ld_output_row(ld_output_row), ld_output_col(ld_output_col),
+ params(params), min(activation_min), max(activation_max)
+ {
+ }
+ };
+
+ Args params_struct(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_input_row, ld_input_col,
+ outptr, ld_output_row, ld_output_col,
+ params, activation_min, activation_max
+ );
+
+ __asm__ __volatile__(
+ "mov x28, #0x0\n"
+ "mov x27, #0x0\n"
+ "1:" // Tile loop
+ "str x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "mov x26, #0x2\n"
+ "str x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "mov x25, #0x2\n"
+ "ldr x3, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x24, %x[params_struct], %[offsetof_args_min]\n"
+ "ldr x23, [%x[params_struct], %[offsetof_args_ld_input_row]]\n"
+ "add x21, %x[params_struct], %[offsetof_args_max]\n"
+ "ldr x4, [%x[params_struct], %[offsetof_args_ld_input_col]]\n"
+ "mov x22, #0x0\n"
+ "ldr x5, [%x[params_struct], %[offsetof_args_inptr]]\n"
+ "mul x19, x28, x23\n" // offset = tile_i * ld_input_row
+ "ldr x20, [%x[params_struct], %[offsetof_args_ld_output_row]]\n"
+ "madd x19, x27, x4, x19\n" // offset += tile_j * ld_input_col
+ "ldr x6, [%x[params_struct], %[offsetof_args_ld_output_col]]\n"
+ "mul x19, x19, x26\n" // offset *= kernel_stride * output_size
+ "ldr x7, [%x[params_struct], %[offsetof_args_outptr]]\n"
+ "add x5, x5, x19, LSL #1\n" // inptr[0] += offset * sizeof(__fp16)
+ "ld1r { v18.8h }, [x24]\n"
+ "add x8, x5, x23, LSL #1\n"
+ "ld1r { v17.8h }, [x21]\n"
+ "add x17, x8, x23, LSL #1\n"
+ "lsl x4, x4, #0x1\n"
+ "add x16, x17, x23, LSL #1\n"
+ "add x15, x16, x23, LSL #1\n"
+ "add x14, x15, x23, LSL #1\n"
+ "add x13, x4, x4\n"
+ "add x12, x13, x4\n"
+ "add x11, x12, x4\n"
+ "add x10, x11, x4\n"
+ "mul x19, x28, x20\n" // offset = tile_i * ld_output_row
+ "madd x19, x27, x6, x19\n" // offset += tile_j * ld_output_col
+ "mul x19, x19, x25\n" // offset *= output_tile_size
+ "add x7, x7, x19, LSL #1\n" // outptrs[0] += offset * sizeof(__fp16)
+ "add x9, x7, x20, LSL #1\n"
+ "lsl x6, x6, #0x1\n"
+ "mov x21, #0x10\n" // cntb _, ALL, #1
+ "sub x20, XZR, x21\n"
+ "lsr x19, %x[n_channels], #0x3\n"
+ "cbz x19, 4f\n"
+ "ldr q16, [x3, #0x0]\n"
+ "ldr q0, [x3, #0x10]\n"
+ "cmp x21, x19, LSL #4\n"
+ "ldr q1, [x3, #0x20]\n"
+ "ldr q2, [x3, #0x30]\n"
+ "ldr q3, [x3, #0x40]\n"
+ "ldr q4, [x3, #0x50]\n"
+ "add x3, x3, #0x60\n"
+ "ld1 { v5.8h }, [x5]\n"
+ "ldr q6, [x5, x4]\n"
+ "ld1 { v7.8h }, [x8]\n"
+ "ldr q8, [x8, x4]\n"
+ "ldr q9, [x5, x13]\n"
+ "ldr q13, [x8, x13]\n"
+ "ldr q11, [x5, x12]\n"
+ "ldr q12, [x5, x11]\n"
+ "ldr q10, [x8, x10]\n"
+ "ld1 { v14.8h }, [x17]\n"
+ "bge 3f\n"
+ "2:" // Tile loop: Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x8, x12]\n"
+ "add x20, x20, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+ "add x22, x22, #0x10\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "add x21, x21, #0x10\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q0, [x3, #0x0]\n"
+ "cmp x21, x19, LSL #4\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x8, x11]\n"
+ "add x8, x8, #0x10\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "ldr q16, [x3, #0x140]\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q1, [x3, #0x10]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x5, x10]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "ldr q2, [x3, #0x20]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x17, x4]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v5.8h\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "ldr q3, [x3, #0x30]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x17, x13]\n"
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "ldr q9, [x17, x12]\n"
+ "fmla v29.8h, v4.8h, v6.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x3, #0x40]\n"
+ "fmla v31.8h, v0.8h, v7.8h\n"
+ "ld1 { v7.8h }, [x8]\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "fmla v29.8h, v0.8h, v14.8h\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q0, [x3, #0x50]\n"
+ "fmla v31.8h, v1.8h, v8.8h\n"
+ "ldr q8, [x17, x10]\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x3, #0x60]\n"
+ "fmla v31.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x17, x11]\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr q2, [x3, #0x70]\n"
+ "fmla v31.8h, v3.8h, v5.8h\n"
+ "ld1 { v5.8h }, [x16]\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q3, [x3, #0x80]\n"
+ "fmla v31.8h, v4.8h, v6.8h\n"
+ "ldr q6, [x16, x4]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "ldr q10, [x16, x13]\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr q4, [x3, #0x90]\n"
+ "fmla v31.8h, v0.8h, v14.8h\n"
+ "ldr q14, [x16, x10]\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v5.8h\n"
+ "fmla v28.8h, v0.8h, v6.8h\n"
+ "ldr q0, [x3, #0xa0]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x16, x12]\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v10.8h\n"
+ "ldr q1, [x3, #0xb0]\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x16, x11]\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x3, #0xc0]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x3, #0xd0]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x15, x4]\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "ldr q8, [x15, x11]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v14.8h\n"
+ "ldr q4, [x3, #0xe0]\n"
+ "fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x15, x13]\n"
+ "fmla v30.8h, v0.8h, v6.8h\n"
+ "fmla v29.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "ldr q0, [x3, #0xf0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x15, x12]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v5.8h\n"
+ "ldr q1, [x3, #0x100]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x15, x10]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v6.8h\n"
+ "ldr q2, [x3, #0x110]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x14]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v8.8h\n"
+ "ldr q3, [x3, #0x120]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x14, x4]\n"
+ "fmla v30.8h, v4.8h, v14.8h\n"
+ "ld1 { v14.8h }, [x17]\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x3, #0x130]\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x14, x13]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x14, x12]\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr q0, [x3, #0x150]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "ldr q13, [x8, x13]\n"
+ "fmla v30.8h, v1.8h, v5.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x14, x11]\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr q1, [x3, #0x160]\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "ld1 { v5.8h }, [x5]\n"
+ "fmla v30.8h, v2.8h, v6.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x14, x10]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x3, #0x170]\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "ldr q6, [x5, x4]\n"
+ "fmla v30.8h, v3.8h, v8.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x5, x12]\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x3, #0x180]\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "ldr q8, [x8, x4]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "ldr q10, [x8, x10]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x5, x11]\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "ldr q9, [x5, x13]\n"
+ "ldr q4, [x3, #0x190]\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "add x3, x3, #0x1a0\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "st1 { v31.8h }, [x7]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "str q30, [x7, x6]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "add x7, x7, #0x10\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "st1 { v29.8h }, [x9]\n"
+ "str q28, [x9, x6]\n"
+ "add x9, x9, #0x10\n"
+ "blt 2b\n"
+ "3:" // Tile loop: Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x8, x12]\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q0, [x3, #0x0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x8, x11]\n"
+ "add x8, x8, #0x10\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q1, [x3, #0x10]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x5, x10]\n"
+ "add x5, x5, #0x10\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "ldr q2, [x3, #0x20]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x17, x4]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v5.8h\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "ldr q3, [x3, #0x30]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x17, x13]\n"
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "ldr q9, [x17, x12]\n"
+ "fmla v29.8h, v4.8h, v6.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x3, #0x40]\n"
+ "fmla v31.8h, v0.8h, v7.8h\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "fmla v29.8h, v0.8h, v14.8h\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q0, [x3, #0x50]\n"
+ "fmla v31.8h, v1.8h, v8.8h\n"
+ "ldr q8, [x17, x10]\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x3, #0x60]\n"
+ "fmla v31.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x17, x11]\n"
+ "add x17, x17, #0x10\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr q2, [x3, #0x70]\n"
+ "fmla v31.8h, v3.8h, v5.8h\n"
+ "ld1 { v5.8h }, [x16]\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q3, [x3, #0x80]\n"
+ "fmla v31.8h, v4.8h, v6.8h\n"
+ "ldr q6, [x16, x4]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "ldr q10, [x16, x13]\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr q4, [x3, #0x90]\n"
+ "fmla v31.8h, v0.8h, v14.8h\n"
+ "ldr q14, [x16, x10]\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v5.8h\n"
+ "fmla v28.8h, v0.8h, v6.8h\n"
+ "ldr q0, [x3, #0xa0]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x16, x12]\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v10.8h\n"
+ "ldr q1, [x3, #0xb0]\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x16, x11]\n"
+ "add x16, x16, #0x10\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x3, #0xc0]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ld1 { v9.8h }, [x15]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x3, #0xd0]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x15, x4]\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "ldr q8, [x15, x11]\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v14.8h\n"
+ "ldr q4, [x3, #0xe0]\n"
+ "fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x15, x13]\n"
+ "fmla v30.8h, v0.8h, v6.8h\n"
+ "fmla v29.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "ldr q0, [x3, #0xf0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x15, x12]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v5.8h\n"
+ "ldr q1, [x3, #0x100]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x15, x10]\n"
+ "add x15, x15, #0x10\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v6.8h\n"
+ "ldr q2, [x3, #0x110]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ld1 { v11.8h }, [x14]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v8.8h\n"
+ "ldr q3, [x3, #0x120]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x14, x4]\n"
+ "fmla v30.8h, v4.8h, v14.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x3, #0x130]\n"
+ "add x3, x3, #0x140\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x14, x13]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x14, x12]\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v5.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x14, x11]\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v2.8h, v6.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x14, x10]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v3.8h, v8.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "st1 { v31.8h }, [x7]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "str q30, [x7, x6]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "add x7, x7, #0x10\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "st1 { v29.8h }, [x9]\n"
+ "str q28, [x9, x6]\n"
+ "add x9, x9, #0x10\n"
+ "4:" // Tile loop: Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 61f\n"
+ "ldr q16, [x3, #0x0]\n"
+ "ldr q0, [x3, #0x10]\n"
+ "add x28, x5, XZR\n"
+ "ldr q1, [x3, #0x20]\n"
+ "add x27, x5, x4\n"
+ "ldr q2, [x3, #0x30]\n"
+ "add x26, x8, XZR\n"
+ "ldr q3, [x3, #0x40]\n"
+ "add x25, x8, x4\n"
+ "ldr q4, [x3, #0x50]\n"
+ "add x24, x5, x13\n"
+ "add x23, x8, x13\n"
+ "add x22, x5, x12\n"
+ "add x21, x5, x11\n"
+ "add x20, x8, x10\n"
+ "add x19, x17, XZR\n"
+ "add x3, x3, #0x60\n"
+ "tbz %x[n_channels], #1, 5f\n"
+ "ldr s5, [x28], #0x4\n"
+ "ldr s6, [x27], #0x4\n"
+ "ldr s7, [x26], #0x4\n"
+ "ldr s8, [x25], #0x4\n"
+ "ldr s9, [x24], #0x4\n"
+ "ldr s13, [x23], #0x4\n"
+ "ldr s11, [x22], #0x4\n"
+ "ldr s12, [x21], #0x4\n"
+ "ldr s10, [x20], #0x4\n"
+ "ldr s14, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 6f\n"
+ "ld1 { v5.h }[2], [x28]\n"
+ "ld1 { v6.h }[2], [x27]\n"
+ "ld1 { v7.h }[2], [x26]\n"
+ "ld1 { v8.h }[2], [x25]\n"
+ "ld1 { v9.h }[2], [x24]\n"
+ "ld1 { v13.h }[2], [x23]\n"
+ "ld1 { v11.h }[2], [x22]\n"
+ "ld1 { v12.h }[2], [x21]\n"
+ "ld1 { v10.h }[2], [x20]\n"
+ "ld1 { v14.h }[2], [x19]\n"
+ "b 6f\n"
+ "5:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+ "ldr h5, [x28, #0x0]\n"
+ "ldr h6, [x27, #0x0]\n"
+ "ldr h7, [x26, #0x0]\n"
+ "ldr h8, [x25, #0x0]\n"
+ "ldr h9, [x24, #0x0]\n"
+ "ldr h13, [x23, #0x0]\n"
+ "ldr h11, [x22, #0x0]\n"
+ "ldr h12, [x21, #0x0]\n"
+ "ldr h10, [x20, #0x0]\n"
+ "ldr h14, [x19, #0x0]\n"
+ "6:" // Tile loop: Oddments: Load inputs: (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "add x19, x8, x12\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 7f\n"
+ "ldr s5, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 8f\n"
+ "ld1 { v5.h }[2], [x19]\n"
+ "b 8f\n"
+ "7:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: Unset
+ "ldr h5, [x19, #0x0]\n"
+ "8:" // Tile loop: Oddments: Load inputs: (1, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "add x19, x8, x11\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v5.8h\n"
+ "tbz %x[n_channels], #1, 9f\n"
+ "ldr s6, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 10f\n"
+ "ld1 { v6.h }[2], [x19]\n"
+ "b 10f\n"
+ "9:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: Unset
+ "ldr h6, [x19, #0x0]\n"
+ "10:" // Tile loop: Oddments: Load inputs: (1, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "add x19, x5, x10\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 11f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 12f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 12f\n"
+ "11:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "12:" // Tile loop: Oddments: Load inputs: (0, 5): Bit 1: End
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "ldr h0, [x3, #0xc]\n"
+ "add x19, x17, x4\n"
+ "fmla v29.8h, v4.8h, v6.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v7.8h\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "fmla v29.8h, v0.8h, v14.8h\n"
+ "tbz %x[n_channels], #1, 13f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 14f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 14f\n"
+ "13:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "14:" // Tile loop: Oddments: Load inputs: (2, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr h1, [x3, #0xe]\n"
+ "add x19, x17, x13\n"
+ "fmla v31.8h, v1.8h, v8.8h\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 15f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 16f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 16f\n"
+ "15:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "16:" // Tile loop: Oddments: Load inputs: (2, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr h2, [x3, #0x10]\n"
+ "add x19, x17, x12\n"
+ "fmla v31.8h, v2.8h, v13.8h\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 17f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 18f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 18f\n"
+ "17:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "18:" // Tile loop: Oddments: Load inputs: (2, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr h3, [x3, #0x12]\n"
+ "add x19, x17, x11\n"
+ "fmla v31.8h, v3.8h, v5.8h\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 19f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 20f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 20f\n"
+ "19:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "20:" // Tile loop: Oddments: Load inputs: (2, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr h4, [x3, #0x14]\n"
+ "add x19, x17, x10\n"
+ "fmla v31.8h, v4.8h, v6.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 21f\n"
+ "ldr s8, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 22f\n"
+ "ld1 { v8.h }[2], [x19]\n"
+ "b 22f\n"
+ "21:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: Unset
+ "ldr h8, [x19, #0x0]\n"
+ "22:" // Tile loop: Oddments: Load inputs: (2, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr h0, [x3, #0x16]\n"
+ "add x19, x16, XZR\n"
+ "fmla v31.8h, v0.8h, v14.8h\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 23f\n"
+ "ldr s5, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 24f\n"
+ "ld1 { v5.h }[2], [x19]\n"
+ "b 24f\n"
+ "23:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: Unset
+ "ldr h5, [x19, #0x0]\n"
+ "24:" // Tile loop: Oddments: Load inputs: (3, 0): Bit 1: End
+ "fmla v29.8h, v0.8h, v5.8h\n"
+ "add x19, x16, x4\n"
+ "tbz %x[n_channels], #1, 25f\n"
+ "ldr s6, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 26f\n"
+ "ld1 { v6.h }[2], [x19]\n"
+ "b 26f\n"
+ "25:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: Unset
+ "ldr h6, [x19, #0x0]\n"
+ "26:" // Tile loop: Oddments: Load inputs: (3, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v6.8h\n"
+ "ldr h1, [x3, #0x18]\n"
+ "add x19, x16, x13\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "tbz %x[n_channels], #1, 27f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 28f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 28f\n"
+ "27:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "28:" // Tile loop: Oddments: Load inputs: (3, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v10.8h\n"
+ "ldr h2, [x3, #0x1a]\n"
+ "add x19, x16, x12\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 29f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 30f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 30f\n"
+ "29:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "30:" // Tile loop: Oddments: Load inputs: (3, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr h3, [x3, #0x1c]\n"
+ "add x19, x16, x11\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 31f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 32f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 32f\n"
+ "31:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "32:" // Tile loop: Oddments: Load inputs: (3, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr h4, [x3, #0x1e]\n"
+ "add x19, x16, x10\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 33f\n"
+ "ldr s14, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 34f\n"
+ "ld1 { v14.h }[2], [x19]\n"
+ "b 34f\n"
+ "33:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: Unset
+ "ldr h14, [x19, #0x0]\n"
+ "34:" // Tile loop: Oddments: Load inputs: (3, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v14.8h\n"
+ "ldr h0, [x3, #0x20]\n"
+ "add x19, x15, XZR\n"
+ "fmla v31.8h, v0.8h, v5.8h\n"
+ "fmla v30.8h, v0.8h, v6.8h\n"
+ "tbz %x[n_channels], #1, 35f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 36f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 36f\n"
+ "35:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "36:" // Tile loop: Oddments: Load inputs: (4, 0): Bit 1: End
+ "fmla v29.8h, v0.8h, v9.8h\n"
+ "add x19, x15, x4\n"
+ "tbz %x[n_channels], #1, 37f\n"
+ "ldr s13, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 38f\n"
+ "ld1 { v13.h }[2], [x19]\n"
+ "b 38f\n"
+ "37:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: Unset
+ "ldr h13, [x19, #0x0]\n"
+ "38:" // Tile loop: Oddments: Load inputs: (4, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "ldr h1, [x3, #0x22]\n"
+ "add x19, x15, x13\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 39f\n"
+ "ldr s5, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 40f\n"
+ "ld1 { v5.h }[2], [x19]\n"
+ "b 40f\n"
+ "39:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: Unset
+ "ldr h5, [x19, #0x0]\n"
+ "40:" // Tile loop: Oddments: Load inputs: (4, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v5.8h\n"
+ "ldr h2, [x3, #0x24]\n"
+ "add x19, x15, x12\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "tbz %x[n_channels], #1, 41f\n"
+ "ldr s6, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 42f\n"
+ "ld1 { v6.h }[2], [x19]\n"
+ "b 42f\n"
+ "41:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: Unset
+ "ldr h6, [x19, #0x0]\n"
+ "42:" // Tile loop: Oddments: Load inputs: (4, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v6.8h\n"
+ "ldr h3, [x3, #0x26]\n"
+ "add x19, x15, x11\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "tbz %x[n_channels], #1, 43f\n"
+ "ldr s8, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 44f\n"
+ "ld1 { v8.h }[2], [x19]\n"
+ "b 44f\n"
+ "43:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: Unset
+ "ldr h8, [x19, #0x0]\n"
+ "44:" // Tile loop: Oddments: Load inputs: (4, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v8.8h\n"
+ "ldr h4, [x3, #0x28]\n"
+ "add x19, x15, x10\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "fmla v30.8h, v4.8h, v14.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "tbz %x[n_channels], #1, 45f\n"
+ "ldr s10, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 46f\n"
+ "ld1 { v10.h }[2], [x19]\n"
+ "b 46f\n"
+ "45:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: Unset
+ "ldr h10, [x19, #0x0]\n"
+ "46:" // Tile loop: Oddments: Load inputs: (4, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr h0, [x3, #0x2a]\n"
+ "add x19, x14, XZR\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 47f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 48f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 48f\n"
+ "47:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "48:" // Tile loop: Oddments: Load inputs: (5, 0): Bit 1: End
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "add x19, x14, x4\n"
+ "tbz %x[n_channels], #1, 49f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 50f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 50f\n"
+ "49:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "50:" // Tile loop: Oddments: Load inputs: (5, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr h1, [x3, #0x2c]\n"
+ "add x19, x14, x13\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v5.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 51f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 52f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 52f\n"
+ "51:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "52:" // Tile loop: Oddments: Load inputs: (5, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr h2, [x3, #0x2e]\n"
+ "add x19, x14, x12\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v2.8h, v6.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 53f\n"
+ "ldr s11, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 54f\n"
+ "ld1 { v11.h }[2], [x19]\n"
+ "b 54f\n"
+ "53:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: Unset
+ "ldr h11, [x19, #0x0]\n"
+ "54:" // Tile loop: Oddments: Load inputs: (5, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr h3, [x3, #0x30]\n"
+ "add x19, x14, x11\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v3.8h, v8.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 55f\n"
+ "ldr s12, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 56f\n"
+ "ld1 { v12.h }[2], [x19]\n"
+ "b 56f\n"
+ "55:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: Unset
+ "ldr h12, [x19, #0x0]\n"
+ "56:" // Tile loop: Oddments: Load inputs: (5, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr h4, [x3, #0x32]\n"
+ "add x19, x14, x10\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 57f\n"
+ "ldr s9, [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 58f\n"
+ "ld1 { v9.h }[2], [x19]\n"
+ "b 58f\n"
+ "57:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: Unset
+ "ldr h9, [x19, #0x0]\n"
+ "58:" // Tile loop: Oddments: Load inputs: (5, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 59f\n"
+ "mov x19, x7\n"
+ "st1 { v31.s }[0], [x19], x6\n"
+ "add x7, x7, #0x4\n"
+ "st1 { v30.s }[0], [x19]\n"
+ "mov x19, x9\n"
+ "st1 { v29.s }[0], [x19], x6\n"
+ "add x9, x9, #0x4\n"
+ "st1 { v28.s }[0], [x19]\n"
+ "tbz %x[n_channels], #0, 60f\n"
+ "mov x20, x7\n"
+ "st1 { v31.h }[2], [x20], x6\n"
+ "mov x19, x9\n"
+ "st1 { v30.h }[2], [x20]\n"
+ "st1 { v29.h }[2], [x19], x6\n"
+ "st1 { v28.h }[2], [x19]\n"
+ "b 60f\n"
+ "59:" // Tile loop: Oddments: Store: Bit 1: Unset
+ "mov x20, x7\n"
+ "st1 { v31.h }[0], [x20], x6\n"
+ "mov x19, x9\n"
+ "st1 { v30.h }[0], [x20]\n"
+ "st1 { v29.h }[0], [x19], x6\n"
+ "st1 { v28.h }[0], [x19]\n"
+ "60:" // Tile loop: Oddments: Store: Bit 1: End
+
+ "61:" // Tile loop: End
+ "ldr x28, [%x[params_struct], %[offsetof_args_tile_i]]\n"
+ "add x21, x28, #0x1\n"
+ "ldr x27, [%x[params_struct], %[offsetof_args_tile_j]]\n"
+ "ldr x20, [%x[params_struct], %[offsetof_args_n_tile_rows]]\n"
+ "add x27, x27, #0x1\n"
+ "ldr x19, [%x[params_struct], %[offsetof_args_n_tile_cols]]\n"
+ "cmp x27, x19\n"
+ "csel x27, x27, XZR, LT\n"
+ "csel x28, x28, x21, LT\n"
+ "cmp x28, x20\n"
+ "blt 1b\n"
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_args_inptr] "I" (offsetof(Args, inptr)), [offsetof_args_ld_input_col] "I" (offsetof(Args, ld_input_col)), [offsetof_args_ld_input_row] "I" (offsetof(Args, ld_input_row)), [offsetof_args_ld_output_col] "I" (offsetof(Args, ld_output_col)), [offsetof_args_ld_output_row] "I" (offsetof(Args, ld_output_row)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_n_tile_cols] "I" (offsetof(Args, n_tile_cols)), [offsetof_args_n_tile_rows] "I" (offsetof(Args, n_tile_rows)), [offsetof_args_outptr] "I" (offsetof(Args, outptr)), [offsetof_args_params] "I" (offsetof(Args, params)), [offsetof_args_tile_i] "I" (offsetof(Args, tile_i)), [offsetof_args_tile_j] "I" (offsetof(Args, tile_j)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
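
A note on the tail handling above: the vector loop works on eight fp16 channels per iteration (lsr x27, n_channels, #0x3), and the "Oddments" blocks drain leftovers by testing individual bits of n_channels with tbz — a set bit 1 selects a two-lane load (ldr s..., two fp16 values) and a set bit 0 a single-lane insert (ld1 { v.h }[2]); the stores at labels 59/60 mirror the same bit tests. Below is a minimal scalar sketch of that pattern, assuming the per-channel results already sit in an accumulator array — an illustration only, not this kernel's register allocation or exact remainder contract:

    // Scalar sketch of "vector loop + bit-tested oddments" channel handling.
    // acc[] is assumed to already hold the per-channel convolution results.
    static inline __fp16 clamp_fp16(__fp16 v, __fp16 lo, __fp16 hi)
    {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    void store_with_oddments(const __fp16 *acc, __fp16 *out,
                             unsigned int n_channels,
                             __fp16 act_min, __fp16 act_max)
    {
        unsigned int c = 0;
        for (; c + 8 <= n_channels; c += 8)      // main loop: one q register
            for (unsigned int l = 0; l < 8; l++)
                out[c + l] = clamp_fp16(acc[c + l], act_min, act_max);
        if (n_channels & 2)                      // tbz ..., #1: two lanes
        {
            out[c] = clamp_fp16(acc[c], act_min, act_max); ++c;
            out[c] = clamp_fp16(acc[c], act_min, act_max); ++c;
        }
        if (n_channels & 1)                      // tbz ..., #0: one lane
            out[c] = clamp_fp16(acc[c], act_min, act_max);
    }
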
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
new file mode 100644
index 000000000..96e1ae496
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst/generic_indirect.cpp
@@ -0,0 +1,1022 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ struct Args
+ {
+ __fp16 *const *outptrs;
+ const void *params;
+ const __fp16 min, max;
+ const __fp16 *inptrs[36];
+
+ Args(
+ const __fp16 *const *const input_ptrs,
+ __fp16 *const *const outptrs,
+ const void *const params,
+ const __fp16 min,
+ const __fp16 max
+ ) : outptrs(outptrs), params(params), min(min), max(max)
+ {
+ inptrs[0] = input_ptrs[0];
+ inptrs[1] = input_ptrs[1];
+ inptrs[2] = input_ptrs[6];
+ inptrs[3] = input_ptrs[7];
+ inptrs[4] = input_ptrs[2];
+ inptrs[5] = input_ptrs[8];
+ inptrs[6] = input_ptrs[3];
+ inptrs[7] = input_ptrs[4];
+ inptrs[8] = input_ptrs[11];
+ inptrs[9] = input_ptrs[12];
+ inptrs[10] = input_ptrs[9];
+ inptrs[11] = input_ptrs[10];
+ inptrs[12] = input_ptrs[5];
+ inptrs[13] = input_ptrs[13];
+ inptrs[14] = input_ptrs[14];
+ inptrs[15] = input_ptrs[15];
+ inptrs[16] = input_ptrs[16];
+ inptrs[17] = input_ptrs[17];
+ inptrs[18] = input_ptrs[18];
+ inptrs[19] = input_ptrs[19];
+ inptrs[20] = input_ptrs[20];
+ inptrs[21] = input_ptrs[21];
+ inptrs[22] = input_ptrs[22];
+ inptrs[23] = input_ptrs[23];
+ inptrs[24] = input_ptrs[24];
+ inptrs[25] = input_ptrs[25];
+ inptrs[26] = input_ptrs[26];
+ inptrs[27] = input_ptrs[27];
+ inptrs[28] = input_ptrs[28];
+ inptrs[29] = input_ptrs[29];
+ inptrs[30] = input_ptrs[30];
+ inptrs[31] = input_ptrs[31];
+ inptrs[32] = input_ptrs[32];
+ inptrs[33] = input_ptrs[33];
+ inptrs[34] = input_ptrs[34];
+ inptrs[35] = input_ptrs[35];
+
+ }
+ };
+
+ Args params_struct(input_ptrs, outptrs, params,
+ activation_min, activation_max);
+
+ __asm__ __volatile__(
+ "ldr x21, [%x[params_struct], %[offsetof_args_outptrs]]\n"
+ "add x16, %x[params_struct], %[offsetof_Args_inptrs]\n"
+ "ldr x15, [%x[params_struct], %[offsetof_args_params]]\n"
+ "add x20, %x[params_struct], %[offsetof_args_min]\n"
+ "add x19, %x[params_struct], %[offsetof_args_max]\n"
+ "ld1r { v18.8h }, [x20]\n"
+ "ld1r { v17.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "ldp x13, x12, [x21, #0x0]\n"
+ "mov x11, #0x10\n" // cntb _, ALL, #1
+ "ldp x10, x9, [x21, #0x10]\n"
+ "sub x28, XZR, x11\n"
+ "lsr x27, %x[n_channels], #0x3\n"
+ "cbz x27, 3f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "cmp x11, x27, LSL #4\n"
+ "ldr q1, [x15, #0x20]\n"
+ "ldr q2, [x15, #0x30]\n"
+ "ldr q3, [x15, #0x40]\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x15, x15, #0x60\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "ldr q5, [x26, x14]\n"
+ "ldr q6, [x25, x14]\n"
+ "ldr q7, [x24, x14]\n"
+ "ldr q8, [x23, x14]\n"
+ "ldr q9, [x22, x14]\n"
+ "ldr q13, [x21, x14]\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "ldp x26, x25, [x16, #0x40]\n"
+ "ldr q11, [x20, x14]\n"
+ "ldr q12, [x19, x14]\n"
+ "ldr q10, [x26, x14]\n"
+ "ldr q14, [x25, x14]\n"
+ "bge 2f\n"
+ "1:" // Channel loop
+ "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "ldr x22, [x16, #0x60]\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr q0, [x15, #0x0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x23, x14]\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q1, [x15, #0x10]\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x22, x14]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "ldr q2, [x15, #0x20]\n"
+ "ldr x26, [x16, #0x80]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x21, x14]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v5.8h\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "ldr q3, [x15, #0x30]\n"
+ "ldr x24, [x16, #0x90]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x20, x14]\n"
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v6.8h\n"
+ "ldr q9, [x19, x14]\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x15, #0x40]\n"
+ "ldr x23, [x16, #0x98]\n"
+ "fmla v31.8h, v0.8h, v7.8h\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "fmla v29.8h, v0.8h, v14.8h\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q0, [x15, #0x50]\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v31.8h, v1.8h, v8.8h\n"
+ "ldr q8, [x25, x14]\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x15, #0x60]\n"
+ "ldr x25, [x16, #0xc8]\n"
+ "fmla v31.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x26, x14]\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr q2, [x15, #0x70]\n"
+ "ldr q16, [x15, #0x140]\n"
+ "fmla v31.8h, v3.8h, v5.8h\n"
+ "ldr q5, [x24, x14]\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "ldr x24, [x16, #0xd0]\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q3, [x15, #0x80]\n"
+ "fmla v31.8h, v4.8h, v6.8h\n"
+ "ldr q6, [x23, x14]\n"
+ "ldr x23, [x16, #0xd8]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "ldr q10, [x22, x14]\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr q4, [x15, #0x90]\n"
+ "ldr x22, [x16, #0xe0]\n"
+ "fmla v31.8h, v0.8h, v14.8h\n"
+ "ldr q14, [x19, x14]\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v5.8h\n"
+ "ldr x19, [x16, #0xf8]\n"
+ "fmla v28.8h, v0.8h, v6.8h\n"
+ "ldr q0, [x15, #0xa0]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v10.8h\n"
+ "ldr q1, [x15, #0xb0]\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x16, #0xf0]\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x15, #0xc0]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr x26, [x16, #0x100]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x15, #0xd0]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x25, x14]\n"
+ "ldr x25, [x16, #0x108]\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "ldr q8, [x22, x14]\n"
+ "fmla v28.8h, v4.8h, v14.8h\n"
+ "ldr q4, [x15, #0xe0]\n"
+ "fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr x24, [x16, #0x110]\n"
+ "fmla v30.8h, v0.8h, v6.8h\n"
+ "fmla v29.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "ldr q0, [x15, #0xf0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x23, x14]\n"
+ "ldr x23, [x16, #0x118]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v5.8h\n"
+ "ldr q1, [x15, #0x100]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x21, x14]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v6.8h\n"
+ "ldr q2, [x15, #0x110]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v8.8h\n"
+ "ldr q3, [x15, #0x120]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x19, x14]\n"
+ "fmla v30.8h, v4.8h, v14.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x15, #0x130]\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x26, x14]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldp x26, x25, [x16, #0x0]\n"
+ "ldr q0, [x15, #0x150]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v5.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr q1, [x15, #0x160]\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "ldr q5, [x26, x11]\n"
+ "fmla v30.8h, v2.8h, v6.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x23, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldp x24, x23, [x16, #0x10]\n"
+ "ldp x22, x21, [x16, #0x20]\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "ldr q6, [x25, x11]\n"
+ "fmla v30.8h, v3.8h, v8.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "ldr q7, [x24, x11]\n"
+ "ldr q13, [x21, x11]\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldp x20, x19, [x16, #0x30]\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "ldr q8, [x23, x11]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "ldr q11, [x20, x11]\n"
+ "ldr q12, [x19, x11]\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "ldr q9, [x22, x11]\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "ldp x26, x25, [x16, #0x40]\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "ldr q2, [x15, #0x170]\n"
+ "ldr q3, [x15, #0x180]\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "ldr q10, [x26, x11]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "ldr q14, [x25, x11]\n"
+ "add x11, x11, #0x10\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "str q31, [x13, x28]\n"
+ "cmp x11, x27, LSL #4\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q30, [x12, x28]\n"
+ "ldr q4, [x15, #0x190]\n"
+ "add x15, x15, #0x1a0\n"
+ "str q29, [x10, x28]\n"
+ "str q28, [x9, x28]\n"
+ "blt 1b\n"
+ "2:" // Channel tail
+ "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "add x28, x28, #0x10\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "ldr x22, [x16, #0x60]\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr q0, [x15, #0x0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x23, x14]\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "ldr q1, [x15, #0x10]\n"
+ "ldr x20, [x16, #0x70]\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x22, x14]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "ldr x19, [x16, #0x78]\n"
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "ldr q2, [x15, #0x20]\n"
+ "ldr x26, [x16, #0x80]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x21, x14]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v5.8h\n"
+ "ldr x25, [x16, #0x88]\n"
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "ldr q3, [x15, #0x30]\n"
+ "ldr x24, [x16, #0x90]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x20, x14]\n"
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "fmla v29.8h, v4.8h, v6.8h\n"
+ "ldr q9, [x19, x14]\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x15, #0x40]\n"
+ "ldr x23, [x16, #0x98]\n"
+ "fmla v31.8h, v0.8h, v7.8h\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "fmla v29.8h, v0.8h, v14.8h\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr q0, [x15, #0x50]\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "fmla v31.8h, v1.8h, v8.8h\n"
+ "ldr q8, [x25, x14]\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr q1, [x15, #0x60]\n"
+ "ldr x25, [x16, #0xc8]\n"
+ "fmla v31.8h, v2.8h, v13.8h\n"
+ "ldr q13, [x26, x14]\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr q2, [x15, #0x70]\n"
+ "fmla v31.8h, v3.8h, v5.8h\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr x24, [x16, #0xd0]\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr q3, [x15, #0x80]\n"
+ "fmla v31.8h, v4.8h, v6.8h\n"
+ "ldr q6, [x23, x14]\n"
+ "ldr x23, [x16, #0xd8]\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "ldr q10, [x22, x14]\n"
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr q4, [x15, #0x90]\n"
+ "ldr x22, [x16, #0xe0]\n"
+ "fmla v31.8h, v0.8h, v14.8h\n"
+ "ldr q14, [x19, x14]\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "fmla v29.8h, v0.8h, v5.8h\n"
+ "ldr x19, [x16, #0xf8]\n"
+ "fmla v28.8h, v0.8h, v6.8h\n"
+ "ldr q0, [x15, #0xa0]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr q11, [x21, x14]\n"
+ "ldr x21, [x16, #0xe8]\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "fmla v28.8h, v1.8h, v10.8h\n"
+ "ldr q1, [x15, #0xb0]\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "ldr q12, [x20, x14]\n"
+ "ldr x20, [x16, #0xf0]\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr q2, [x15, #0xc0]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ldr q9, [x26, x14]\n"
+ "ldr x26, [x16, #0x100]\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr q3, [x15, #0xd0]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "ldr q13, [x25, x14]\n"
+ "ldr x25, [x16, #0x108]\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "ldr q8, [x22, x14]\n"
+ "fmla v28.8h, v4.8h, v14.8h\n"
+ "ldr q4, [x15, #0xe0]\n"
+ "fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr q5, [x24, x14]\n"
+ "ldr x24, [x16, #0x110]\n"
+ "fmla v30.8h, v0.8h, v6.8h\n"
+ "fmla v29.8h, v0.8h, v9.8h\n"
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "ldr q0, [x15, #0xf0]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr q6, [x23, x14]\n"
+ "ldr x23, [x16, #0x118]\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "fmla v28.8h, v1.8h, v5.8h\n"
+ "ldr q1, [x15, #0x100]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr q10, [x21, x14]\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "fmla v28.8h, v2.8h, v6.8h\n"
+ "ldr q2, [x15, #0x110]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr q11, [x20, x14]\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "fmla v28.8h, v3.8h, v8.8h\n"
+ "ldr q3, [x15, #0x120]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr q12, [x19, x14]\n"
+ "fmla v30.8h, v4.8h, v14.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr q4, [x15, #0x130]\n"
+ "add x15, x15, #0x140\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "ldr q9, [x26, x14]\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr q11, [x25, x14]\n"
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "fmla v30.8h, v1.8h, v5.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "ldr q12, [x24, x14]\n"
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "fmla v30.8h, v2.8h, v6.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "ldr q9, [x23, x14]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "fmla v30.8h, v3.8h, v8.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "str q31, [x13, x28]\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "str q30, [x12, x28]\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "str q29, [x10, x28]\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "str q28, [x9, x28]\n"
+ "3:" // Oddments
+ "tst %x[n_channels], #0x1\n"
+ "beq 60f\n"
+ "ldr q16, [x15, #0x0]\n"
+ "ldr q0, [x15, #0x10]\n"
+ "mov x28, x14\n"
+ "ldr q1, [x15, #0x20]\n"
+ "add x13, x13, x28\n"
+ "ldr q2, [x15, #0x30]\n"
+ "add x12, x12, x28\n"
+ "ldr q3, [x15, #0x40]\n"
+ "add x10, x10, x28\n"
+ "ldr q4, [x15, #0x50]\n"
+ "add x9, x9, x28\n"
+ "ldr x24, [x16, #0x10]\n"
+ "ldr x23, [x16, #0x18]\n"
+ "ldr x22, [x16, #0x20]\n"
+ "add x24, x24, x14\n"
+ "ldr x21, [x16, #0x28]\n"
+ "add x23, x23, x14\n"
+ "ldr x20, [x16, #0x30]\n"
+ "add x22, x22, x14\n"
+ "ldr x19, [x16, #0x38]\n"
+ "add x21, x21, x14\n"
+ "ldr x26, [x16, #0x40]\n"
+ "add x20, x20, x14\n"
+ "ldr x25, [x16, #0x48]\n"
+ "add x19, x19, x14\n"
+ "add x26, x26, x14\n"
+ "add x25, x25, x14\n"
+ "add x15, x15, #0x60\n"
+ "tbz %x[n_channels], #1, 4f\n"
+ "ld1 { v5.s }[0], [x26], #0x4\n"
+ "ld1 { v6.s }[0], [x25], #0x4\n"
+ "ld1 { v7.s }[0], [x24], #0x4\n"
+ "ld1 { v8.s }[0], [x23], #0x4\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "ld1 { v13.s }[0], [x21], #0x4\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "ld1 { v10.s }[0], [x26], #0x4\n"
+ "ld1 { v14.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 5f\n"
+ "ld1 { v7.h }[2], [x24], #0x2\n"
+ "ld1 { v8.h }[2], [x23], #0x2\n"
+ "ld1 { v5.h }[2], [x26], #0x2\n"
+ "ld1 { v6.h }[2], [x25], #0x2\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "ld1 { v13.h }[2], [x21], #0x2\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "ld1 { v12.h }[2], [x19], #0x2\n"
+ "ld1 { v10.h }[2], [x26], #0x2\n"
+ "ld1 { v14.h }[2], [x25], #0x2\n"
+ "b 5f\n"
+ "4:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: Unset
+ "ld1 { v5.h }[0], [x26], #0x2\n"
+ "ld1 { v6.h }[0], [x25], #0x2\n"
+ "ld1 { v7.h }[0], [x24], #0x2\n"
+ "ld1 { v8.h }[0], [x23], #0x2\n"
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "ld1 { v13.h }[0], [x21], #0x2\n"
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "ld1 { v12.h }[0], [x19], #0x2\n"
+ "ld1 { v10.h }[0], [x26], #0x2\n"
+ "ld1 { v14.h }[0], [x25], #0x2\n"
+ "5:" // Oddments: Load inputs (0, 0), (0, 1), (1, 0), (1, 1), (0, 2), (1, 2), (0, 3), (0, 4), (1, 5), (2, 0): Bit 1: End
+ "mov v31.16b, v16.16b\n fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr x24, [x16, #0x50]\n"
+ "mov v30.16b, v16.16b\n fmla v30.8h, v0.8h, v6.8h\n"
+ "add x24, x24, x14\n"
+ "mov v29.16b, v16.16b\n fmla v29.8h, v0.8h, v7.8h\n"
+ "mov v28.16b, v16.16b\n fmla v28.8h, v0.8h, v8.8h\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "fmla v30.8h, v1.8h, v9.8h\n"
+ "fmla v29.8h, v1.8h, v8.8h\n"
+ "fmla v28.8h, v1.8h, v13.8h\n"
+ "fmla v31.8h, v2.8h, v9.8h\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 6f\n"
+ "ld1 { v5.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 7f\n"
+ "ld1 { v5.h }[2], [x24], #0x2\n"
+ "b 7f\n"
+ "6:" // Oddments: Load input (1, 3): Bit 1: Unset
+ "ld1 { v5.h }[0], [x24], #0x2\n"
+ "7:" // Oddments: Load input (1, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v5.8h\n"
+ "ldr x23, [x16, #0x58]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "add x23, x23, x14\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v5.8h\n"
+ "tbz %x[n_channels], #1, 8f\n"
+ "ld1 { v6.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 9f\n"
+ "ld1 { v6.h }[2], [x23], #0x2\n"
+ "b 9f\n"
+ "8:" // Oddments: Load input (1, 4): Bit 1: Unset
+ "ld1 { v6.h }[0], [x23], #0x2\n"
+ "9:" // Oddments: Load input (1, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v6.8h\n"
+ "ldr x22, [x16, #0x60]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "add x22, x22, x14\n"
+ "tbz %x[n_channels], #1, 10f\n"
+ "ld1 { v9.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 11f\n"
+ "ld1 { v9.h }[2], [x22], #0x2\n"
+ "b 11f\n"
+ "10:" // Oddments: Load input (0, 5): Bit 1: Unset
+ "ld1 { v9.h }[0], [x22], #0x2\n"
+ "11:" // Oddments: Load input (0, 5): Bit 1: End
+ "fmla v30.8h, v4.8h, v9.8h\n"
+ "ldr h0, [x15, #0xc]\n"
+ "fmla v29.8h, v4.8h, v6.8h\n"
+ "ldr x21, [x16, #0x68]\n"
+ "add x21, x21, x14\n"
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "fmla v31.8h, v0.8h, v7.8h\n"
+ "fmla v30.8h, v0.8h, v8.8h\n"
+ "fmla v29.8h, v0.8h, v14.8h\n"
+ "tbz %x[n_channels], #1, 12f\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 13f\n"
+ "ld1 { v11.h }[2], [x21], #0x2\n"
+ "b 13f\n"
+ "12:" // Oddments: Load input (2, 1): Bit 1: Unset
+ "ld1 { v11.h }[0], [x21], #0x2\n"
+ "13:" // Oddments: Load input (2, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v11.8h\n"
+ "ldr h1, [x15, #0xe]\n"
+ "fmla v31.8h, v1.8h, v8.8h\n"
+ "ldr x20, [x16, #0x70]\n"
+ "add x20, x20, x14\n"
+ "fmla v30.8h, v1.8h, v13.8h\n"
+ "fmla v29.8h, v1.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 14f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 15f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 15f\n"
+ "14:" // Oddments: Load input (2, 2): Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "15:" // Oddments: Load input (2, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v12.8h\n"
+ "ldr h2, [x15, #0x10]\n"
+ "fmla v31.8h, v2.8h, v13.8h\n"
+ "ldr x19, [x16, #0x78]\n"
+ "add x19, x19, x14\n"
+ "fmla v30.8h, v2.8h, v5.8h\n"
+ "fmla v29.8h, v2.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 16f\n"
+ "ld1 { v9.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 17f\n"
+ "ld1 { v9.h }[2], [x19], #0x2\n"
+ "b 17f\n"
+ "16:" // Oddments: Load input (2, 3): Bit 1: Unset
+ "ld1 { v9.h }[0], [x19], #0x2\n"
+ "17:" // Oddments: Load input (2, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v9.8h\n"
+ "ldr h3, [x15, #0x12]\n"
+ "fmla v31.8h, v3.8h, v5.8h\n"
+ "ldr x26, [x16, #0x80]\n"
+ "add x26, x26, x14\n"
+ "fmla v30.8h, v3.8h, v6.8h\n"
+ "fmla v29.8h, v3.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 18f\n"
+ "ld1 { v13.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 19f\n"
+ "ld1 { v13.h }[2], [x26], #0x2\n"
+ "b 19f\n"
+ "18:" // Oddments: Load input (2, 4): Bit 1: Unset
+ "ld1 { v13.h }[0], [x26], #0x2\n"
+ "19:" // Oddments: Load input (2, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v13.8h\n"
+ "ldr h4, [x15, #0x14]\n"
+ "fmla v31.8h, v4.8h, v6.8h\n"
+ "ldr x25, [x16, #0x88]\n"
+ "add x25, x25, x14\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 20f\n"
+ "ld1 { v8.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 21f\n"
+ "ld1 { v8.h }[2], [x25], #0x2\n"
+ "b 21f\n"
+ "20:" // Oddments: Load input (2, 5): Bit 1: Unset
+ "ld1 { v8.h }[0], [x25], #0x2\n"
+ "21:" // Oddments: Load input (2, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v8.8h\n"
+ "ldr h0, [x15, #0x16]\n"
+ "fmla v31.8h, v0.8h, v14.8h\n"
+ "ldr x24, [x16, #0x90]\n"
+ "add x24, x24, x14\n"
+ "fmla v30.8h, v0.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 22f\n"
+ "ld1 { v5.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 23f\n"
+ "ld1 { v5.h }[2], [x24], #0x2\n"
+ "b 23f\n"
+ "22:" // Oddments: Load input (3, 0): Bit 1: Unset
+ "ld1 { v5.h }[0], [x24], #0x2\n"
+ "23:" // Oddments: Load input (3, 0): Bit 1: End
+ "fmla v29.8h, v0.8h, v5.8h\n"
+ "ldr x23, [x16, #0x98]\n"
+ "add x23, x23, x14\n"
+ "tbz %x[n_channels], #1, 24f\n"
+ "ld1 { v6.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 25f\n"
+ "ld1 { v6.h }[2], [x23], #0x2\n"
+ "b 25f\n"
+ "24:" // Oddments: Load input (3, 1): Bit 1: Unset
+ "ld1 { v6.h }[0], [x23], #0x2\n"
+ "25:" // Oddments: Load input (3, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v6.8h\n"
+ "ldr h1, [x15, #0x18]\n"
+ "fmla v31.8h, v1.8h, v11.8h\n"
+ "ldr x22, [x16, #0xa0]\n"
+ "add x22, x22, x14\n"
+ "fmla v30.8h, v1.8h, v12.8h\n"
+ "fmla v29.8h, v1.8h, v6.8h\n"
+ "tbz %x[n_channels], #1, 26f\n"
+ "ld1 { v10.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 27f\n"
+ "ld1 { v10.h }[2], [x22], #0x2\n"
+ "b 27f\n"
+ "26:" // Oddments: Load input (3, 2): Bit 1: Unset
+ "ld1 { v10.h }[0], [x22], #0x2\n"
+ "27:" // Oddments: Load input (3, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v10.8h\n"
+ "ldr h2, [x15, #0x1a]\n"
+ "fmla v31.8h, v2.8h, v12.8h\n"
+ "ldr x21, [x16, #0xa8]\n"
+ "add x21, x21, x14\n"
+ "fmla v30.8h, v2.8h, v9.8h\n"
+ "fmla v29.8h, v2.8h, v10.8h\n"
+ "tbz %x[n_channels], #1, 28f\n"
+ "ld1 { v11.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 29f\n"
+ "ld1 { v11.h }[2], [x21], #0x2\n"
+ "b 29f\n"
+ "28:" // Oddments: Load input (3, 3): Bit 1: Unset
+ "ld1 { v11.h }[0], [x21], #0x2\n"
+ "29:" // Oddments: Load input (3, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr h3, [x15, #0x1c]\n"
+ "fmla v31.8h, v3.8h, v9.8h\n"
+ "ldr x20, [x16, #0xb0]\n"
+ "add x20, x20, x14\n"
+ "fmla v30.8h, v3.8h, v13.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 30f\n"
+ "ld1 { v12.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 31f\n"
+ "ld1 { v12.h }[2], [x20], #0x2\n"
+ "b 31f\n"
+ "30:" // Oddments: Load input (3, 4): Bit 1: Unset
+ "ld1 { v12.h }[0], [x20], #0x2\n"
+ "31:" // Oddments: Load input (3, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr h4, [x15, #0x1e]\n"
+ "fmla v31.8h, v4.8h, v13.8h\n"
+ "ldr x19, [x16, #0xb8]\n"
+ "add x19, x19, x14\n"
+ "fmla v30.8h, v4.8h, v8.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 32f\n"
+ "ld1 { v14.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 33f\n"
+ "ld1 { v14.h }[2], [x19], #0x2\n"
+ "b 33f\n"
+ "32:" // Oddments: Load input (3, 5): Bit 1: Unset
+ "ld1 { v14.h }[0], [x19], #0x2\n"
+ "33:" // Oddments: Load input (3, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v14.8h\n"
+ "ldr h0, [x15, #0x20]\n"
+ "fmla v31.8h, v0.8h, v5.8h\n"
+ "ldr x26, [x16, #0xc0]\n"
+ "add x26, x26, x14\n"
+ "fmla v30.8h, v0.8h, v6.8h\n"
+ "tbz %x[n_channels], #1, 34f\n"
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 35f\n"
+ "ld1 { v9.h }[2], [x26], #0x2\n"
+ "b 35f\n"
+ "34:" // Oddments: Load input (4, 0): Bit 1: Unset
+ "ld1 { v9.h }[0], [x26], #0x2\n"
+ "35:" // Oddments: Load input (4, 0): Bit 1: End
+ "fmla v29.8h, v0.8h, v9.8h\n"
+ "ldr x25, [x16, #0xc8]\n"
+ "add x25, x25, x14\n"
+ "tbz %x[n_channels], #1, 36f\n"
+ "ld1 { v13.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 37f\n"
+ "ld1 { v13.h }[2], [x25], #0x2\n"
+ "b 37f\n"
+ "36:" // Oddments: Load input (4, 1): Bit 1: Unset
+ "ld1 { v13.h }[0], [x25], #0x2\n"
+ "37:" // Oddments: Load input (4, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v13.8h\n"
+ "ldr h1, [x15, #0x22]\n"
+ "fmla v31.8h, v1.8h, v6.8h\n"
+ "ldr x24, [x16, #0xd0]\n"
+ "add x24, x24, x14\n"
+ "fmla v30.8h, v1.8h, v10.8h\n"
+ "fmla v29.8h, v1.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 38f\n"
+ "ld1 { v5.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 39f\n"
+ "ld1 { v5.h }[2], [x24], #0x2\n"
+ "b 39f\n"
+ "38:" // Oddments: Load input (4, 2): Bit 1: Unset
+ "ld1 { v5.h }[0], [x24], #0x2\n"
+ "39:" // Oddments: Load input (4, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v5.8h\n"
+ "ldr h2, [x15, #0x24]\n"
+ "fmla v31.8h, v2.8h, v10.8h\n"
+ "ldr x23, [x16, #0xd8]\n"
+ "add x23, x23, x14\n"
+ "fmla v30.8h, v2.8h, v11.8h\n"
+ "fmla v29.8h, v2.8h, v5.8h\n"
+ "tbz %x[n_channels], #1, 40f\n"
+ "ld1 { v6.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 41f\n"
+ "ld1 { v6.h }[2], [x23], #0x2\n"
+ "b 41f\n"
+ "40:" // Oddments: Load input (4, 3): Bit 1: Unset
+ "ld1 { v6.h }[0], [x23], #0x2\n"
+ "41:" // Oddments: Load input (4, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v6.8h\n"
+ "ldr h3, [x15, #0x26]\n"
+ "fmla v31.8h, v3.8h, v11.8h\n"
+ "ldr x22, [x16, #0xe0]\n"
+ "add x22, x22, x14\n"
+ "fmla v30.8h, v3.8h, v12.8h\n"
+ "fmla v29.8h, v3.8h, v6.8h\n"
+ "tbz %x[n_channels], #1, 42f\n"
+ "ld1 { v8.s }[0], [x22], #0x4\n"
+ "tbz %x[n_channels], #0, 43f\n"
+ "ld1 { v8.h }[2], [x22], #0x2\n"
+ "b 43f\n"
+ "42:" // Oddments: Load input (4, 4): Bit 1: Unset
+ "ld1 { v8.h }[0], [x22], #0x2\n"
+ "43:" // Oddments: Load input (4, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v8.8h\n"
+ "ldr h4, [x15, #0x28]\n"
+ "fmla v31.8h, v4.8h, v12.8h\n"
+ "ldr x21, [x16, #0xe8]\n"
+ "add x21, x21, x14\n"
+ "fmla v30.8h, v4.8h, v14.8h\n"
+ "fmla v29.8h, v4.8h, v8.8h\n"
+ "tbz %x[n_channels], #1, 44f\n"
+ "ld1 { v10.s }[0], [x21], #0x4\n"
+ "tbz %x[n_channels], #0, 45f\n"
+ "ld1 { v10.h }[2], [x21], #0x2\n"
+ "b 45f\n"
+ "44:" // Oddments: Load input (4, 5): Bit 1: Unset
+ "ld1 { v10.h }[0], [x21], #0x2\n"
+ "45:" // Oddments: Load input (4, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v10.8h\n"
+ "ldr h0, [x15, #0x2a]\n"
+ "fmla v31.8h, v0.8h, v9.8h\n"
+ "ldr x20, [x16, #0xf0]\n"
+ "add x20, x20, x14\n"
+ "fmla v30.8h, v0.8h, v13.8h\n"
+ "tbz %x[n_channels], #1, 46f\n"
+ "ld1 { v11.s }[0], [x20], #0x4\n"
+ "tbz %x[n_channels], #0, 47f\n"
+ "ld1 { v11.h }[2], [x20], #0x2\n"
+ "b 47f\n"
+ "46:" // Oddments: Load input (5, 0): Bit 1: Unset
+ "ld1 { v11.h }[0], [x20], #0x2\n"
+ "47:" // Oddments: Load input (5, 0): Bit 1: End
+ "fmla v29.8h, v0.8h, v11.8h\n"
+ "ldr x19, [x16, #0xf8]\n"
+ "add x19, x19, x14\n"
+ "tbz %x[n_channels], #1, 48f\n"
+ "ld1 { v12.s }[0], [x19], #0x4\n"
+ "tbz %x[n_channels], #0, 49f\n"
+ "ld1 { v12.h }[2], [x19], #0x2\n"
+ "b 49f\n"
+ "48:" // Oddments: Load input (5, 1): Bit 1: Unset
+ "ld1 { v12.h }[0], [x19], #0x2\n"
+ "49:" // Oddments: Load input (5, 1): Bit 1: End
+ "fmla v28.8h, v0.8h, v12.8h\n"
+ "ldr h1, [x15, #0x2c]\n"
+ "fmla v31.8h, v1.8h, v13.8h\n"
+ "ldr x26, [x16, #0x100]\n"
+ "add x26, x26, x14\n"
+ "fmla v30.8h, v1.8h, v5.8h\n"
+ "fmla v29.8h, v1.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 50f\n"
+ "ld1 { v9.s }[0], [x26], #0x4\n"
+ "tbz %x[n_channels], #0, 51f\n"
+ "ld1 { v9.h }[2], [x26], #0x2\n"
+ "b 51f\n"
+ "50:" // Oddments: Load input (5, 2): Bit 1: Unset
+ "ld1 { v9.h }[0], [x26], #0x2\n"
+ "51:" // Oddments: Load input (5, 2): Bit 1: End
+ "fmla v28.8h, v1.8h, v9.8h\n"
+ "ldr h2, [x15, #0x2e]\n"
+ "fmla v31.8h, v2.8h, v5.8h\n"
+ "ldr x25, [x16, #0x108]\n"
+ "add x25, x25, x14\n"
+ "fmla v30.8h, v2.8h, v6.8h\n"
+ "fmla v29.8h, v2.8h, v9.8h\n"
+ "tbz %x[n_channels], #1, 52f\n"
+ "ld1 { v11.s }[0], [x25], #0x4\n"
+ "tbz %x[n_channels], #0, 53f\n"
+ "ld1 { v11.h }[2], [x25], #0x2\n"
+ "b 53f\n"
+ "52:" // Oddments: Load input (5, 3): Bit 1: Unset
+ "ld1 { v11.h }[0], [x25], #0x2\n"
+ "53:" // Oddments: Load input (5, 3): Bit 1: End
+ "fmla v28.8h, v2.8h, v11.8h\n"
+ "ldr h3, [x15, #0x30]\n"
+ "fmla v31.8h, v3.8h, v6.8h\n"
+ "ldr x24, [x16, #0x110]\n"
+ "add x24, x24, x14\n"
+ "fmla v30.8h, v3.8h, v8.8h\n"
+ "fmla v29.8h, v3.8h, v11.8h\n"
+ "tbz %x[n_channels], #1, 54f\n"
+ "ld1 { v12.s }[0], [x24], #0x4\n"
+ "tbz %x[n_channels], #0, 55f\n"
+ "ld1 { v12.h }[2], [x24], #0x2\n"
+ "b 55f\n"
+ "54:" // Oddments: Load input (5, 4): Bit 1: Unset
+ "ld1 { v12.h }[0], [x24], #0x2\n"
+ "55:" // Oddments: Load input (5, 4): Bit 1: End
+ "fmla v28.8h, v3.8h, v12.8h\n"
+ "ldr h4, [x15, #0x32]\n"
+ "fmla v31.8h, v4.8h, v8.8h\n"
+ "ldr x23, [x16, #0x118]\n"
+ "add x23, x23, x14\n"
+ "fmla v30.8h, v4.8h, v10.8h\n"
+ "fmla v29.8h, v4.8h, v12.8h\n"
+ "tbz %x[n_channels], #1, 56f\n"
+ "ld1 { v9.s }[0], [x23], #0x4\n"
+ "tbz %x[n_channels], #0, 57f\n"
+ "ld1 { v9.h }[2], [x23], #0x2\n"
+ "b 57f\n"
+ "56:" // Oddments: Load input (5, 5): Bit 1: Unset
+ "ld1 { v9.h }[0], [x23], #0x2\n"
+ "57:" // Oddments: Load input (5, 5): Bit 1: End
+ "fmla v28.8h, v4.8h, v9.8h\n"
+ "fmax v31.8h, v31.8h, v18.8h\n"
+ "fmax v30.8h, v30.8h, v18.8h\n"
+ "fmax v29.8h, v29.8h, v18.8h\n"
+ "fmin v31.8h, v31.8h, v17.8h\n"
+ "fmin v30.8h, v30.8h, v17.8h\n"
+ "fmin v29.8h, v29.8h, v17.8h\n"
+ "fmax v28.8h, v28.8h, v18.8h\n"
+ "fmin v28.8h, v28.8h, v17.8h\n"
+ "tbz %x[n_channels], #1, 58f\n"
+ "st1 { v31.s }[0], [x13], #0x4\n"
+ "st1 { v30.s }[0], [x12], #0x4\n"
+ "st1 { v29.s }[0], [x10], #0x4\n"
+ "st1 { v28.s }[0], [x9], #0x4\n"
+ "tbz %x[n_channels], #0, 59f\n"
+ "st1 { v31.h }[2], [x13], #0x2\n"
+ "st1 { v30.h }[2], [x12], #0x2\n"
+ "st1 { v29.h }[2], [x10], #0x2\n"
+ "st1 { v28.h }[2], [x9], #0x2\n"
+ "b 59f\n"
+ "58:" // Oddments: Store: Bit 1: Unset
+ "st1 { v31.h }[0], [x13], #0x2\n"
+ "st1 { v30.h }[0], [x12], #0x2\n"
+ "st1 { v29.h }[0], [x10], #0x2\n"
+ "st1 { v28.h }[0], [x9], #0x2\n"
+ "59:" // Oddments: Store: Bit 1: End
+
+ "60:" // End
+
+ :
+ : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v16", "v17", "v18", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
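
The indirect variant above performs the same 5x5 stride-1 computation but is fed a flat array of 36 input pointers (the 6x6 patch that a 2x2 output tile reads) instead of tile strides; the Args constructor permutes the leading pointers so that the inputs consumed by the earliest fmla chains are loaded first, and the remaining indices pass through in order. As a plain-C++ reference for what one channel of this kernel computes — a sketch only, assuming row-major patch pointers and a separate per-channel 5x5 weight array, whereas the generated code actually interleaves bias and weights per eight-channel block in params:

    // Reference for one channel c of the 2x2 output tile. Assumed layout:
    // input_ptrs[y * 6 + x] points at the channel vector for patch cell (y, x).
    void dw5x5_s1_out2x2_reference(const __fp16 *const *input_ptrs, // 36 pointers
                                   __fp16 *const *outptrs,          // 4 pointers
                                   __fp16 bias,
                                   const __fp16 (&w)[5][5],
                                   unsigned int c,
                                   __fp16 act_min, __fp16 act_max)
    {
        for (int oy = 0; oy < 2; oy++)
        {
            for (int ox = 0; ox < 2; ox++)
            {
                __fp16 acc = bias;   // v16 seeds each accumulator in the asm
                for (int ky = 0; ky < 5; ky++)
                    for (int kx = 0; kx < 5; kx++)
                        acc += w[ky][kx] * input_ptrs[(oy + ky) * 6 + (ox + kx)][c];
                acc = acc < act_min ? act_min : acc;   // fmax v, v, v18
                acc = acc > act_max ? act_max : acc;   // fmin v, v, v17
                outptrs[oy * 2 + ox][c] = acc;
            }
        }
    }
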
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
new file mode 100644
index 000000000..3468b70f2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
+struct a64_fp16_nhwc_generic_output9_mla_depthfirst
+{
+ typedef __fp16 bias_type;
+ typedef __fp16 input_type;
+ typedef __fp16 weight_type;
+ typedef __fp16 return_type;
+
+ typedef void (*kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+
+ constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+
+ constexpr static unsigned int n_output_points = 9;
+
+ kern_type kernel = a64_fp16_nhwc_generic_output9_mla_depthfirst_impl;
+
+ a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
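
This header follows the strategy-struct convention used throughout the new arm_conv depthwise stack: the struct exposes only type aliases, a kern_type function pointer primed with the implementation, and compile-time facts (vl_type, n_output_points) that the kernel-selection layer inspects. A hedged usage sketch of the template machinery that might invoke such a strategy — the names here are invented for illustration and are not the library's actual dispatch code:

    // Generic driver over any nine-output-point depthwise strategy.
    template <typename Strategy>
    void run_output9_strategy(const typename Strategy::input_type *const *inptrs,
                              typename Strategy::return_type *const *outptrs,
                              const void *params, const void *bias,
                              unsigned int n_points, unsigned int n_channels,
                              typename Strategy::input_type act_min,
                              typename Strategy::input_type act_max,
                              const CPUInfo *ci)
    {
        static_assert(Strategy::n_output_points == 9,
                      "driver assumes nine output points per call");
        Strategy strat(ci);  // construction may pick a CPU-specific kernel
        strat.kernel(inptrs, outptrs, params, bias,
                     n_points, n_channels, act_min, act_max);
    }

Instantiated as run_output9_strategy<a64_fp16_nhwc_generic_output9_mla_depthfirst>(...), the call resolves to the assembly implementation declared above.
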
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
new file mode 100644
index 000000000..8ac79f82f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
+ const __fp16 *const *const inptrs,
+ __fp16 *const *const outptrs,
+ const void *params,
+ const void *bias,
+ const unsigned int n_points,
+ const unsigned int n_channels,
+ const __fp16 activation_min,
+ const __fp16 activation_max
+)
+{
+ const __fp16 minmax_vals[2] = { activation_min, activation_max };
+
+ __asm__ __volatile__(
+ "ld1r { v4.8h }, [%x[minmax_vals]]\n"
+ "add x19, %x[minmax_vals], #0x2\n"
+ "mov x11, #0x0\n"
+ "ld1r { v3.8h }, [x19]\n"
+ "lsr x10, %x[n_channels], #0x3\n"
+ "cbz x10, 5f\n"
+ "1:" // Channel loop
+ "movi v25.16b, #0x0\n"
+ "cbz %x[bias], 2f\n"
+ "ldr q25, [%x[bias], x11]\n"
+ "2:" // Channel loop: Load bias: Done
+ "mov v24.16b, v25.16b\n"
+ "ldr q23, [%x[params], #0x0]\n"
+ "mov x20, %x[inptrs]\n"
+ "mov v22.16b, v25.16b\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "subs x19, %x[n_points], #0x1\n"
+ "mov v21.16b, v25.16b\n"
+ "ldr q2, [x9, x11]\n"
+ "mov v20.16b, v25.16b\n"
+ "add %x[params], %x[params], #0x10\n"
+ "mov v19.16b, v25.16b\n"
+ "ldr q1, [x28, x11]\n"
+ "mov v18.16b, v25.16b\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "mov v17.16b, v25.16b\n"
+ "ldr q0, [x27, x11]\n"
+ "mov v16.16b, v25.16b\n"
+ "ldr q31, [x26, x11]\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "ldr q30, [x25, x11]\n"
+ "ldr q29, [x24, x11]\n"
+ "ldp x23, x22, [x20], #0x10\n"
+ "ldr q28, [x23, x11]\n"
+ "ldr q27, [x22, x11]\n"
+ "ldr x21, [x20], #0x8\n"
+ "ldr q26, [x21, x11]\n"
+ "ble 4f\n"
+ "3:" // Channel loop: Planar loop
+ "fmla v25.8h, v2.8h, v23.8h\n"
+ "ldp x9, x28, [x20], #0x10\n"
+ "subs x19, x19, #0x1\n"
+ "fmla v24.8h, v1.8h, v23.8h\n"
+ "ldr q2, [x9, x11]\n"
+ "fmla v22.8h, v0.8h, v23.8h\n"
+ "fmla v21.8h, v31.8h, v23.8h\n"
+ "ldr q1, [x28, x11]\n"
+ "fmla v20.8h, v30.8h, v23.8h\n"
+ "ldp x27, x26, [x20], #0x10\n"
+ "fmla v19.8h, v29.8h, v23.8h\n"
+ "fmla v18.8h, v28.8h, v23.8h\n"
+ "ldr q0, [x27, x11]\n"
+ "fmla v17.8h, v27.8h, v23.8h\n"
+ "fmla v16.8h, v26.8h, v23.8h\n"
+ "ldr q23, [%x[params], #0x0]\n"
+ "add %x[params], %x[params], #0x10\n"
+ "ldr q31, [x26, x11]\n"
+ "ldp x25, x24, [x20], #0x10\n"
+ "ldr q30, [x25, x11]\n"
+ "ldr q29, [x24, x11]\n"
+ "ldp x23, x22, [x20], #0x10\