author     ramelg01 <ramy.elgammal@arm.com>          2022-04-07 02:42:52 +0100
committer  Ramy Elgammal <ramy.elgammal@arm.com>     2022-04-26 15:51:22 +0000
commit     8a164884dddf769643cf3b9f7f94e43cb4f3c20b (patch)
tree       35958dd48b6df1a851c880dad2b2ce285671b611
parent     c827e99fc46521f43719b0c2d1b6f05d66abf68c (diff)
download   ComputeLibrary-8a164884dddf769643cf3b9f7f94e43cb4f3c20b.tar.gz
Update Neon™ depthwise kernel
- Reduce duplication and simplify overall structure.
- Improve multi-threaded performance by sharing more data in lower-level caches.

Partially Resolves: COMPMID-5054

Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: Iac747f39b21c540122fa75218762631c4d787911
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7449
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Andrew Mundy
Reviewed-by: Sheri Zhang <sheri.zhang@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
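The cache-sharing improvement comes from how the new DepthfirstDriver::execute_internal (in depthfirst_driver.hpp below) partitions work: rows of output tiles are striped round-robin over threads, where the removed code in depthwise_depthfirst.hpp gave each thread one contiguous block of output rows. A minimal sketch of the two schemes, with hypothetical helper names that are not ComputeLibrary code:

    #include <algorithm>
    #include <cstdio>

    // Old scheme: each thread owns one contiguous block of output rows, so
    // threads touch widely separated regions of the input tensor.
    void iterate_blocked(unsigned int thread_id, unsigned int n_threads,
                         unsigned int output_height, unsigned int tile_rows)
    {
      const unsigned int per_thread = (output_height + n_threads - 1) / n_threads;
      const unsigned int start = std::min(thread_id * per_thread, output_height);
      const unsigned int end   = std::min(start + per_thread, output_height);
      for (unsigned int i = start; i < end; i += tile_rows)
        std::printf("thread %u: tile row at output row %u\n", thread_id, i);
    }

    // New scheme: tile rows are striped round-robin over threads, so at any
    // moment the threads work on neighbouring rows and the overlapping input
    // rows they read can be shared in lower-level caches.
    void iterate_striped(unsigned int thread_id, unsigned int n_threads,
                         unsigned int output_height, unsigned int tile_rows)
    {
      for (unsigned int i = thread_id * tile_rows; i < output_height;
           i += n_threads * tile_rows)
        std::printf("thread %u: tile row at output row %u\n", thread_id, i);
    }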
-rw-r--r--Android.bp6
-rw-r--r--SConscript1
-rw-r--r--filelist.json38
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp281
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp736
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp520
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp954
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp47
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp81
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp13
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp23
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp409
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp73
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp60
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp95
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp115
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp45
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp150
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp80
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp161
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp53
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp86
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp5
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp24
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp32
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp72
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp24
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp33
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp33
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp32
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1386
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1740
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp3496
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp23
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp33
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp32
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp32
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1386
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1740
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp3496
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp23
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp32
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp32
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp32
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp63
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1164
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp63
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp1395
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp63
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp2185
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp1412
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp2000
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp31
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp3266
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp23
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp32
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp76
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp20
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp29
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp29
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp32
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp34
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp34
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp34
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp29
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp29
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp51
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp14
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp30
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp34
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp34
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp29
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp29
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp34
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp34
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp34
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp6
-rw-r--r--src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp431
-rw-r--r--src/core/NEON/kernels/assembly/depthwise.hpp42
125 files changed, 18989 insertions, 13602 deletions
diff --git a/Android.bp b/Android.bp
index 17c56f3f88..c072c0e371 100644
--- a/Android.bp
+++ b/Android.bp
@@ -301,9 +301,12 @@ cc_library_static {
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp",
"src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp",
"src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp",
@@ -962,6 +965,9 @@ cc_library_static {
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
diff --git a/SConscript b/SConscript
index f49263122d..dac1c4ecca 100644
--- a/SConscript
+++ b/SConscript
@@ -555,6 +555,7 @@ if env['neon']:
"src/core/NEON/kernels/convolution/winograd/",
"src/core/NEON/kernels/arm_conv/depthwise/",
"src/core/NEON/kernels/arm_conv/pooling/",
+ "src/core/NEON/kernels/arm_conv/",
"src/core/NEON/kernels/assembly/",
"arm_compute/core/NEON/kernels/assembly/",
"src/cpu/kernels/assembly/"])
diff --git a/filelist.json b/filelist.json
index 84884e132a..93dfdfff6e 100644
--- a/filelist.json
+++ b/filelist.json
@@ -665,7 +665,7 @@
"Reduction": {
"deps": [ "Reshape" ],
"files": {
- "common": [
+ "common": [
"src/core/CL/kernels/CLReductionOperationKernel.cpp",
"src/runtime/CL/functions/CLReductionOperation.cpp"
]
@@ -1154,7 +1154,7 @@
"src/core/NEON/kernels/convolution/common/qsymm8.cpp",
"src/core/NEON/kernels/convolution/common/utils.cpp",
"src/core/NEON/kernels/arm_conv/addressing.cpp",
- "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp",
@@ -1171,6 +1171,7 @@
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
@@ -1208,13 +1209,18 @@
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp",
+ "src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp",
"src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp"
- ],
+ ],
"fp16":["src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp"],
- "fp32":["src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp"],
- "qasymm8":["src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp"],
+ "fp32":["src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp"],
+ "qasymm8":["src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp"],
"qasymm8_signed":["src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp"]
},
"sve": {
@@ -1315,7 +1321,7 @@
"fp32": ["src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp"],
"fp16": ["src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp"]
- },
+ },
"sve2":{
"qasymm8": ["src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp"],
"qasymm8_signed": ["src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp"]
@@ -1519,7 +1525,7 @@
"src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp",
"src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp",
"src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp"
],
@@ -1603,7 +1609,7 @@
"common": [
"src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp",
"src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp"
- ],
+ ],
"neon":{
"common":["src/cpu/kernels/instancenorm/generic/neon/impl.cpp"],
"fp16":["src/cpu/kernels/instancenorm/generic/neon/fp16.cpp"],
@@ -1663,7 +1669,7 @@
"files": {
"common": [
"src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp",
- "src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp",
+ "src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp",
"src/cpu/operators/CpuMaxUnpooling.cpp"
],
"neon":{
@@ -1764,12 +1770,12 @@
"src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp",
"src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp",
- "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp"
+ "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp"
],
"nchw": [ "src/cpu/kernels/pool2d/neon/nchw/all.cpp" ],
"fp16": [ "src/cpu/kernels/pool2d/neon/fp16.cpp" ],
- "fp32": [ "src/cpu/kernels/pool2d/neon/fp32.cpp" ],
- "qasymm8":[ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ],
+ "fp32": [ "src/cpu/kernels/pool2d/neon/fp32.cpp" ],
+ "qasymm8":[ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ],
"qasymm8_signed":["src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp"]
},
"sve": {
@@ -1969,8 +1975,8 @@
"neon":{
"common":["src/cpu/kernels/softmax/generic/neon/impl.cpp"],
"fp32": ["src/cpu/kernels/softmax/generic/neon/fp32.cpp"],
- "fp16": ["src/cpu/kernels/softmax/generic/neon/fp16.cpp"],
- "qasymm8":[ "src/cpu/kernels/softmax/generic/neon/qasymm8.cpp"],
+ "fp16": ["src/cpu/kernels/softmax/generic/neon/fp16.cpp"],
+ "qasymm8":[ "src/cpu/kernels/softmax/generic/neon/qasymm8.cpp"],
"qasymm8_signed":["src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp"]
},
"sve": {
@@ -1982,7 +1988,7 @@
},
"sve2":{
"common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"],
- "qasymm8":[ "src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp"],
+ "qasymm8":[ "src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp"],
"qasymm8_signed":["src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"]
}
}
@@ -2074,4 +2080,4 @@
"src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp"
]
}
-}
\ No newline at end of file
+}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
new file mode 100644
index 0000000000..e02998f5a0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename T> struct DefaultTAccum { using Type = T; };
+template <> struct DefaultTAccum<int8_t> { using Type = int32_t; };
+template <> struct DefaultTAccum<uint8_t> { using Type = int32_t; };
+
+template <typename T> struct DefaultOutputStage { using Type = Nothing; };
+template <> struct DefaultOutputStage<int8_t> { using Type = arm_gemm::Requantize32; };
+template <> struct DefaultOutputStage<uint8_t> { using Type = arm_gemm::Requantize32; };
+
+class IDepthfirstStrategy
+{
+ public:
+ virtual ~IDepthfirstStrategy() = default;
+
+ virtual unsigned int get_input_rows() const = 0;
+ virtual unsigned int get_input_cols() const = 0;
+
+ virtual unsigned int get_output_rows() const = 0;
+ virtual unsigned int get_output_cols() const = 0;
+};
+
+
+template <typename T>
+struct TensorSpec
+{
+ T base;
+ size_t ld_row, ld_col;
+
+ TensorSpec(T ptr, size_t ld_row, size_t ld_col)
+ : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
+};
+
+
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
+{
+ protected:
+ using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
+
+ // The strategy which we're applying to solve the depthwise convolution.
+ std::unique_ptr<const IDepthfirstStrategy> m_strat;
+
+ /* Compute the amount of working space required for a single thread. */
+ virtual size_t get_working_size_per_thread(unsigned int n_input_channels) const = 0;
+
+ /* Initialise the working space for a thread. */
+ virtual void initialise_working_space(void *, unsigned int n_input_channels) const = 0;
+
+ /* Compute a portion of the output tensor with padding. */
+ virtual void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const = 0;
+
+ /* Compute a portion of the work with only top/bottom padding.
+ *
+ * The default implementation of this repeatedly calls into the padded tile
+ * variant.
+ */
+ virtual void compute_row_padded_tile_row(
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int output_channel_start, const unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const
+ {
+ for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
+ {
+ this->compute_tile_padded(
+ output_i, output_j, output_channel_start, output_channel_end,
+ input, output, parameters, working_space
+ );
+ }
+ }
+
+ /* Compute a portion of the output tensor with no padding.
+ *
+ * The default implementation of this repeatedly calls into the padded
+ * variant.
+ */
+ virtual void compute_tiles_unpadded(
+ unsigned int start_output_i, unsigned int start_output_j,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const
+ {
+ for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
+ {
+ unsigned int row_start_output_j = start_output_j;
+ for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
+ {
+ this->compute_tile_padded(
+ start_output_i, row_start_output_j,
+ output_channel_start, output_channel_end,
+ input, output, parameters, working_space
+ );
+ row_start_output_j += m_strat->get_output_cols();
+ }
+ start_output_i += m_strat->get_output_rows();
+ }
+ }
+
+ void execute_internal(
+ unsigned int n_batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int n_input_channels,
+ const PaddingValues &padding,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads
+ ) const override
+ {
+ // Get and initialise the working space for this thread.
+ void *thread_working_space =
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(n_input_channels);
+ this->initialise_working_space(thread_working_space, n_input_channels);
+
+ // Construct convenient representations of the input/output tensors.
+ TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
+ TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);
+
+ const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier;
+
+ for (unsigned int batch = 0; batch < n_batches; batch++)
+ {
+ // Iterate over rows of the output tensor; we stripe over the tiles.
+ for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
+ start_output_i < output_height;
+ start_output_i += n_threads * m_strat->get_output_rows())
+ {
+ // Determine what padding (if any) is required on the top/bottom of
+ // this row of the convolution.
+ const auto end_output_i = start_output_i + m_strat->get_output_rows();
+ const bool pad_output_bottom = output_height < end_output_i;
+
+ const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top;
+ const bool pad_input_top = start_input_i < 0;
+ const int end_input_i = start_input_i + m_strat->get_input_rows();
+ const bool pad_input_bottom = static_cast<int>(input_height) < end_input_i;
+ const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom;
+
+ // Iterate over the columns of the output tensor; we attempt to grab as
+ // much as possible of the unpadded regions, so the loop structure is a
+ // bit odd.
+ unsigned int start_output_j = 0;
+ while (start_output_j < output_width)
+ {
+ const int start_in_j = start_output_j * this->m_args.stride_cols - padding.left;
+ const bool pad_input_left = start_in_j < 0;
+
+ // Determine if we can process a number of unpadded tiles in one go.
+ int n_unpadded_tiles = 0;
+ if (!pad_input_left)
+ {
+ // Determine the maximum number of tiles we could handle.
+ n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols();
+
+ // Handle padding on the right hand edge
+ const int tile_stride = m_strat->get_output_cols() * this->m_args.stride_cols;
+ int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
+ int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;
+
+ while (n_unpadded_tiles > 0 &&
+ (static_cast<int>(output_width) < end_output_j ||
+ static_cast<int>(input_width) < end_input_j))
+ {
+ n_unpadded_tiles--;
+ end_output_j -= m_strat->get_output_cols();
+ end_input_j -= tile_stride;
+ }
+ }
+
+ // Process unpadded tiles, if possible, otherwise process a padded tile.
+ if (n_unpadded_tiles)
+ {
+ if (!pad_row)
+ {
+ // Completely unpadded execution
+ this->compute_tiles_unpadded(
+ start_output_i, start_output_j,
+ 1, n_unpadded_tiles, // Compute a row of unpadded tiles
+ 0, n_output_channels, // Compute all channels
+ input_tensor, output_tensor, parameters, thread_working_space
+ );
+ }
+ else
+ {
+ // Top/bottom padding only
+ this->compute_row_padded_tile_row(
+ start_output_i, start_output_j, n_unpadded_tiles,
+ 0, n_output_channels, // Compute all channels
+ input_tensor, output_tensor, parameters, thread_working_space
+ );
+ }
+ start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
+ }
+ else
+ {
+ this->compute_tile_padded(
+ start_output_i, start_output_j,
+ 0, n_output_channels, // Compute all channels
+ input_tensor, output_tensor, parameters, thread_working_space
+ );
+ start_output_j += m_strat->get_output_cols();
+ }
+ }
+ }
+
+ // Progress the pointers for the next batch.
+ input_tensor.base += ld_input_batch;
+ output_tensor.base += ld_output_batch;
+ }
+ }
+
+ public:
+ DepthfirstDriver(IDepthfirstStrategy *strategy, const DepthwiseArgs &args)
+ : Parent(args), m_strat(strategy)
+ {
+ }
+
+ size_t get_working_size(unsigned int n_threads, unsigned int n_input_channels) const override final
+ {
+ return n_threads * this->get_working_size_per_thread(n_input_channels);
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
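The driver above queries the strategy only for tile geometry; with stride 1, an R x C output tile consumes an (R + kernel_rows - 1) x (C + kernel_cols - 1) input patch. A toy sketch of that contract (hypothetical class for illustration only; real strategies also supply kernels and parameter packing, and instantiating DepthfirstDriver additionally requires its pure virtuals such as compute_tile_padded):

    // Toy 3x3, stride-1 strategy producing 2x2 output tiles: each tile
    // needs a (2+3-1) x (2+3-1) = 4x4 input patch.
    class ToyStrategy : public arm_conv::depthwise::IDepthfirstStrategy
    {
      public:
      unsigned int get_input_rows() const override { return 4; }
      unsigned int get_input_cols() const override { return 4; }
      unsigned int get_output_rows() const override { return 2; }
      unsigned int get_output_cols() const override { return 2; }
    };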
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
index 57fa11151b..6905076357 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,9 @@
#pragma once
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "src/core/NEON/kernels/arm_conv/addressing.hpp"
+#include "depthwise_strategies_common.hpp"
+#include "working_space.hpp"
#ifdef CYCLE_PROFILING
#include "profiler.hpp"
@@ -35,349 +37,547 @@
namespace arm_conv {
namespace depthwise {
-struct IDepthwiseDepthfirstStrategy
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
+ typename OutputStage>
+class DepthwiseDepthfirstStrategyCommon
+ : public DepthfirstStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>
{
- virtual arm_gemm::VLType get_vl_type() const = 0;
+ protected:
+ unsigned int m_output_rows, m_output_cols;
+ unsigned int m_kernel_rows, m_kernel_cols;
+ unsigned int m_stride_rows, m_stride_cols;
- virtual unsigned int get_input_rows() const = 0;
- virtual unsigned int get_input_cols() const = 0;
+ public:
+ DepthwiseDepthfirstStrategyCommon(
+ unsigned int output_rows, unsigned int output_cols,
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows=1, unsigned int stride_cols=1
+ ) : m_output_rows(output_rows), m_output_cols(output_cols),
+ m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
+ m_stride_rows(stride_rows), m_stride_cols(stride_cols)
+ {
+ }
+
+ DepthwiseDepthfirstStrategyCommon(unsigned int output_size, unsigned int kernel_size, unsigned int stride=1)
+ : DepthwiseDepthfirstStrategyCommon(output_size, output_size, kernel_size, kernel_size, stride, stride)
+ {
+ }
+
+ virtual ~DepthwiseDepthfirstStrategyCommon() {}
+
+ unsigned int get_output_rows() const override { return m_output_rows; }
+ unsigned int get_output_cols() const override { return m_output_cols; }
- virtual unsigned int get_output_rows() const = 0;
- virtual unsigned int get_output_cols() const = 0;
+ unsigned int get_kernel_rows() const override { return m_kernel_rows; }
+ unsigned int get_kernel_cols() const override { return m_kernel_cols; }
- virtual unsigned int get_kernel_rows() const = 0;
- virtual unsigned int get_kernel_cols() const = 0;
+ unsigned int get_stride_rows() const override { return m_stride_rows; }
+ unsigned int get_stride_cols() const override { return m_stride_cols; }
+};
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
- virtual unsigned int get_stride_rows() const = 0;
- virtual unsigned int get_stride_cols() const = 0;
+ public:
+ using Parent::Parent;
- virtual void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const output_ptrs,
+ typedef void (*IndirectKernelType)(
+ const TInput *const *input_ptrs,
+ TOutput *const *output_ptrs,
const void *params,
unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const = 0;
+ const TAccum activation_min,
+ const TAccum activation_max
+ );
+ virtual IndirectKernelType get_indirect_kernel(void) const = 0;
- virtual void direct_kernel(
+ typedef void (*DirectKernelType)(
const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
+ const TInput *inptr_base, int64_t ld_input_row, int64_t ld_input_col,
+ TOutput *outptr_base, int64_t ld_output_row, int64_t ld_output_col,
const void *params, unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const = 0;
-
- virtual ~IDepthwiseDepthfirstStrategy() {}
+ const TAccum activation_min,
+ const TAccum activation_max
+ );
+ virtual DirectKernelType get_direct_kernel(void) const = 0;
};
-template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
-class DepthwiseDepthfirst : public DepthwiseCommon<TInput, TWeight, TOutput>
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthwiseDepthfirstStrategy<TInput, TWeight, TOutput, int32_t>
+: public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
{
- const std::unique_ptr<IDepthwiseDepthfirstStrategy> m_strat;
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
- size_t sizeof_inptr_array(void) const
+ protected:
+ interleaves::PackingArguments get_packing_args(void) const
{
- return sizeof(TInput *) * m_strat->get_input_rows() * m_strat->get_input_cols();
+ return interleaves::PackingArguments(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ false, sizeof(int32_t), // Don't pack the bias
+ this->get_vl_type(), sizeof(int32_t), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
}
- size_t sizeof_input_buffer(unsigned int n_input_channels) const
+ public:
+ using Parent::Parent;
+
+ typedef void (*KernelType)(
+ unsigned int, // n_channels,
+ const TInput *const *, // inptrs
+ const TWeight *, // weights
+ const int32_t *, // bias,
+ const arm_gemm::Requantize32 &,
+ const int32_t *, const int32_t *, // requant_muls and requant_shifts
+ TOutput *const * // outptrs
+ );
+ virtual KernelType get_kernel() const = 0;
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
{
- return sizeof(TInput) * n_input_channels;
+ return interleaves::get_storage_size_generic(get_packing_args(), args);
}
- size_t sizeof_outptr_array(void) const
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const arm_gemm::Requantize32 &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
{
- return sizeof(TInput *) * m_strat->get_output_rows() * m_strat->get_output_cols();
+ interleaves::pack_parameters_generic(
+ get_packing_args(), args, buffer, biases, weights, ld_weight_col, ld_weight_row);
}
+};
- size_t sizeof_output_buffer(unsigned int n_output_channels) const
- {
- return sizeof(TOutput) * n_output_channels;
- }
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+class DepthwiseDepthfirstCommon : public DepthfirstDriver<TInput, TWeight, TOutput>
+{
+ using StratType = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ OutputStage m_os;
+
+ protected:
+ inline OutputStage &get_output_stage(void) { return m_os; }
+ inline const OutputStage &get_output_stage(void) const { return m_os; }
public:
- DepthwiseDepthfirst(
- IDepthwiseDepthfirstStrategy *const strat,
- const DepthwiseArgs &args
- ) : DepthwiseCommon<TInput, TWeight, TOutput>(args), m_strat(strat)
+ DepthwiseDepthfirstCommon(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os)
+ : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
{
}
- DepthwiseDepthfirst(DepthwiseDepthfirst &) = delete;
- DepthwiseDepthfirst &operator=(DepthwiseDepthfirst &) = delete;
+ DepthwiseDepthfirstCommon(DepthwiseDepthfirstCommon &) = delete;
+ DepthwiseDepthfirstCommon &operator=(DepthwiseDepthfirstCommon &) = delete;
size_t get_storage_size(void) const override
{
- // TODO What if we insert extra padding? Biases are a different size to the inputs, ...
- const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(m_strat->get_vl_type());
- const auto rounded_channels = arm_gemm::roundup(this->m_args.input_channels, vl);
- return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight);
+ return reinterpret_cast<const StratType *>(this->m_strat.get())->
+ get_storage_size(this->m_args);
}
- void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
{
- // TODO What if the kernel needs a different packing function?
+ reinterpret_cast<const StratType *>(this->m_strat.get())->
+ pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
+ }
+};
- // Cast the pointers
- uint8_t *buffer = static_cast<uint8_t *>(_buffer);
- const TAccum *biases = static_cast<const TAccum *>(_biases);
- const TWeight *const weights = static_cast<const TWeight *>(_weights);
+namespace depthwise_depthfirst {
- const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(m_strat->get_vl_type());
- ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels : ld_weight_col;
- ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+/* Workspace Element for an array of input pointers as consumed by the
+ * specialised depthwise kernels.
+ */
+template <typename T>
+class InputArrayElement
+{
+ public:
+ struct Workspace
+ {
+ const T **inptr_array;
+ };
- for (unsigned int n = 0; n < this->m_args.input_channels; n += vl)
- {
- const unsigned int todo = std::min(vl, this->m_args.input_channels - n);
+ template <class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof(T **) * args.strategy->get_input_rows() * args.strategy->get_input_cols();
+ }
- // Copy across the correct amount of bias (or 0)
- for (unsigned int i = 0; i < todo; i++)
- {
- reinterpret_cast<TAccum *>(buffer)[i] = (biases == nullptr) ? 0 : biases[n + i];
- }
- buffer += vl * sizeof(TAccum);
+ template <class WorkspaceType, class OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ ws->inptr_array = reinterpret_cast<const T**>(buffer);
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
- // Copy each of the weights in turn
- auto weights_row = weights + n;
- for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
- {
- auto weights_col = weights_row;
+template <typename TAccum, typename OutputStage, bool IsDot=false>
+struct WorkspaceFinalElement
+{
+ using Element = ActivationsElement<TAccum, OutputStage>;
+};
- for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
- {
- for (unsigned int m = 0; m < todo; m++)
- {
- reinterpret_cast<TWeight *>(buffer)[m] = weights_col[m];
- }
- buffer += vl * sizeof(TWeight);
+template <>
+struct WorkspaceFinalElement<int32_t, arm_gemm::Requantize32, false>
+{
+ using Element = RequantizationParametersElement;
+};
- weights_col += ld_weight_col;
- }
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+struct Invoke
+{
+ constexpr static bool supports_direct_kernel = true;
- weights_row += ld_weight_row;
- }
- }
+ template <typename Strat, typename Workspace>
+ static inline void indirect(const Strat *strat, const Workspace *ws, const OutputStage &, const void *params, const TAccum *, unsigned int n_channels)
+ {
+ strat->get_indirect_kernel()(
+ ws->inptr_array,
+ ws->outptr_array,
+ params, n_channels,
+ ws->activation_min, ws->activation_max
+ );
}
- size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+ template <typename Strat, typename Workspace>
+ static void direct(
+ const Strat *strat, const Workspace *ws, const OutputStage &,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col,
+ TOutput *outptr, size_t ld_out_row, size_t ld_out_col,
+ const void *params, unsigned int n_channels
+ )
{
- const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
- return n_threads * (sizeof_inptr_array() + sizeof_outptr_array() +
- sizeof_output_buffer(n_output_channels) +
- sizeof_input_buffer(n_channels));
+ strat->get_direct_kernel()(
+ n_tile_rows, n_tile_cols,
+ inptr, ld_in_row, ld_in_col,
+ outptr, ld_out_row, ld_out_col,
+ params, n_channels, ws->activation_min, ws->activation_max
+ );
}
+};
- using DepthwiseCommon<TInput, TWeight, TOutput>::execute;
- void execute(
- const unsigned int batches,
- const unsigned int input_height,
- const unsigned int input_width,
- const unsigned int input_channels,
- const PaddingValues &padding,
- const void *const _input,
- const size_t ld_input_col,
- const size_t ld_input_row,
- const size_t ld_input_batch,
- const void *const parameters,
- const unsigned int output_height,
- const unsigned int output_width,
- void *const _output,
- const size_t ld_output_col,
- const size_t ld_output_row,
- const size_t ld_output_batch,
- void *const _working_space,
- const unsigned int thread_id,
- const unsigned int n_threads
- ) const override
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+struct Invoke<TInput, TWeight, TOutput, TAccum, arm_gemm::Requantize32>
+{
+ constexpr static bool supports_direct_kernel = false;
+
+ template <typename Strat, typename Workspace>
+ static inline void indirect(const Strat *strat, const Workspace *ws, const arm_gemm::Requantize32 &qp, const void *params, const TAccum *, unsigned int n_channels)
{
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif
+ strat->get_kernel()(
+ n_channels, ws->inptr_array,
+ reinterpret_cast<const TWeight *>(params), ws->bias,
+ qp, ws->requant_muls, ws->requant_shifts,
+ ws->outptr_array
+ );
+ }
+
+ template <typename Strat, typename Workspace>
+ static inline void direct(
+ const Strat *, const Workspace *, const arm_gemm::Requantize32 &,
+ unsigned int, unsigned int, // n_tile_rows, n_tile_cols
+ const TInput *, size_t, size_t, // Input pointer, row stride, column stride
+ TOutput *, size_t, size_t, // Output pointer, row stride, column stride
+ const void *, unsigned int // Parameters, number of channels
+ )
+ {
+ // Do nothing - this should never be reached because entry to it is guarded
+ // by an `if` on a `constexpr static bool`.
+ }
+};
- // Compute activation values
- TAccum activation_min, activation_max;
- std::tie(activation_min, activation_max) = get_default_activation_values<TAccum>();
+namespace
+{
- switch (this->m_args.activation.type)
- {
- case arm_gemm::Activation::Type::BoundedReLU:
- activation_max = static_cast<TAccum>(this->m_args.activation.param1);
- // Fall through
- case arm_gemm::Activation::Type::ReLU:
- activation_min = static_cast<TAccum>(0);
- break;
- default:
- break;
- }
+template <typename OutputStage>
+inline void stash_bias(OutputStage &, const void *) {}
- // Determine what portion of the work to do.
- const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
- const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
- const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
+template <>
+inline void stash_bias(arm_gemm::Requantize32 &qp, const void *bias) __attribute__ ((unused));
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input);
- TOutput *const outptr = static_cast<TOutput *>(_output);
+template <>
+inline void stash_bias(arm_gemm::Requantize32 &qp, const void *bias)
+{
+ qp.bias = reinterpret_cast<const int32_t *>(bias);
+}
- // Allocate portions of the working space
- uint8_t *working_space = static_cast<uint8_t *>(_working_space) + get_working_size(thread_id, input_channels);
+}
- const void **const inptr_array = reinterpret_cast<const void **>(working_space);
- working_space += sizeof_inptr_array();
+} // namespace depthwise_depthfirst
- void **const outptr_array = reinterpret_cast<void **>(working_space);
- working_space += sizeof_outptr_array();
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirst
+: public DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using StratType = DepthwiseDepthfirstStrategy<TInput, TWeight, TOutput, TAccum>;
+ using Parent = DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ using WorkspaceManager = Workspace<
+ OutputArrayElement<TOutput>,
+ depthwise_depthfirst::InputArrayElement<TInput>,
+ InputBufferElement<TInput>,
+ typename depthwise_depthfirst::WorkspaceFinalElement<TAccum, OutputStage>::Element
+ >;
+ using WorkingSpace = typename WorkspaceManager::WorkspaceType;
+
+ // We keep a copy of the bias and output stage
+ const TAccum *m_bias;
- TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
- working_space += sizeof_output_buffer(input_channels * this->m_args.channel_multiplier);
+ public:
+ DepthwiseDepthfirst(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : Parent(strat, args, os), m_bias(nullptr)
+ {
+ }
- TInput *const input_buffer = reinterpret_cast<TInput *>(working_space);
+ DepthwiseDepthfirst(DepthwiseDepthfirst &) = delete;
+ DepthwiseDepthfirst &operator=(DepthwiseDepthfirst &) = delete;
- // Initialise the input buffer
- for (unsigned int c = 0; c < input_channels; c++)
- {
- input_buffer[c] = static_cast<TInput>(0);
- }
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ reinterpret_cast<const StratType *>(this->m_strat.get())->pack_parameters(
+ this->m_args, buffer, biases, this->get_output_stage(),
+ weights, ld_weight_col, ld_weight_row
+ );
+ m_bias = reinterpret_cast<const TAccum *>(biases);
+ depthwise_depthfirst::stash_bias(this->get_output_stage(), biases);
+ }
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
- {
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- const auto outptr_batch = outptr + batch * ld_output_batch;
+ size_t get_working_size_per_thread(const unsigned int n_input_channels) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ args.input_channels = n_input_channels;
+ return WorkspaceManager::get_sizeof_workspace(
+ WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage())
+ );
+ }
- for (int start_out_i = start_out_height;
- start_out_i < end_out_height;
- start_out_i += static_cast<int>(m_strat->get_output_rows()))
- {
- const int end_out_i = start_out_i + m_strat->get_output_rows();
- const int start_in_i = start_out_i * m_strat->get_stride_rows() - padding.top;
- const int end_in_i = start_in_i + m_strat->get_input_rows();
-
- // Compute top/bottom padding
- const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
- const unsigned int valid_output_rows = std::min(
- end_out_i - start_out_i,
- static_cast<int>(output_height) - start_out_i
- );
+ void initialise_working_space(void *buffer, unsigned int n_input_channels) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ args.input_channels = n_input_channels;
+ WorkspaceManager::initialise(
+ buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage())
+ );
+ }
- // Fill the input pointer array with padding values
- for (auto index = 0u; index < m_strat->get_input_rows() * m_strat->get_input_cols(); index++)
- {
- inptr_array[index] = input_buffer;
- }
+ protected:
+ void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ // Get the working space
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+
+ // Compute the input pointer array
+ const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier;
+
+ const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ fill_pointer_array<const TInput>(
+ ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+ input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
+ input.ld_row, input.ld_col,
+ ws->input_buffer,
+ input_pad_top, this->m_args.input_rows - input_i,
+ input_pad_left, this->m_args.input_cols - input_j
+ );
+
+ // Compute the output pointer array
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
+ 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Execute the kernel
+ depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>::indirect(
+ reinterpret_cast<const StratType *>(this->m_strat.get()),
+ ws, this->get_output_stage(), parameters, m_bias, output_channel_end - output_channel_start
+ );
+ }
- for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
- {
- const int start_in_j = start_out_j * m_strat->get_stride_cols() - this->m_args.padding.left;
- int pad_left = std::min(0, start_in_j);
+ void compute_row_padded_tile_row(
+ const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+ const unsigned int output_channel_start, const unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space
+ ) const override
+ {
+ using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space);
+ const auto strat = reinterpret_cast<const StratType *>(this->m_strat.get());
+ const auto os = this->get_output_stage();
+
+    // Compute the top padding and the numbers of valid input/output rows; hence fill in the initial pointer arrays.
+ const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier;
+ const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+ const auto input_j = output_j * this->m_args.stride_cols - this->m_args.padding.left;
+
+ const auto valid_input_rows = std::min(strat->get_input_rows(), this->m_args.input_rows - input_i);
+ const auto valid_output_rows = std::min(strat->get_output_rows(), this->m_args.output_rows - output_i);
+
+ const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.stride_cols;
+ const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
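+    // Stepping one tile to the right advances the output by get_output_cols()
+    // columns and the input by get_output_cols() * stride_cols columns; the
+    // point strides above express these steps in elements.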
+
+ fill_pointer_array<const TInput>(
+ ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+ input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
+ input.ld_row, input.ld_col,
+ ws->input_buffer,
+ input_pad_top, this->m_args.input_rows - input_i,
+ 0, this->m_args.input_cols - input_j // No left padding
+ );
+
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
+ 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ for (; n_tile_cols; n_tile_cols--)
+ {
+ // Execute the kernel
+ Invoker::indirect(
+ strat, ws, os, parameters, m_bias, output_channel_end - output_channel_start
+ );
- // Compute how many output tiles we can compute with the direct kernel.
- int n_direct_tiles = 0;
- if (!pad_top && !pad_bottom && !pad_left)
+ // Update all unpadded pointers
+ {
+ auto ptr = ws->inptr_array + strat->get_input_cols() * input_pad_top;
+ for (auto n = input_pad_top; n < valid_input_rows; n++)
+ {
+ for (auto m = 0u; m < strat->get_input_cols(); m++)
{
- // Determine the maximum number of tiles we could handle.
- n_direct_tiles = (output_width - start_out_j) / m_strat->get_output_cols();
-
- // Continue to reduce this number as required to avoid reading
- // padding on the right edge.
- int end_in_j = start_in_j + n_direct_tiles * m_strat->get_input_cols();
- int pad_right = std::max(0, end_in_j - static_cast<int>(input_width));
-
- while (pad_right && n_direct_tiles)
- {
- n_direct_tiles--;
- end_in_j -= m_strat->get_input_cols();
- pad_right = std::max(0, end_in_j - static_cast<int>(input_width));
- }
+ *(ptr++) += input_point_stride;
}
+ }
+ }
+ {
+ auto ptr = ws->outptr_array;
+ for (auto n = 0u; n < valid_output_rows * strat->get_output_cols(); n++)
+ {
+ *(ptr++) += output_point_stride;
+ }
+ }
+ }
+ }
- // Use the unpadded kernel if we can, otherwise use the padded one.
- if (n_direct_tiles)
- {
- auto inptr = inptr_batch + start_in_i*ld_input_row + start_in_j*ld_input_col;
- auto outptr = outptr_batch + start_out_i*ld_output_row + start_out_j*ld_output_col;
- start_out_j += n_direct_tiles*m_strat->get_output_cols();
+ void compute_tiles_unpadded(
+ unsigned int output_i, const unsigned int output_j,
+ unsigned int n_tile_rows, unsigned int n_tile_cols,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+ const auto strat = reinterpret_cast<const StratType *>(this->m_strat.get());
+ const auto os = this->get_output_stage();
-#ifdef CYCLE_PROFILING
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, 0);
-#endif
- m_strat->direct_kernel(1, n_direct_tiles,
- inptr, ld_input_row, ld_input_col,
- outptr, ld_output_row, ld_output_col,
- parameters, this->m_args.input_channels,
- &activation_min, &activation_max);
- continue;
- }
+ if (Invoker::supports_direct_kernel)
+ {
+ // If the direct kernel is supported, then use it.
+ // Compute the base pointers we'll use in the tile.
+ auto outptr = output.base + output_channel_start + output_i * output.ld_row + output_j * output.ld_col;
+ const int start_input_i = output_i * this->m_args.stride_rows - this->m_args.padding.top;
+ const int start_input_j = output_j * this->m_args.stride_cols - this->m_args.padding.left;
+ auto inptr = input.base + output_channel_start + start_input_i * input.ld_row + start_input_j * input.ld_col;
+
+ // Execute the kernel
+ Invoker::direct(
+ strat, ws, os,
+ n_tile_rows, n_tile_cols,
+ inptr, input.ld_row, input.ld_col,
+ outptr, output.ld_row, output.ld_col,
+ parameters, output_channel_end - output_channel_start
+ );
+ }
+ else
+ {
+ // Otherwise, we repeatedly call the padded kernel but use our knowledge
+ // of the tensor structure to avoid recomputing the pointer array.
+ const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier;
+
+ const auto n_input_pointers = this->m_strat->get_input_rows() * this->m_strat->get_input_cols();
+ const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.stride_cols;
+ const auto n_output_pointers = this->m_strat->get_output_rows() * this->m_strat->get_output_cols();
+ const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
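+      // e.g. (hypothetical values) with NHWC data and 16 channels
+      // (ld_col = 16), 4 output columns per tile and unit stride, each tile
+      // step advances every input and output pointer by 64 elements.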
+
+ // For each tile row, initialise the input and output pointer arrays. For
+ // each subsequent tile we simply update the pointers.
+ for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
+ {
+ const int input_i = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+ const int input_j = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
+
+ fill_pointer_array<const TInput>(
+ ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
+ input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start,
+ input.ld_row, input.ld_col,
+ ws->input_buffer,
+ 0, this->m_args.input_rows,
+ 0, this->m_args.input_cols
+ );
- const int end_out_j = start_out_j + m_strat->get_output_cols();
- const int end_in_j = start_in_j + m_strat->get_input_cols();
+ // Compute the output pointer array
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, this->m_args.output_rows,
+ 0, this->m_args.output_cols
+ );
- const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
- const unsigned int valid_output_cols = std::min(
- end_out_j - start_out_j,
- static_cast<int>(output_width) - start_out_j
+ for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
+ {
+ // Invoke the indirect kernel for this tile
+ depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>::indirect(
+ strat, ws, os, parameters, m_bias, output_channel_end - output_channel_start
);
- pad_left *= -1;
- // Construct the input pointer array - fill the array with pointers to
- // the input buffer and then fill in the required values.
- for (auto i = pad_top; i < m_strat->get_input_rows() - pad_bottom; i++)
- {
- // Can skip over the left padding because we will have either the
- // same or less than the previous tile.
- unsigned int j = pad_left;
- const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
- const void **ptrs = inptr_array + i * m_strat->get_input_cols() + j;
- for (; j < m_strat->get_input_cols() - pad_right; j++)
- {
- *(ptrs++) = colptr;
- colptr += ld_input_col;
- }
- for (; j < m_strat->get_input_cols(); j++)
- {
- *(ptrs++) = input_buffer;
- }
- }
- // Construct the output pointer array.
- void **outptr_pos = outptr_array;
- for (auto i = 0u; i < valid_output_rows; i++)
+ // Progress the pointers
+ for (auto i = 0u; i < n_input_pointers; i++)
{
- unsigned int j = 0u;
- TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
- for (; j < valid_output_cols; j++)
- {
- *(outptr_pos++) = colptr;
- colptr += ld_output_col;
- }
- for (; j < m_strat->get_output_cols(); j++)
- {
- *(outptr_pos++) = output_buffer;
- }
+ ws->inptr_array[i] += input_point_stride;
}
- for (auto i = valid_output_rows; i < m_strat->get_output_rows(); i++)
+ for (auto i = 0u; i < n_output_pointers; i++)
{
- for (auto j = 0u; j < m_strat->get_output_cols(); j++)
- {
- *(outptr_pos++) = output_buffer;
- }
+ ws->outptr_array[i] += output_point_stride;
}
-
- start_out_j += m_strat->get_output_cols();
-
-#ifdef CYCLE_PROFILING
- // TODO Work number
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(0));
-#endif
- m_strat->indirect_kernel(inptr_array, outptr_array, parameters,
- this->m_args.input_channels,
- &activation_min, &activation_max);
}
+
+ output_i += this->m_strat->get_output_rows();
}
}
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
index f04f7751db..9f53f7cc6f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,355 +24,277 @@
#pragma once
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
-
-#ifdef CYCLE_PROFILING
-#include "profiler.hpp"
-#endif
-
-#include <limits>
+#include "depthwise_depthfirst.hpp"
namespace arm_conv {
namespace depthwise {
-template <class Strategy, unsigned OutputRows, unsigned int OutputCols>
-class DepthwiseDepthfirstGenericBase :
- public DepthwiseCommon<typename Strategy::input_type,
- typename Strategy::weight_type,
- typename Strategy::return_type>
+template <typename TInput, typename TOutput, typename TAccum>
+struct GenericDepthfirstKernelStrategyFunctionType
{
- protected:
+  using KernelType = std::function<void(
+    const TInput *const *const,  // Input pointers
+    TOutput *const *const,       // Output pointers
+    const void *,                // Ravelled weight parameters
+    const void *,                // Bias
+    const unsigned int,          // Number of kernel points
+    const unsigned int,          // Number of output channels
+    const TAccum, const TAccum   // Activation minimum and maximum
+  )>;
+};
- using TInput = typename Strategy::input_type;
- using TWeight = typename Strategy::weight_type;
- using TOutput = typename Strategy::return_type;
- using TAccum = typename Strategy::bias_type;
+template <typename TInput, typename TOutput>
+struct GenericDepthfirstKernelStrategyFunctionType<TInput, TOutput, int32_t>
+{
+  using KernelType = std::function<void(
+    const TInput *const *const,      // Input pointers
+    TOutput *const *const,           // Output pointers
+    const void *,                    // Ravelled weight parameters
+    const arm_gemm::Requantize32 &,  // Requantization parameters
+    unsigned int, unsigned int       // Number of kernel points, number of output channels
+  )>;
+};
- size_t sizeof_input_ptr_array(void) const
- {
- return sizeof(TInput *) * this->m_args.kernel_rows * this->m_args.kernel_cols * Strategy::n_output_points;
- }
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+class GenericDepthfirstKernelStrategy
+{
+ unsigned int m_n_output_points;
+ arm_gemm::VLType m_vl_type;
+ unsigned int m_accumulator_depth_vl;
- size_t sizeof_input_buffer(unsigned int n_channels) const
+ public:
+ GenericDepthfirstKernelStrategy(unsigned int n_output_points, arm_gemm::VLType vl_type, unsigned int accumulator_depth_vl=1)
+ : m_n_output_points(n_output_points), m_vl_type(vl_type), m_accumulator_depth_vl(accumulator_depth_vl)
{
- const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(Strategy::vl_type);
- const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
- return sizeof(TInput) * rounded_channels;
}
- size_t sizeof_output_buffer(unsigned int n_channels) const
+ virtual ~GenericDepthfirstKernelStrategy() = default;
+
+ virtual arm_gemm::VLType get_vl_type() const { return m_vl_type; }
+ virtual unsigned int get_accumulator_depth_vl() const { return m_accumulator_depth_vl; }
+ virtual unsigned int get_n_output_points() const { return m_n_output_points; }
+
+ using KernelType = typename GenericDepthfirstKernelStrategyFunctionType<TInput, TOutput, TAccum>::KernelType;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class GenericDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ protected:
+ using KernelStrategyType = GenericDepthfirstKernelStrategy<TInput, TWeight, TOutput, TAccum>;
+ std::unique_ptr<KernelStrategyType> m_strategy;
+
+ public:
+ GenericDepthfirstStrategy(
+ KernelStrategyType *strat, unsigned int n_output_rows, unsigned int n_output_cols,
+ const DepthwiseArgs &args
+ )
+ : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
+ n_output_rows, n_output_cols,
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols
+ ),
+ m_strategy(strat)
{
- const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(Strategy::vl_type);
- const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
- return sizeof(TOutput) * rounded_channels;
}
- unsigned int input_rows(void) const
+ GenericDepthfirstStrategy(GenericDepthfirstStrategy &) = delete;
+  GenericDepthfirstStrategy &operator=(GenericDepthfirstStrategy &) = delete;
+
+ arm_gemm::VLType get_vl_type(void) const override { return m_strategy->get_vl_type(); }
+ unsigned int get_accumulator_depth_vl(void) const override { return m_strategy->get_accumulator_depth_vl(); }
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
{
- return this->m_args.kernel_rows + (OutputRows - 1)*this->m_args.stride_rows;
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ false, sizeof(TAccum), // Don't pack the bias
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ return interleaves::get_storage_size_generic(packing_args, args);
}
- unsigned int input_cols(void) const
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
{
- return this->m_args.kernel_cols + (OutputCols - 1)*this->m_args.stride_cols;
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ false, sizeof(TAccum), // Don't pack the bias
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ interleaves::pack_parameters_generic(
+ packing_args, args, buffer, biases, weights, ld_weight_col, ld_weight_row);
}
- void execute_tiles(
- std::function<void(const TInput *const *, TOutput *const *)> tile_fn,
- std::function<void(TInput *, unsigned int)> initialise_input_buffer,
- const unsigned int batches,
- const unsigned int input_height,
- const unsigned int input_width,
- const unsigned int input_channels,
- const PaddingValues &padding,
- const void *const _input,
- const size_t ld_input_col,
- const size_t ld_input_row,
- const size_t ld_input_batch,
- const unsigned int output_height,
- const unsigned int output_width,
- void *const _output,
- const size_t ld_output_col,
- const size_t ld_output_row,
- const size_t ld_output_batch,
- void *const _working_space,
- const unsigned int thread_id,
- const unsigned int n_threads
- ) const
+ const typename KernelStrategyType::KernelType get_kernel() const { return m_strategy->get_kernel(); }
+};
+
+// Use a templated function to marshal arguments when executing the kernel.
+template <typename OutputStage> struct DepthwiseDepthfirstGenericKernelCall;
+
+template <>
+struct DepthwiseDepthfirstGenericKernelCall<Nothing>
+{
+ template <typename StratType, typename WorkspaceType, typename TAccum>
+ static void execute(
+ const StratType *strat, const WorkspaceType *ws, const Nothing &,
+ const TAccum *bias, const void *params,
+ const unsigned int n_kernel_points, const unsigned int n_output_channels
+ )
{
- static_assert(OutputRows * OutputCols <= Strategy::n_output_points,
- "Too many output points for kernel.");
-
- // Determine what portion of the work to do.
- const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
- const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
- const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input);
- TOutput *const outptr = static_cast<TOutput *>(_output);
-
- // Allocate portions of the working space
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space) + this->get_working_size(thread_id, input_channels);
- const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space);
- TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space + this->sizeof_input_ptr_array());
- TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + this->sizeof_input_ptr_array() + this->sizeof_output_buffer(input_channels * this->m_args.channel_multiplier));
-
- // Create an array for the output pointers
- TOutput * _outptr_array[Strategy::n_output_points];
- TOutput **const outptr_array = _outptr_array;
-
- // Initialise the input buffer
- initialise_input_buffer(input_buffer, input_channels);
-
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
- {
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- const auto outptr_batch = outptr + batch * ld_output_batch;
-
- for (int start_out_i = start_out_height;
- start_out_i < end_out_height;
- start_out_i += static_cast<int>(OutputRows))
- {
- const int end_out_i = std::min(start_out_i + OutputRows,
- output_height);
-
- for (int start_out_j = 0;
- start_out_j < static_cast<int>(output_width);
- start_out_j += static_cast<int>(OutputCols))
- {
- const int end_out_j = std::min(start_out_j + OutputCols,
- output_width);
-
- // Fill the pointer arrays with pointers to the input/output buffers.
- for (auto index = 0u;
- index < (Strategy::n_output_points * this->m_args.kernel_rows * this->m_args.kernel_cols);
- index++)
- {
- inptr_array[index] = input_buffer;
- }
- for (auto index = 0u; index < Strategy::n_output_points; index++)
- {
- outptr_array[index] = output_buffer;
- }
-
- // Construct the pointer arrays together. Note that the input pointer
- // array is striped. Since the array has already been filled with
- // pointers to the padding array we merely fill in the valid points
- // as we get to them.
- unsigned int output_index = 0;
- auto outptr_row = outptr_batch + start_out_i * ld_output_row + start_out_j * ld_output_col;
- for (auto out_i = start_out_i; out_i < end_out_i; out_i++)
- {
- auto outptr_col = outptr_row;
-
- // Compute the padding for this row of tiles.
- const int start_in_i = out_i * this->m_args.stride_rows - padding.top;
- const int end_in_i = start_in_i + this->m_args.kernel_rows;
- const auto pad_top = static_cast<unsigned int>(std::max<int>(0, 0 - start_in_i));
- const auto pad_bottom = static_cast<unsigned int>(std::max<int>(0, end_in_i - input_height));
- const unsigned int valid_rows = this->m_args.kernel_rows - pad_top - pad_bottom;
-
- for (auto out_j = start_out_j; out_j < end_out_j; out_j++, output_index++)
- {
- // Compute the output pointer.
- outptr_array[output_index] = outptr_col;
- outptr_col += ld_output_col;
-
- // Compute the padding for this tile.
- const int start_in_j = out_j * this->m_args.stride_cols - padding.left;
- const int end_in_j = start_in_j + this->m_args.kernel_cols;
- const auto pad_left = static_cast<unsigned int>(std::max<int>(0, 0 - start_in_j));
- const auto pad_right = static_cast<unsigned int>(std::max<int>(0, end_in_j - input_width));
- const unsigned int valid_cols = this->m_args.kernel_cols - pad_left - pad_right;
-
- // Hence compute the input pointers.
- auto input_index = output_index + Strategy::n_output_points * (pad_top * this->m_args.kernel_cols + pad_left);
- auto inptr_row = inptr_batch + (start_in_i + pad_top) * ld_input_row + (start_in_j + pad_left) * ld_input_col;
- for (auto in_i = 0u; in_i < valid_rows; in_i++)
- {
- auto inptr_col = inptr_row;
- auto input_index_col = input_index;
-
- for (auto in_j = 0u; in_j < valid_cols; in_j++)
- {
- inptr_array[input_index_col] = inptr_col;
- inptr_col += ld_input_col;
- input_index_col += Strategy::n_output_points;
- }
-
- inptr_row += ld_input_row;
- input_index += Strategy::n_output_points * this->m_args.kernel_cols;
- }
- }
-
- outptr_row += ld_output_row;
- }
-
- tile_fn(inptr_array, outptr_array);
- }
- }
- }
+ strat->get_kernel()(
+ ws->inptr_array,
+ ws->outptr_array,
+ params, bias,
+ n_kernel_points, n_output_channels,
+ ws->activation_min, ws->activation_max
+ );
}
+};
- public:
- DepthwiseDepthfirstGenericBase(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
+template <>
+struct DepthwiseDepthfirstGenericKernelCall<arm_gemm::Requantize32>
+{
+ template <typename StratType, typename WorkspaceType>
+ static void execute(
+ const StratType *strat, const WorkspaceType *ws, const arm_gemm::Requantize32 &qp,
+ const int32_t *, const void *params,
+ const unsigned int n_kernel_points, const unsigned int n_output_channels
+ )
{
+ strat->get_kernel()(
+ ws->inptr_array,
+ ws->outptr_array,
+ params, qp,
+ n_kernel_points, n_output_channels
+ );
}
+};
- DepthwiseDepthfirstGenericBase(DepthwiseDepthfirstGenericBase &) = delete;
- DepthwiseDepthfirstGenericBase &operator=(DepthwiseDepthfirstGenericBase &) = delete;
- size_t get_storage_size(void) const override
+/* Workspace Element for an array of input pointers as consumed by the
+ * "Generic" depthwise kernels.
+ */
+template <typename T>
+class GenericInputArrayElement
+{
+ public:
+ struct Workspace
{
- const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(Strategy::vl_type);
- const auto rounded_channels = arm_gemm::roundup(this->m_args.input_channels, vl);
- return (this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight);
- }
+ const T **inptr_array;
+ };
- void pack_parameters(void *_buffer, const void *, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
+ template <class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
{
- // Cast the pointers
- TWeight *buffer = static_cast<TWeight *>(_buffer);
- const TWeight *const weights = static_cast<const TWeight *>(_weights);
-
- const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(Strategy::vl_type);
- ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels : ld_weight_col;
- ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
-
- for (unsigned int n = 0; n < this->m_args.input_channels; n += vl)
- {
- const unsigned int todo = std::min(vl, this->m_args.input_channels - n);
-
- // Copy each of the weights in turn
- auto weights_row = weights + n;
- for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
- {
- auto weights_col = weights_row;
-
- for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
- {
- for (unsigned int m = 0; m < todo; m++)
- {
- buffer[m] = weights_col[m];
- }
- buffer += vl;
-
- weights_col += ld_weight_col;
- }
-
- weights_row += ld_weight_row;
- }
- }
+ const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
+ return sizeof(T **) * args.strategy->get_input_rows() * args.strategy->get_input_cols() * kernel_points;
}
- size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+ template <class WorkspaceType, class OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
{
- const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
- return n_threads * (sizeof_input_ptr_array() +
- sizeof_output_buffer(n_output_channels) +
- sizeof_input_buffer(n_channels));
+ ws->inptr_array = reinterpret_cast<const T**>(buffer);
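+    // Return a pointer just past this element's slice of the shared buffer,
+    // so that the workspace manager can chain the initialisers of successive
+    // elements.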
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
}
};
-template <class Strategy, unsigned OutputRows, unsigned int OutputCols>
-class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
{
- using Parent = DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>;
- using TInput = typename Parent::TInput;
- using TWeight = typename Parent::TWeight;
- using TAccum = typename Parent::TAccum;
- using TOutput = typename Parent::TOutput;
-
+ using StratType = GenericDepthfirstStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ using Parent = DepthwiseDepthfirstCommon<TInput, TWeight, TOutput, TAccum, OutputStage>;
+ using WorkspaceManager = Workspace<
+ OutputArrayElement<TOutput>,
+ GenericInputArrayElement<TInput>,
+ InputBufferElement<TInput>,
+ ActivationsElement<TAccum, OutputStage>
+ >;
+ using WorkingSpace = typename WorkspaceManager::WorkspaceType;
const TAccum *m_bias = nullptr;
public:
- DepthwiseDepthfirstGeneric(const DepthwiseArgs &args) : Parent(args)
+ DepthwiseDepthfirstGeneric(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os={})
+ : Parent(strat, args, os)
{
}
DepthwiseDepthfirstGeneric(DepthwiseDepthfirstGeneric &) = delete;
DepthwiseDepthfirstGeneric &operator=(DepthwiseDepthfirstGeneric &) = delete;
- void pack_parameters(void *buffer, const void *bias, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ void pack_parameters(
+ void *buffer, const void *biases,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) override
+ {
+ Parent::pack_parameters(buffer, biases, weights, ld_weight_col, ld_weight_row);
+    m_bias = reinterpret_cast<const TAccum *>(biases); // Stash a copy of the bias pointer
+ depthwise_depthfirst::stash_bias(this->get_output_stage(), m_bias);
+ }
+
+ size_t get_working_size_per_thread(const unsigned int n_input_channels) const override
+ {
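+    // Size the workspace for the number of channels this thread will
+    // actually process by overriding the channel count in a copy of the
+    // stored arguments.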
+ DepthwiseArgs args(this->m_args);
+ args.input_channels = n_input_channels;
+ return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage()));
+ }
+
+ void initialise_working_space(void *buffer, unsigned int n_input_channels) const override
{
- m_bias = static_cast<const TAccum *>(bias);
- Parent::pack_parameters(buffer, bias, weights, ld_weight_col, ld_weight_row);
+ DepthwiseArgs args(this->m_args);
+ args.input_channels = n_input_channels;
+ return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, this->get_output_stage()));
}
- using DepthwiseDepthfirstGenericBase<Strategy, OutputRows, OutputCols>::execute;
- void execute(
- const unsigned int batches,
- const unsigned int input_height,
- const unsigned int input_width,
- const unsigned int input_channels,
- const PaddingValues &padding,
- const void *const _input,
- const size_t ld_input_col,
- const size_t ld_input_row,
- const size_t ld_input_batch,
- const void *const parameters,
- const unsigned int output_height,
- const unsigned int output_width,
- void *const _output,
- const size_t ld_output_col,
- const size_t ld_output_row,
- const size_t ld_output_batch,
- void *const _working_space,
- const unsigned int thread_id,
- const unsigned int n_threads
+ protected:
+ void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int channel_start, unsigned int channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
) const override
{
- Strategy strat(this->m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif
-
- // Compute activation values
- TAccum activation_min, activation_max;
- std::tie(activation_min, activation_max) = get_default_activation_values<TAccum>();
-
- switch (this->m_args.activation.type)
- {
- case arm_gemm::Activation::Type::BoundedReLU:
- activation_max = static_cast<TAccum>(this->m_args.activation.param1);
- // Fall through
- case arm_gemm::Activation::Type::ReLU:
- activation_min = static_cast<TAccum>(0);
- break;
- default:
- break;
- }
-
- // Create a function to initialise the input buffer
- const auto initialise_input_buffer = [] (TInput *const buffer, const unsigned int n) {
- std::memset(buffer, 0, n * sizeof(TInput));
- };
-
- // Create a function to execute a tile of work
- const auto tile_fn = [&] (const TInput *const *const inptrs, TOutput *const * const outptrs) {
-#ifdef CYCLE_PROFILING
- auto p = prof.ScopedProfiler(
- PROFILE_KERNEL,
- (unsigned long) (OutputRows * OutputCols * this->m_args.kernel_rows* this->m_args.kernel_cols)
- );
-#endif
- strat.kernel(inptrs, outptrs, parameters, m_bias,
- this->m_args.kernel_rows * this->m_args.kernel_cols,
- this->m_args.input_channels, activation_min, activation_max);
- };
-
- // Call into a parent utility function to do the actual work.
- Parent::execute_tiles(
- tile_fn, initialise_input_buffer,
- batches, input_height, input_width, input_channels, padding,
- _input, ld_input_col, ld_input_row, ld_input_batch,
- output_height, output_width,
- _output, ld_output_col, ld_output_row, ld_output_batch,
- _working_space, thread_id, n_threads
+ // Get the working space
+ WorkingSpace *ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+
+ const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ fill_pointer_array_generic_kernel<const TInput>(
+ ws->inptr_array,
+ this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ this->m_args.kernel_rows, this->m_args.kernel_cols,
+ this->m_args.stride_rows, this->m_args.stride_cols,
+ input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
+ input.ld_row, input.ld_col,
+ ws->input_buffer,
+ input_pad_top, this->m_args.input_rows - input_i,
+ input_pad_left, this->m_args.input_cols - input_j
+ );
+
+ // Compute the output pointer array
+ fill_pointer_array<TOutput>(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
+ 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Execute the kernel
+ DepthwiseDepthfirstGenericKernelCall<OutputStage>::execute(
+ reinterpret_cast<const StratType *>(this->m_strat.get()), ws,
+ this->get_output_stage(), m_bias, parameters,
+ this->m_args.kernel_rows * this->m_args.kernel_cols,
+ channel_end - channel_start
);
}
};
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
index 2862361b82..e58467b0f4 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,8 @@
#pragma once
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "interleaves/generic_quantized_dot_product.hpp"
#ifdef CYCLE_PROFILING
#include "profiler.hpp"
@@ -35,492 +36,559 @@
namespace arm_conv {
namespace depthwise {
-namespace common
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+class DepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>
{
- template <typename strategy, typename F>
- void depthwise_multiplier_execute(
- const F execute_tile,
- typename strategy::input_type pad_value,
- const DepthwiseArgs &args,
- const unsigned int batches,
- const unsigned int input_height,
- const unsigned int input_width,
- const unsigned int input_channels,
- const PaddingValues &padding,
- const void *const _input,
- const size_t ld_input_col,
- const size_t ld_input_row,
- const size_t ld_input_batch,
- const void *const parameters,
- const size_t param_stride,
- const unsigned int output_height,
- const unsigned int output_width,
- void *const _output,
- const size_t ld_output_col,
- const size_t ld_output_row,
- const size_t ld_output_batch,
- void *const _working_space,
- const unsigned int thread_id,
- const unsigned int n_threads
- )
- {
- using TInput = typename strategy::input_type;
- using TOutput = typename strategy::return_type;
-
- // Determine what portion of the work to do.
- const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
- const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
- const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input);
- TOutput *const outptr = static_cast<TOutput *>(_output);
-
- // To simplify the kernel, we process padded or non-NCHW-ordered input into
- // a form which can be consumed by the kernel. This data is stored here and
- // passed into the kernel as an array of N pointers (one per row of the
- // input).
- TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*(16 / sizeof(TInput))];
- const TInput *inptrs[strategy::input_rows];
-
- // Create an array for the output pointers
- TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
- TOutput **const outptr_array = _outptr_array;
-
- // Allocate portions of the working space
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
- TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
-
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
- {
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- const auto outptr_batch = outptr + batch * ld_output_batch;
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>;
- for (int start_out_i = start_out_height;
- start_out_i < end_out_height;
- start_out_i += static_cast<int>(strategy::output_rows))
+ protected:
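+  // The packing-point lambda below maps a linear position to kernel
+  // coordinates in row-major order; e.g. for a (hypothetical) 3x3 kernel,
+  // pos = 4 yields x = 1, y = 1, the centre point.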
+ virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
+ {
+ return interleaves::PackingArguments(
+ args.kernel_rows, args.kernel_cols, sizeof(TWeight),
+      true, sizeof(TAccum), // Pack the bias
+ this->get_vl_type(),
+ sizeof(TAccum), 1,
+ [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
{
- const int end_out_i = start_out_i + strategy::output_rows;
- const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
- const int end_in_i = start_in_i + strategy::input_rows;
-
- // Compute top/bottom padding
- const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
- const unsigned int valid_output_rows = std::min(
- end_out_i - start_out_i,
- static_cast<int>(output_height) - start_out_i
- );
-
- for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
+ if (pos < args.kernel_rows * args.kernel_cols)
{
- const int start_in_j = start_out_j * strategy::stride_cols - args.padding.left;
- const int pad_left = -std::min(0, start_in_j);
-
- const int end_out_j = start_out_j + strategy::output_cols;
- const int end_in_j = start_in_j + strategy::input_cols;
-
- const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
- const unsigned int valid_output_cols = std::min(
- end_out_j - start_out_j,
- static_cast<int>(output_width) - start_out_j
- );
-
- // Construct the output pointer array.
- TOutput **outptr_pos = outptr_array;
- for (auto i = 0u; i < valid_output_rows; i++)
- {
- unsigned int j = 0u;
- TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
- for (; j < valid_output_cols; j++)
- {
- *(outptr_pos++) = colptr;
- colptr += ld_output_col;
- }
- for (; j < strategy::output_cols; j++)
- {
- *(outptr_pos++) = output_buffer;
- }
- }
- for (auto i = valid_output_rows; i < strategy::output_rows; i++)
- {
- for (auto j = 0u; j < strategy::output_cols; j++)
- {
- *(outptr_pos++) = output_buffer;
- }
- }
-
- start_out_j += strategy::output_cols;
-
- const uint8_t *params = static_cast<const uint8_t *>(parameters);
-
- // Loop over the input channels
- for (unsigned int in_c = 0; in_c < input_channels; in_c++)
- {
- // Construct the input array - first fill with padding values and
- // then fill in correct values.
- for (unsigned int i = 0; i < strategy::input_rows; i++)
- {
- for (unsigned int j = 0;
- j < (16 / sizeof(TInput)) * strategy::input_col_quads; j++)
- {
- rearranged_input[i][j] = pad_value;
- }
- inptrs[i] = rearranged_input[i];
- }
-
- auto inptr_row = inptr_batch + in_c +
- (start_in_i + pad_top) * ld_input_row +
- (start_in_j + pad_left) * ld_input_col;
- if (ld_input_col == 1 && !pad_left &&
- start_in_j + (16 / sizeof(TInput)) * strategy::input_col_quads < input_width)
- {
- // The input tensor is already in NCHW format, and we're reading
- // an unpadded section of it - allow the kernel to read it
- // directly.
- for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
- {
- inptrs[i] = inptr_row;
- inptr_row += ld_input_row;
- }
- }
- else
- {
- // Either the input tensor isn't in NCHW format, or we're reading
- // a padded section. Copy the relevant portion of the input here
- // and allow the kernel to read this.
- for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
- {
- auto inptr_col = inptr_row;
- for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
- {
- rearranged_input[i][j] = *inptr_col;
- inptr_col += ld_input_col;
- }
- inptr_row += ld_input_row;
- }
- }
-
- execute_tile(inptrs, outptr_array, params);
-
- // Progress the output pointers
- TOutput **outptr_pos = outptr_array;
- for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
- {
- outptr_pos[i] += args.channel_multiplier;
- }
-
- // Progress the pointer into the parameters
- params += param_stride;
- }
+ y = pos % args.kernel_cols;
+ x = pos / args.kernel_cols;
+ return true;
}
+ return false;
}
- }
+ );
}
-}
-template <class strategy>
-class DepthwiseDepthfirstWithMultiplier :
- public DepthwiseCommon<typename strategy::input_type,
- typename strategy::weight_type,
- typename strategy::return_type>
-{
- using TInput = typename strategy::input_type;
- using TWeight = typename strategy::weight_type;
- using TOutput = typename strategy::return_type;
- using TAccum = typename strategy::bias_type;
+ public:
+ using Parent::Parent;
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
+ }
- size_t sizeof_output_buffer(unsigned int n_channels) const
+ void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const Nothing &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
{
- const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
- const auto rounded_channels = arm_gemm::roundup(n_channels, vl);
- return sizeof(TOutput) * rounded_channels;
+ interleaves::pack_parameters_generic(
+ this->get_packing_args(args), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
}
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const void *, // Ravelled bias, weights, and quantization parameters
+ unsigned int, // # output channels
+ TAccum, TAccum // Min and max activation clamps
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t> : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
+
public:
- DepthwiseDepthfirstWithMultiplier(const DepthwiseArgs &args) : DepthwiseCommon<TInput, TWeight, TOutput>(args)
+ using Parent::Parent;
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
{
+ return interleaves::quantized::get_storage_size(args, this->get_vl_type(), this->get_accumulator_depth_vl());
}
- DepthwiseDepthfirstWithMultiplier(DepthwiseDepthfirstWithMultiplier &) = delete;
- DepthwiseDepthfirstWithMultiplier &operator=(DepthwiseDepthfirstWithMultiplier &) = delete;
+ void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
+ {
+ interleaves::quantized::pack_parameters<TWeight>(
+ buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row,
+ args, qp, this->get_vl_type(), this->get_accumulator_depth_vl()
+ );
+ }
- size_t get_storage_size(void) const override
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const void *, // Ravelled bias, weights, and quantization parameters
+ unsigned int, // # output channels
+ const arm_gemm::Requantize32 &
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+class GenericDepthfirstMultiplierKernelStrategy
+{
+ const arm_gemm::VLType m_vl_type;
+ const unsigned int m_output_rows, m_output_cols;
+
+ public:
+ GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
+ : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
{
- // TODO What if we insert extra padding? Biases are a different size to the inputs, ...
- const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
- const auto rounded_channels = this->m_args.input_channels * arm_gemm::roundup(this->m_args.channel_multiplier, vl);
- return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight);
}
- void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
+ virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;
+
+ arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
+ unsigned int get_output_rows(void) const { return m_output_rows; }
+ unsigned int get_output_cols(void) const { return m_output_cols; }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const TWeight *, // Ravelled weight parameters
+    const TAccum *, // Bias
+ unsigned int, unsigned int, // Number of kernel points, number of output channels
+ TAccum, TAccum // Activation minimum and maximum
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+class GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, int32_t>
+{
+ const arm_gemm::VLType m_vl_type;
+ const unsigned int m_output_rows, m_output_cols;
+
+ public:
+ GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
+ : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
{
- // TODO What if the kernel needs a different packing function?
+ }
- // Cast the pointers
- float *buffer = static_cast<float *>(_buffer);
- const float *biases = static_cast<const float *>(_biases);
- const float *const weights = static_cast<const float *>(_weights);
+ virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;
+
+ arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
+ unsigned int get_output_rows(void) const { return m_output_rows; }
+ unsigned int get_output_cols(void) const { return m_output_cols; }
+
+ using KernelType = std::function<void(
+ const TInput *const *, // Input pointers
+ TOutput *const *, // Output pointers
+ const TWeight *, // Ravelled weight parameters
+    const int32_t *, // Bias
+ unsigned int, unsigned int, // Number of kernel points, number of output channels
+ const int32_t *, const int32_t *, const int32_t *, // Per-channel left-shifts, multipliers, right-shifts (need to account for start channel)
+ const arm_gemm::Requantize32 &
+ )>;
+ virtual KernelType get_kernel(void) const = 0;
+};
- const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
- ld_weight_col = (ld_weight_col == 0) ? this->m_args.channel_multiplier * this->m_args.input_channels : ld_weight_col;
- ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class GenericDepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using KernelStrategyType = GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, TAccum>;
+ std::unique_ptr<KernelStrategyType> m_kern;
- for (unsigned int in_c = 0; in_c < this->m_args.input_channels; in_c++)
- {
- for (unsigned int n = 0; n < this->m_args.channel_multiplier; n += vl)
+ protected:
+ virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
+ {
+ return interleaves::PackingArguments(
+ args.kernel_rows, args.kernel_cols, sizeof(TWeight),
+      false, sizeof(TAccum), // Don't pack the bias
+ this->get_vl_type(),
+ sizeof(TAccum), 1,
+ [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
{
- const unsigned int out_c = in_c * this->m_args.channel_multiplier + n;
- const unsigned int todo = std::min(vl, this->m_args.channel_multiplier - n);
-
- // Copy across the correct amount of bias (or 0)
- for (unsigned int i = 0; i < todo; i++)
+ if (pos < args.kernel_rows * args.kernel_cols)
{
- buffer[i] = (biases == nullptr) ? 0 : biases[out_c + i];
+ y = pos % args.kernel_cols;
+ x = pos / args.kernel_cols;
+ return true;
}
- buffer += vl;
+ return false;
+ }
+ );
+ }
- // Copy each of the weights in turn
- auto weights_row = weights + out_c;
- for (unsigned int i = 0; i < this->m_args.kernel_rows; i++)
- {
- auto weights_col = weights_row;
+ public:
+ GenericDepthfirstMultiplierStrategy(KernelStrategyType *kern, const DepthwiseArgs &args)
+ : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
+ kern->get_output_rows(), kern->get_output_cols(),
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols
+ ),
+ m_kern(kern)
+ {
+  }
- for (unsigned int j = 0; j < this->m_args.kernel_cols; j++)
- {
- for (unsigned int m = 0; m < todo; m++)
- {
- buffer[m] = weights_col[m];
- }
- buffer += vl;
+ arm_gemm::VLType get_vl_type(void) const override { return m_kern->get_vl_type(); }
+ const typename KernelStrategyType::KernelType get_kernel(void) const { return m_kern->get_kernel(); }
- weights_col += ld_weight_col;
- }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
+ }
- weights_row += ld_weight_row;
- }
- }
- }
+ void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
+ {
+ interleaves::pack_parameters_generic(
+ this->get_packing_args(args), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
}
+};
+
+// Specialise elements of the wrapper based on the type of kernel.
+namespace depthfirst_multiplier {
- size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
+/* Working space element which contains a pointer for each row of input, a row
+ * of padding, and a space which can be used to construct an NCHW-ordered patch
+ * of input.
+ */
+template <typename T, bool IsGeneric=false, typename OutputStage=Nothing>
+class InputPatchElement
+{
+ public:
+ struct Workspace
{
- const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
- return n_threads * sizeof_output_buffer(n_output_channels);
+ constexpr static bool InputPatchIsGeneric = IsGeneric;
+ const T **input_rows;
+ T *input_padding;
+ T *input_patch;
+ };
+
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
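+    // The element is laid out as [row pointers][padding row][input patch],
+    // in the same order in which initialise() carves up the buffer.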
+ return sizeof_input_rows(args) + sizeof_input_padding(args) + sizeof_input_patch(args);
}
-
- using DepthwiseCommon<typename strategy::input_type, typename strategy::weight_type, typename strategy::return_type>::execute;
- void execute(
- const unsigned int batches,
- const unsigned int input_height,
- const unsigned int input_width,
- const unsigned int input_channels,
- const PaddingValues &padding,
- const void *const _input,
- const size_t ld_input_col,
- const size_t ld_input_row,
- const size_t ld_input_batch,
- const void *const parameters,
- const unsigned int output_height,
- const unsigned int output_width,
- void *const _output,
- const size_t ld_output_col,
- const size_t ld_output_row,
- const size_t ld_output_batch,
- void *const _working_space,
- const unsigned int thread_id,
- const unsigned int n_threads
- ) const override
+
+ template <class WorkspaceType>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
{
- strategy strat(this->m_args.cpu_info);
-#ifdef CYCLE_PROFILING
- arm_gemm::profiler prof;
-#endif
+ auto buffer_bytes = reinterpret_cast<char *>(buffer);
+
+ ws->input_rows = reinterpret_cast<const T **>(buffer_bytes);
+ buffer_bytes += sizeof_input_rows(args);
+
+ ws->input_padding = reinterpret_cast<T*>(buffer_bytes);
+ buffer_bytes += sizeof_input_padding(args);
+
+ ws->input_patch = reinterpret_cast<T*>(buffer_bytes);
+ buffer_bytes += sizeof_input_patch(args);
- // Compute activation values
- TAccum activation_min = std::numeric_limits<TAccum>::has_infinity ? -std::numeric_limits<TAccum>::infinity() : std::numeric_limits<TAccum>::min();
- TAccum activation_max = std::numeric_limits<TAccum>::has_infinity ? std::numeric_limits<TAccum>::infinity() : std::numeric_limits<TAccum>::max();
+ // Initialise the padding
+ memset(ws->input_padding,
+ get_input_buffer_fill_value(args.output_stage),
+ sizeof_input_padding(args));
- switch (this->m_args.activation.type)
+ return buffer_bytes;
+ }
+
+ protected:
+ static size_t sizeof_input_rows(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ if (IsGeneric)
+ {
+ return sizeof(T *) * args.strategy->get_output_rows() * args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
+ }
+ else
{
- case arm_gemm::Activation::Type::BoundedReLU:
- activation_max = static_cast<TAccum>(this->m_args.activation.param1);
- // Fall through
- case arm_gemm::Activation::Type::ReLU:
- activation_min = static_cast<TAccum>(0);
- break;
- default:
- break;
+ return sizeof(T *) * args.strategy->get_input_rows();
}
+ }
+
+ static size_t sizeof_input_padding(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+    // Round up the number of columns to a whole number of quads (16 bytes each)
+ auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
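+    // e.g. (hypothetical) for a 16-bit T, 16 / sizeof(T) = 8, so 13 input
+    // columns round up to 16.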
+ return sizeof(T) * input_cols;
+ }
- // Determine what portion of the work to do.
- const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads);
- const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
- const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
-
- // Need a stride over blocks of parameters
- const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
- const unsigned int param_stride =
- arm_gemm::roundup(this->m_args.channel_multiplier, vl) *
- (sizeof(TAccum) + sizeof(TWeight) * strategy::kernel_rows * strategy::kernel_cols);
-
- // Cast input and output pointers into the right types
- const TInput *const inptr = static_cast<const TInput *>(_input);
- TOutput *const outptr = static_cast<TOutput *>(_output);
-
- // To simplify the kernel, we process padded or non-NCHW-ordered input into
- // a form which can be consumed by the kernel. This data is stored here and
- // passed into the kernel as an array of N pointers (one per row of the
- // input).
- TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*4];
- const TInput *inptrs[strategy::input_rows];
-
- // Create an array for the output pointers
- TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
- TOutput **const outptr_array = _outptr_array;
-
- // Allocate portions of the working space
- uint8_t *const working_space = static_cast<uint8_t *>(_working_space) + get_working_size(thread_id, input_channels);
- TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
-
- // For each output tile, construct the requisite set of pointers and call
- // into the kernel.
- for (unsigned int batch = 0; batch < batches; batch++)
+ static size_t sizeof_input_patch(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ if (IsGeneric)
+ {
+      // Round up the number of columns to a whole number of quads (16 bytes each)
+ auto output_cols = arm_gemm::roundup<size_t>(args.strategy->get_output_cols(), 16 / sizeof(T));
+ const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
+ return sizeof(T) * kernel_points * args.strategy->get_output_rows() * output_cols;
+ }
+ else
{
- // Get batch pointers
- const auto inptr_batch = inptr + batch * ld_input_batch;
- const auto outptr_batch = outptr + batch * ld_output_batch;
+      // Round up the number of columns to a whole number of quads (16 bytes each)
+ auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
+ return sizeof(T) * args.strategy->get_input_rows() * input_cols;
+ }
+ }
+};
+
+template <bool IsGeneric, typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+struct StrategyType
+{
+ using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const OutputStage &, const unsigned int,
+ const void *parameters, const void *
+ )
+ {
+ strat->get_kernel()(
+ ws->input_rows,
+ ws->outptr_array,
+ parameters, args.channel_multiplier,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
- for (int start_out_i = start_out_height;
- start_out_i < end_out_height;
- start_out_i += static_cast<int>(strategy::output_rows))
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+struct StrategyType<true, TInput, TWeight, TOutput, TAccum, OutputStage>
+{
+ using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const OutputStage &, const unsigned int start_output_channel,
+ const void *parameters, const void *bias
+ )
+ {
+ strat->get_kernel()(
+ ws->input_rows, ws->outptr_array,
+ reinterpret_cast<const TWeight *>(parameters),
+ bias == nullptr ? nullptr : reinterpret_cast<const TAccum *>(bias) + start_output_channel,
+ strat->get_kernel_rows() * strat->get_kernel_cols(),
+ args.channel_multiplier,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+struct StrategyType<false, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const arm_gemm::Requantize32 &qp, const unsigned int,
+ const void *parameters, const void *
+ )
+ {
+ strat->get_kernel()(
+ ws->input_rows,
+ ws->outptr_array,
+ parameters, args.channel_multiplier,
+ qp
+ );
+ }
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+struct StrategyType<true, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;
+
+ template <typename WorkspaceType>
+ static void execute(
+ const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
+ const arm_gemm::Requantize32 &qp, const unsigned int start_output_channel,
+ const void *parameters, const void *
+ )
+ {
+ auto get_ptr = [start_output_channel] (const int32_t *ptr) -> const int32_t *
+ {
+ return ptr == nullptr ? nullptr : ptr + start_output_channel;
+ };
+
+ strat->get_kernel()(
+ ws->input_rows, ws->outptr_array,
+ reinterpret_cast<const TWeight *>(parameters),
+ get_ptr(qp.bias),
+ strat->get_kernel_rows() * strat->get_kernel_cols(),
+ args.channel_multiplier,
+ get_ptr(qp.per_channel_left_shifts),
+ get_ptr(qp.per_channel_muls),
+ get_ptr(qp.per_channel_right_shifts),
+ qp
+ );
+ }
+};
+
+template <bool IsGeneric> struct PrepareInputSample;
+
+template <> struct PrepareInputSample<false>
+{
+ template <typename WorkspaceType, typename StrategyType, typename T>
+ static void execute(
+ const DepthwiseArgs &, WorkspaceType *ws, const StrategyType *strat,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ const unsigned int input_pad_top, const unsigned int valid_rows,
+ const unsigned int input_pad_left, const unsigned int valid_cols
+ )
+ {
+ fill_nchw_patch_array(
+ ws->input_rows, ws->input_patch, strat->get_input_rows(), strat->get_input_cols(),
+ base_ptr, ld_row, ld_col,
+ ws->input_padding,
+ input_pad_top, valid_rows,
+ input_pad_left, valid_cols
+ );
+ }
+};
+
+template <> struct PrepareInputSample<true>
+{
+ template <typename WorkspaceType, typename StrategyType, typename T>
+ static void execute(
+ const DepthwiseArgs &args, WorkspaceType *ws, const StrategyType *strat,
+ T *base_ptr, size_t ld_row, size_t ld_col,
+ const unsigned int input_pad_top, const unsigned int valid_rows,
+ const unsigned int input_pad_left, const unsigned int valid_cols
+ )
+ {
+ fill_patch_array_generic_kernel(
+ ws->input_rows, ws->input_patch,
+ strat->get_output_rows(), strat->get_output_cols(),
+ args.kernel_rows, args.kernel_cols,
+ args.stride_rows, args.stride_cols,
+ base_ptr, ld_row, ld_col,
+ ws->input_padding,
+ input_pad_top, valid_rows,
+ input_pad_left, valid_cols
+ );
+ }
+};
+
+} // namespace depthfirst_multiplier
+
+template <typename TInput,
+ typename TWeight=TInput,
+ typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TInput>::Type,
+ bool is_generic=false,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwiseDepthfirstMultiplier : public DepthfirstDriver<TInput, TWeight, TOutput>
+{
+ protected:
+ using StratType = typename depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
+ using WorkspaceManager = Workspace<
+ OutputArrayElement<TOutput>,
+ depthfirst_multiplier::InputPatchElement<TInput, is_generic, OutputStage>,
+ ActivationsElement<TOutput, OutputStage>
+ >;
+ using WorkingSpace = typename WorkspaceManager::WorkspaceType;
+
+ OutputStage m_os; // Copy of the output parameters
+ const void *m_bias = nullptr; // Copy of the bias (should we need it)
+
+ public:
+ DepthwiseDepthfirstMultiplier(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
+ {
+ }
+
+ DepthwiseDepthfirstMultiplier(DepthwiseDepthfirstMultiplier &) = delete;
+ DepthwiseDepthfirstMultiplier &operator=(DepthwiseDepthfirstMultiplier &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ return reinterpret_cast<const StratType *>(this->m_strat.get())
+ ->get_storage_size(this->m_args);
+ }
+
+ void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
+ {
+ reinterpret_cast<const StratType *>(this->m_strat.get())
+ ->pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
+ m_bias = biases;
+ depthwise_depthfirst::stash_bias(m_os, biases);
+ }
+
+ size_t get_working_size_per_thread(const unsigned int n_input_channels) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ args.input_channels = n_input_channels;
+ return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
+ }
+
+ void initialise_working_space(void *buffer, unsigned int n_input_channels) const override
+ {
+ DepthwiseArgs args(this->m_args);
+ args.input_channels = n_input_channels;
+ return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
+ }
+
+ void compute_tile_padded(
+ unsigned int output_i, unsigned int output_j,
+ unsigned int output_channel_start, unsigned int output_channel_end,
+ const TensorSpec<const TInput *> &input,
+ const TensorSpec<TOutput *> &output,
+ const void *parameters,
+ void *working_space_raw
+ ) const override
+ {
+ // Get the working space
+ auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
+
+ const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
+ const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
+ const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
+
+ const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
+ const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
+ const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
+
+ // Compute the output pointer array. We'll update this array after every
+ // invocation of the kernel.
+ fill_pointer_array(
+ ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
+ output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
+ output.ld_row, output.ld_col,
+ ws->output_buffer,
+ 0, this->m_args.output_rows - output_i, // Top padding, # valid rows
+ 0, this->m_args.output_cols - output_j // Left padding, # valid columns
+ );
+
+ // Compute the parameter stride
+ DepthwiseArgs single_iter(this->m_args);
+ single_iter.input_channels = 1;
+ const size_t parameter_stride = reinterpret_cast<const StratType *>(this->m_strat.get())
+ ->get_storage_size(single_iter);
+
+ for (; output_channel_start < output_channel_end;
+ output_channel_start += this->m_args.channel_multiplier)
+ {
+ // Compute the input channel corresponding to this group of output channels
+ const auto input_channel = output_channel_start / this->m_args.channel_multiplier;
+
+ // Construct the input patch
+ depthfirst_multiplier::PrepareInputSample<is_generic>::execute(
+ this->m_args, ws, this->m_strat.get(),
+ input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col,
+ input_pad_top, this->m_args.input_rows - input_i,
+ input_pad_left, this->m_args.input_cols - input_j
+ );
+
+ // Execute the kernel
+ depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::execute(
+ this->m_args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
+ parameters, m_bias
+ );
+
+ // Update the output pointers
+ for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++)
{
- const int end_out_i = start_out_i + strategy::output_rows;
- const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
- const int end_in_i = start_in_i + strategy::input_rows;
-
- // Compute top/bottom padding
- const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
- const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
- const unsigned int valid_output_rows = std::min(
- end_out_i - start_out_i,
- static_cast<int>(output_height) - start_out_i
- );
-
- for (int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
- {
- const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left;
- const int pad_left = -std::min(0, start_in_j);
-
- const int end_out_j = start_out_j + strategy::output_cols;
- const int end_in_j = start_in_j + strategy::input_cols;
-
- const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
- const unsigned int valid_output_cols = std::min(
- end_out_j - start_out_j,
- static_cast<int>(output_width) - start_out_j
- );
-
- // Construct the output pointer array.
- TOutput **outptr_pos = outptr_array;
- for (auto i = 0u; i < valid_output_rows; i++)
- {
- unsigned int j = 0u;
- TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
- for (; j < valid_output_cols; j++)
- {
- *(outptr_pos++) = colptr;
- colptr += ld_output_col;
- }
- for (; j < strategy::output_cols; j++)
- {
- *(outptr_pos++) = output_buffer;
- }
- }
- for (auto i = valid_output_rows; i < strategy::output_rows; i++)
- {
- for (auto j = 0u; j < strategy::output_cols; j++)
- {
- *(outptr_pos++) = output_buffer;
- }
- }
-
- start_out_j += strategy::output_cols;
-
- const uint8_t *params = static_cast<const uint8_t *>(parameters);
-
- // Loop over the input channels
- for (unsigned int in_c = 0; in_c < input_channels; in_c++)
- {
- // Construct the input array - first fill with padding values and
- // then fill in correct values.
- for (unsigned int i = 0; i < strategy::input_rows; i++)
- {
- for (unsigned int j = 0; j < 4 * strategy::input_col_quads; j++)
- {
- rearranged_input[i][j] = static_cast<TInput>(0);
- }
- inptrs[i] = rearranged_input[i];
- }
-
- auto inptr_row = inptr_batch + in_c +
- (start_in_i + pad_top) * ld_input_row +
- (start_in_j + pad_left) * ld_input_col;
- if (ld_input_col == 1 && !pad_left &&
- start_in_j + 4 * strategy::input_col_quads < input_width)
- {
- // The input tensor is already in NCHW format, and we're reading
- // an unpadded section of it - allow the kernel to read it
- // directly.
- for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
- {
- inptrs[i] = inptr_row;
- inptr_row += ld_input_row;
- }
- }
- else
- {
- // Either the input tensor isn't in NCHW format, or we're reading
- // a padded section. Copy the relevant portion of the input here
- // and allow the kernel to read this.
- for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
- {
- auto inptr_col = inptr_row;
- for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
- {
- rearranged_input[i][j] = *inptr_col;
- inptr_col += ld_input_col;
- }
- inptr_row += ld_input_row;
- }
- }
-
- {
-#ifdef CYCLE_PROFILING
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.channel_multiplier * strategy::kernel_rows * strategy::kernel_cols));
-#endif
- strat.kernel(
- inptrs, outptr_array, params,
- this->m_args.channel_multiplier,
- activation_min, activation_max
- );
- }
-
- // Progress the output pointers
- TOutput **outptr_pos = outptr_array;
- for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
- {
- outptr_pos[i] += this->m_args.channel_multiplier;
- }
-
- // Progress the pointer into the parameters
- params += param_stride;
- }
- }
+ ws->outptr_array[n] += this->m_args.channel_multiplier;
}
+
+ // Progress the parameters
+ parameters = reinterpret_cast<const char *>(parameters) + parameter_stride;
}
}
};
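The coordinate arithmetic at the top of compute_tile_padded converts an output tile origin into an input origin plus an implicit padding count. A minimal standalone sketch of that mapping, with illustrative names rather than library API:

#include <cstdio>

struct Mapping { unsigned int pad; unsigned int index; };

// For an output position, the matching input position is out * stride - pad_before;
// a negative result becomes explicit padding and the read starts at zero.
static Mapping map_output_to_input(unsigned int out, unsigned int stride, unsigned int pad_before)
{
  const int i = static_cast<int>(out * stride) - static_cast<int>(pad_before);
  return { i < 0 ? static_cast<unsigned int>(-i) : 0u,
           i < 0 ? 0u : static_cast<unsigned int>(i) };
}

int main()
{
  // Stride 1, padding 1: the first output row maps to input row -1,
  // i.e. one row of top padding with reads starting at input row 0.
  const auto edge = map_output_to_input(0, 1, 1);
  std::printf("pad_top=%u input_i=%u\n", edge.pad, edge.index);

  // Away from the border no padding is required.
  const auto interior = map_output_to_input(5, 1, 1);
  std::printf("pad_top=%u input_i=%u\n", interior.pad, interior.index); // pad_top=0 input_i=4
  return 0;
}

Folding the negative part into an explicit pad count is what lets fill_pointer_array and PrepareInputSample treat border and interior tiles uniformly.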
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
index 934272a0ac..6b100d9d61 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,6 @@
#include "depthwise_depthfirst.hpp"
#include "depthwise_depthfirst_generic.hpp"
#include "depthwise_depthfirst_multiplier.hpp"
-#include "depthwise_depthfirst_generic_multiplier.hpp"
#include "depthwise_implementation_constraints.hpp"
@@ -43,6 +42,7 @@
#include "kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
#include "kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#include "kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp"
#include "kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp"
#include "kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
@@ -50,6 +50,7 @@
#include "kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
#include "kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp"
#include "kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#endif // defined(__aarch64__)
namespace arm_conv {
@@ -70,15 +71,11 @@ namespace
}
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+ unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) __attribute__ ((unused));
unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
{
return std::numeric_limits<unsigned int>::max();
}
-
- unsigned int not_preferred_if_no_multiplier(const DepthwiseArgs &args, const Nothing &)
- {
- return args.channel_multiplier > 1 ? 0 : std::numeric_limits<unsigned int>::max();
- }
#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
}
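The not_preferred helper returns the largest representable cycle estimate, so an entry guarded by it loses to any candidate reporting a finite cost. A simplified sketch of lowest-estimate selection; the real table logic (including how null estimators are scored) may differ:

#include <cstdio>
#include <limits>

// Hypothetical stand-ins: each method reports an estimated cycle cost.
static unsigned int specialised_kernel() { return 100u; }
static unsigned int generic_fallback()
{
  return std::numeric_limits<unsigned int>::max(); // "not preferred"
}

int main()
{
  unsigned int (*methods[])() = { generic_fallback, specialised_kernel };
  int best = -1;
  unsigned int best_cost = 0u;
  for (int i = 0; i < 2; i++)
  {
    const unsigned int cost = methods[i]();
    if (best < 0 || cost < best_cost)
    {
      best = i;
      best_cost = cost;
    }
  }
  // The fallback wins only when nothing cheaper is available.
  std::printf("selected method %d, cost %u\n", best, best_cost); // method 1, cost 100
  return 0;
}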
@@ -94,7 +91,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
cycle_estimate<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
auto strat = new sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
},
},
{
@@ -106,7 +103,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
cycle_estimate<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
auto strat = new sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
},
},
{
@@ -118,7 +115,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
cycle_estimate<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
auto strat = new sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
},
},
{
@@ -130,7 +127,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
cycle_estimate<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
auto strat = new sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
},
},
{
@@ -142,11 +139,11 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
cycle_estimate<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
auto strat = new sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
},
},
#endif // defined(ARM_COMPUTE_ENABLE_SVE)
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
{
DepthwiseMethod::DEPTHFIRST,
"a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
@@ -156,7 +153,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
cycle_estimate<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
auto strat = new a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
},
},
{
@@ -168,7 +165,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
cycle_estimate<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
auto strat = new a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
},
},
{
@@ -180,7 +177,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
cycle_estimate<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
auto strat = new a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
},
},
{
@@ -192,7 +189,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
cycle_estimate<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
auto strat = new a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
},
},
{
@@ -204,7 +201,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
cycle_estimate<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
auto strat = new a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args);
+ return new DepthwiseDepthfirst<__fp16>(strat, args);
},
},
{
@@ -213,19 +210,23 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] =
constraint(has_no_channel_multiplier, cpu_has_fp16),
not_preferred,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
- return new DepthwiseDepthfirstGeneric<a64_fp16_nhwc_generic_output9_mla_depthfirst, 3, 3>(args);
+ auto kern = new a64_fp16_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<__fp16>(kern, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<__fp16>(strat, args);
},
},
{
DepthwiseMethod::DEPTHFIRST,
"a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
- constraint(cpu_has_fp16),
- not_preferred_if_no_multiplier,
+ constraint(cpu_has_fp16, has_channel_multiplier),
+ nullptr,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
- return new DepthwiseDepthfirstGenericWithMultiplier<a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args);
+ auto kern = new a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<__fp16>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<__fp16, __fp16, __fp16, __fp16, true>(strat, args);
},
},
-#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
#endif // defined(__aarch64__)
{ DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
};
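Entries such as constraint(cpu_has_fp16, has_channel_multiplier) fold several predicates into one test that must hold for the method to be eligible. A hedged sketch of one way a variadic AND-combinator like this can be written; the real constraint() implementation is not shown in this patch and may differ:

#include <cstdio>
#include <functional>

struct Args { bool fp16; unsigned int channel_multiplier; };
using Predicate = std::function<bool(const Args &)>;

// Combine any number of predicates; all must hold.
template <typename... Ps>
static Predicate constraint(Ps... ps)
{
  return [=] (const Args &a) -> bool
  {
    const bool results[] = { ps(a)... };
    for (const bool r : results)
    {
      if (!r) return false;
    }
    return true;
  };
}

static bool cpu_has_fp16(const Args &a) { return a.fp16; }
static bool has_channel_multiplier(const Args &a) { return a.channel_multiplier > 1; }

int main()
{
  const auto eligible = constraint(cpu_has_fp16, has_channel_multiplier);
  std::printf("%d %d\n",
              static_cast<int>(eligible({true, 3})),   // 1: both predicates hold
              static_cast<int>(eligible({true, 1})));  // 0: no channel multiplier
  return 0;
}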
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
index 5107ddacbc..643cf1d460 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#include "depthwise_depthfirst.hpp"
#include "depthwise_depthfirst_generic.hpp"
#include "depthwise_depthfirst_multiplier.hpp"
-#include "depthwise_depthfirst_generic_multiplier.hpp"
+#include "depthwise_planar.hpp"
#include "depthwise_implementation_constraints.hpp"
@@ -78,9 +78,10 @@ namespace
return std::numeric_limits<unsigned int>::max();
}
- unsigned int not_preferred_if_no_multiplier(const DepthwiseArgs &args, const Nothing &)
+ bool fast_mode_enabled(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+ bool fast_mode_enabled(const DepthwiseArgs &args, const void *)
{
- return args.channel_multiplier > 1 ? 0 : std::numeric_limits<unsigned int>::max();
+ return args.fast_mode;
}
#endif // defined(__aarch64__)
}
@@ -97,7 +98,7 @@ static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
cycle_estimate<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
auto strat = new sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ return new DepthwiseDepthfirst<float>(strat, args);
},
},
{
@@ -109,7 +110,7 @@ static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
cycle_estimate<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
auto strat = new sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ return new DepthwiseDepthfirst<float>(strat, args);
},
},
{
@@ -121,7 +122,7 @@ static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
cycle_estimate<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
auto strat = new sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ return new DepthwiseDepthfirst<float>(strat, args);
},
},
{
@@ -133,7 +134,7 @@ static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
cycle_estimate<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
auto strat = new sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ return new DepthwiseDepthfirst<float>(strat, args);
},
},
{
@@ -145,7 +146,7 @@ static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
cycle_estimate<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
auto strat = new sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ return new DepthwiseDepthfirst<float>(strat, args);
},
},
{
@@ -154,36 +155,42 @@ static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
constraint(has_no_channel_multiplier, cpu_has_sve),
not_preferred,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
- return new DepthwiseDepthfirstGeneric<sve_fp32_nhwc_generic_output9_mla_depthfirst, 3, 3>(args);
+ auto kern = new sve_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<float>(strat, args);
},
},
{
DepthwiseMethod::DEPTHFIRST,
"sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
constraint(is_supported<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
- cpu_has_sve),
- not_preferred_if_no_multiplier,
+ cpu_has_sve, has_channel_multiplier),
+ nullptr,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
- return new DepthwiseDepthfirstWithMultiplier<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>(args);
+ auto strat = new sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
},
},
{
DepthwiseMethod::DEPTHFIRST,
"sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
constraint(is_supported<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
- cpu_has_sve),
- not_preferred_if_no_multiplier,
+ cpu_has_sve, has_channel_multiplier),
+ nullptr,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
- return new DepthwiseDepthfirstWithMultiplier<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>(args);
+ auto strat = new sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
},
},
{
DepthwiseMethod::DEPTHFIRST,
"sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
- constraint(cpu_has_sve),
- not_preferred_if_no_multiplier,
+ constraint(cpu_has_sve, has_channel_multiplier),
+ nullptr,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
- return new DepthwiseDepthfirstGenericWithMultiplier<sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args);
+ auto kern = new sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
},
},
#endif // defined(ARM_COMPUTE_ENABLE_SVE)
@@ -195,7 +202,7 @@ static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
cycle_estimate<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
auto strat = new a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ return new DepthwiseDepthfirst<float>(strat, args);
},
},
{
@@ -206,7 +213,7 @@ static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
cycle_estimate<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
auto strat = new a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ return new DepthwiseDepthfirst<float>(strat, args);
},
},
{
@@ -217,7 +224,7 @@ static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
cycle_estimate<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
auto strat = new a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ return new DepthwiseDepthfirst<float>(strat, args);
},
},
{
@@ -228,7 +235,7 @@ static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
cycle_estimate<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
auto strat = new a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ return new DepthwiseDepthfirst<float>(strat, args);
},
},
{
@@ -239,7 +246,7 @@ static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
cycle_estimate<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
auto strat = new a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
- return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
+ return new DepthwiseDepthfirst<float>(strat, args);
},
},
{
@@ -248,34 +255,42 @@ static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
constraint(has_no_channel_multiplier),
not_preferred,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
- return new DepthwiseDepthfirstGeneric<a64_fp32_nhwc_generic_output9_mla_depthfirst, 3, 3>(args);
+ auto kern = new a64_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<float>(strat, args);
},
},
{
DepthwiseMethod::DEPTHFIRST,
"a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
- constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>),
- not_preferred_if_no_multiplier,
+ constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
+ has_channel_multiplier),
+ nullptr,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
- return new DepthwiseDepthfirstWithMultiplier<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>(args);
+ auto strat = new a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
},
},
{
DepthwiseMethod::DEPTHFIRST,
"a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
- constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>),
- not_preferred_if_no_multiplier,
+ constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
+ has_channel_multiplier),
+ nullptr,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
- return new DepthwiseDepthfirstWithMultiplier<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>(args);
+ auto strat = new a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<float>(strat, args);
},
},
{
DepthwiseMethod::DEPTHFIRST,
"a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
+ constraint(has_channel_multiplier),
nullptr,
- not_preferred_if_no_multiplier,
[] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
- return new DepthwiseDepthfirstGenericWithMultiplier<a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args);
+ auto kern = new a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
},
},
#endif // defined(__aarch64__)
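Each factory lambda above now builds its method in layers: a raw kernel object, a strategy wrapping it, and a driver that stores the strategy (the drivers shown earlier keep it in an m_strat smart pointer). A miniature of that layering; that the strategy owns its kernel is an assumption here:

#include <cstdio>
#include <memory>

// Hypothetical kernel -> strategy -> driver chain.
struct Kernel
{
  void run() const { std::printf("kernel invoked\n"); }
};

struct Strategy
{
  explicit Strategy(Kernel *kern) : m_kernel(kern) {}
  void execute() const { m_kernel->run(); }
  std::unique_ptr<Kernel> m_kernel; // assumed: the strategy owns its kernel
};

struct Driver
{
  explicit Driver(Strategy *strat) : m_strat(strat) {}
  void execute() const { m_strat->execute(); }
  std::unique_ptr<Strategy> m_strat; // mirrors the drivers' m_strat member
};

int main()
{
  // Same shape as the factory lambdas: new kernel, new strategy, driver.
  auto kern = new Kernel();
  auto strat = new Strategy(kern);
  Driver driver(strat);
  driver.execute();
  return 0;
}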
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
index ea41529d81..0665fa3a29 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#pragma once
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "depthwise.hpp"
#include <cstddef>
#include <functional>
@@ -136,14 +136,7 @@ UniqueDepthwiseCommon<TInput, TWeight, TOutput> depthwise(const DepthwiseArgs &a
{
const DepthwiseImplementation<TInput, TWeight, TOutput, OutputStage> *impl = nullptr;
const bool success = find_implementation<TInput, TWeight, TOutput, OutputStage>(args, os, impl);
-
- if(success)
- {
- auto i = impl->get_instance(args, os);
- i->set_name(impl->name);
- return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(i);
- }
- return nullptr;
+ return UniqueDepthwiseCommon<TInput, TWeight, TOutput>(success ? impl->get_instance(args, os) : nullptr);
}
} // namespace depthwise
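The rewritten depthwise() collapses the success branch into a single construction, wrapping either the new instance or nullptr so callers can test the returned smart pointer directly. The same idiom in isolation:

#include <cstdio>
#include <memory>

struct Impl { const char *name; };

// Null on failure; the unique_ptr itself carries the success flag.
static std::unique_ptr<Impl> make_impl(bool success)
{
  return std::unique_ptr<Impl>(success ? new Impl{"depthfirst"} : nullptr);
}

int main()
{
  const auto good = make_impl(true);
  const auto bad = make_impl(false);
  std::printf("%s / %s\n", good ? good->name : "none", bad ? bad->name : "none");
  return 0;
}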
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
index 419872744d..78b6aec388 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -109,6 +109,12 @@ bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *)
return args.channel_multiplier == 1;
}
+bool has_channel_multiplier(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool has_channel_multiplier(const DepthwiseArgs &args, const void *)
+{
+ return args.channel_multiplier > 1;
+}
+
bool qp_has_no_left_shift(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
bool qp_has_no_left_shift(const DepthwiseArgs &, const void *_qp)
{
@@ -118,6 +124,21 @@ bool qp_has_no_left_shift(const DepthwiseArgs &, const void *_qp)
(qp->per_layer_left_shift == 0);
}
+bool qp_zero_a_offset(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+bool qp_zero_a_offset(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return qp->a_offset == 0;
+}
+
+template <typename T> bool qp_skip_clamp(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+template <typename T> bool qp_skip_clamp(const DepthwiseArgs &, const void *_qp)
+{
+ const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+ return (qp->minval == std::numeric_limits<T>::min() &&
+ qp->maxval == std::numeric_limits<T>::max());
+}
+
} // namespace
} // namespace depthwise
} // namespace arm_conv
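qp_skip_clamp<T> holds when the requantization interval [minval, maxval] already spans every representable value of T, so the final clamp is a no-op and a kernel without it can be selected. The same test on a free-standing struct; the int32_t fields are an assumption standing in for the real quantisation parameters:

#include <cstdint>
#include <cstdio>
#include <limits>

struct Quant { int32_t minval, maxval; };

template <typename T>
static bool clamp_is_noop(const Quant &qp)
{
  return qp.minval == std::numeric_limits<T>::min() &&
         qp.maxval == std::numeric_limits<T>::max();
}

int main()
{
  const Quant full{0, 255};  // covers every uint8_t value
  const Quant tight{0, 127}; // values above 127 still need clamping
  std::printf("%d %d\n",
              static_cast<int>(clamp_is_noop<uint8_t>(full)),    // 1
              static_cast<int>(clamp_is_noop<uint8_t>(tight)));  // 0
  return 0;
}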
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
new file mode 100644
index 0000000000..ff5098d551
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthfirst_driver.hpp"
+#include "interleaves/generic.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename OutputStage>
+class IPlanarStrategy
+{
+ public:
+ virtual ~IPlanarStrategy() = default;
+ virtual unsigned int get_output_rows(void) const = 0;
+ virtual arm_gemm::VLType get_vl_type(void) const = 0;
+
+ virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;
+ virtual void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const = 0;
+};
+
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
+ typename OutputStage>
+struct PlanarKernelType;
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
+{
+ using Type = std::function<void(
+ const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *, const TAccum *,
+ TOutput **, const size_t *, const size_t *, unsigned int output_cols,
+ unsigned int start_channels, unsigned int valid_channels,
+ TAccum act_min, TAccum act_max
+ )>;
+
+ template <typename WorkspaceType>
+ static inline void execute(
+ const Type fn,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const TAccum *bias,
+ TOutput **outptrs, const size_t *outlds, const size_t *outvllds, unsigned int output_cols,
+ unsigned int start_channel, unsigned int valid_channels,
+ const Nothing &, const WorkspaceType *ws
+ )
+ {
+ fn(
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows,
+ pad_left, valid_input_cols,
+ weights, bias,
+ outptrs, outlds, outvllds, output_cols,
+ start_channel, valid_channels,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
+
+template <typename TInput, typename TWeight, typename TOutput>
+struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ using Type = std::function<void(
+ const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *,
+ TOutput **, const size_t *, const size_t *, unsigned int output_cols,
+ unsigned int start_channel, unsigned int valid_channels,
+ const arm_gemm::Requantize32 &
+ )>;
+
+ template <typename WorkspaceType>
+ static inline void execute(
+ const Type fn,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const int32_t *,
+ TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
+ unsigned int first_channel, unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp, const WorkspaceType *
+ )
+ {
+ fn(
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows,
+ pad_left, valid_input_cols,
+ weights,
+ outptrs, outlds, outldvls, output_cols,
+ first_channel, valid_channels,
+ qp
+ );
+ }
+};
+
+
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TOutput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class PlanarStrategy : public IPlanarStrategy<OutputStage>
+{
+ unsigned int m_kernel_rows, m_kernel_cols;
+ unsigned int m_stride_rows, m_stride_cols;
+ unsigned int m_output_rows;
+ arm_gemm::VLType m_vl_type;
+
+ protected:
+ virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
+ {
+ // Get the kernel point to pack at the given index; return false to
+ // indicate that this index (and all greater indices) is out of range.
+ if (m_kernel_rows * m_kernel_cols <= index)
+ return false;
+
+ y = index % m_kernel_cols;
+ x = index / m_kernel_cols;
+ return true;
+ }
+
+ virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const
+ {
+ return interleaves::PackingArguments(
+ m_kernel_rows, m_kernel_cols, sizeof(TWeight),
+ false, sizeof(TAccum), // Don't pack the bias
+ m_vl_type, sizeof(TAccum), 1, // Accumulator depth of 1 TODO
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ }
+
+ public:
+ PlanarStrategy(
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ unsigned int output_rows,
+ arm_gemm::VLType vl_type
+ ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
+ m_stride_rows(stride_rows), m_stride_cols(stride_cols),
+ m_output_rows(output_rows), m_vl_type(vl_type)
+ {
+ }
+
+ unsigned int get_output_rows(void) const override { return m_output_rows; }
+ arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleaves::pack_parameters_generic(
+ this->get_kernel_packing_arguments(), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
+ }
+
+ using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
+ virtual KernelType get_kernel(void) const = 0;
+};
+
+
+namespace {
+
+template <typename T>
+struct OutputRowPtrsElement
+{
+ struct Workspace
+ {
+ T **output_row_ptrs;
+ size_t *output_ld_cols;
+ size_t *output_ld_vls; // Stride between vectors of channels
+ T *output_padding_buffer;
+ };
+
+ template <typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
+ {
+ // We need one pointer and two strides for each row of output, and an
+ // additional blob of memory into which padded stores can go.
+ return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
+ get_vector_length<char>(args.strategy->get_vl_type());
+ }
+
+ template <typename WorkspaceType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer,
+ const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
+ {
+ const auto n_rows = args.strategy->get_output_rows();
+ ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
+ ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
+ ws->output_ld_vls = ws->output_ld_cols + n_rows;
+ ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
+ return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
+ }
+};
+
+} // namespace {anonymous}
+
+
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TOutput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
+{
+ using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
+ using StrategyType = IPlanarStrategy<OutputStage>;
+ using WorkspaceManager = Workspace<
+ OutputRowPtrsElement<TOutput>,
+ ActivationsElement<TAccum, OutputStage>
+ >;
+ using WorkspaceType = typename WorkspaceManager::WorkspaceType;
+
+ std::unique_ptr<StrategyType> m_strat;
+ const TAccum *m_bias;
+ OutputStage m_os;
+
+ public:
+ DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
+ {
+ }
+
+ size_t get_storage_size(void) const override
+ {
+ return m_strat->get_storage_size(this->m_args);
+ }
+
+ void pack_parameters(
+ void *buffer, const void *biases,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) override
+ {
+ m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
+ this->m_bias = reinterpret_cast<const TAccum *>(biases);
+ depthwise_depthfirst::stash_bias(this->m_os, biases);
+ }
+
+ size_t get_working_size(unsigned int n_threads, unsigned int) const override
+ {
+ return this->get_working_size_per_thread() * n_threads;
+ }
+
+ protected:
+ /* Compute the amount of working space required for a single thread. */
+ virtual size_t get_working_size_per_thread(void) const
+ {
+ return WorkspaceManager::get_sizeof_workspace(
+ WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
+ }
+
+ /* Initialise the working space for a thread. */
+ virtual void initialise_working_space(void *buffer) const
+ {
+ WorkspaceManager::initialise(
+ buffer,
+ WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
+ );
+ }
+
+ /* Execute the kernel for a given chunk of work. */
+ virtual void execute_kernel(
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const TAccum *bias,
+ TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
+ unsigned int valid_output_rows, unsigned int valid_output_cols,
+ unsigned int first_channel, unsigned int valid_channels,
+ WorkspaceType *ws
+ ) const
+ {
+ // Initialise the output pointers
+ for (auto i = 0u; i < m_strat->get_output_rows(); i++)
+ {
+ // Point at the output tensor for all valid rows; otherwise point at the
+ // padding buffer.
+ ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
+ ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
+ ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
+ outptr += ld_out_row;
+ }
+
+ // Execute the kernel
+ PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
+ reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows, pad_left, valid_input_cols,
+ weights, bias,
+ ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
+ valid_output_cols, first_channel, valid_channels,
+ this->m_os, ws
+ );
+ }
+
+ void execute_internal(
+ unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int n_input_channels,
+ const PaddingValues &padding,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads
+ ) const override
+ {
+ // Get and initialise the working space for this thread.
+ void *thread_working_space =
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+ this->initialise_working_space(thread_working_space);
+ auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);
+
+ const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier;
+ const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());
+
+ // Get typed pointers
+ auto input_batch = reinterpret_cast<const TInput *>(input);
+ auto output_batch = reinterpret_cast<TOutput *>(output);
+ auto weights = reinterpret_cast<const TWeight *>(parameters);
+
+ // Iterate over batches
+ for (; batches; batches--)
+ {
+ // NOTE: Other loop orderings are possible and it would be worth
+ // investigating them.
+
+ // Within a batch, stripe threads across rows.
+ for (auto start_output_i = thread_id * m_strat->get_output_rows();
+ start_output_i < output_height;
+ start_output_i += n_threads * m_strat->get_output_rows())
+ {
+ // Determine what padding (if any) is required on the top/bottom of
+ // this row of the convolution.
+ const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top;
+ const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
+ const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
+ const unsigned int valid_input_rows = input_i > input_height ? 0 : input_height - input_i;
+ const unsigned int valid_output_rows = output_height - start_output_i;
+
+ auto inptr_row = input_batch + input_i*ld_input_row;
+ auto outptr_row = output_batch + start_output_i * ld_output_row;
+
+ // Execute the kernel
+ this->execute_kernel(
+ inptr_row, ld_input_row, ld_input_col, vl,
+ input_pad_top, valid_input_rows, padding.left, input_width,
+ weights, this->m_bias,
+ outptr_row, ld_output_row, ld_output_col, vl,
+ valid_output_rows, output_width,
+ 0 /* first channel */, n_output_channels,
+ ws
+ );
+ }
+
+ // Update the input and output pointers to account for batch
+ input_batch += ld_input_batch;
+ output_batch += ld_output_batch;
+ }
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
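DepthwisePlanar::execute_internal stripes rows of output tiles across threads round-robin: thread t takes tiles t, t + n_threads, t + 2*n_threads, and so on. A sketch of the resulting split:

#include <cstdio>

int main()
{
  const unsigned int output_height = 10, rows_per_tile = 2, n_threads = 3;
  for (unsigned int thread_id = 0; thread_id < n_threads; thread_id++)
  {
    std::printf("thread %u:", thread_id);
    // Same loop bounds as execute_internal, with get_output_rows() == rows_per_tile.
    for (unsigned int i = thread_id * rows_per_tile;
         i < output_height;
         i += n_threads * rows_per_tile)
    {
      std::printf(" rows %u-%u", i, i + rows_per_tile - 1);
    }
    std::printf("\n");
  }
  return 0;
}

Striping by single tiles rather than contiguous blocks keeps neighbouring threads on nearby input rows, which plausibly helps their working sets overlap in cache.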
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
index 46a31185d7..4ff249a5d5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,15 +25,14 @@
#include "arm_gemm_local.hpp"
#include "depthwise_implementation.hpp"
-#include "depthwise_depthfirst_quantized.hpp"
-#include "depthwise_depthfirst_generic_quantized.hpp"
-#include "depthwise_depthfirst_multiplier_quantized.hpp"
-#include "depthwise_depthfirst_generic_multiplier_quantized.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
#include "depthwise_implementation_constraints.hpp"
#if defined(__aarch64__)
-#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
#include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
@@ -41,7 +40,7 @@
#include "kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
#include "kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
#include "kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
-#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
#include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
@@ -73,7 +72,7 @@ bool qp_weights_are_symmetric(const DepthwiseArgs &, const void *_qp)
static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depthwise_s8q_methods[] = {
#if defined(__aarch64__)
-#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
DepthwiseMethod::DEPTHFIRST,
"sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
@@ -84,7 +83,8 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstQuantized<sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+ auto strat = new sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
},
},
{
@@ -96,7 +96,8 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+ auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
},
},
{
@@ -108,7 +109,8 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
},
},
{
@@ -120,7 +122,8 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
},
},
{
@@ -132,7 +135,8 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstQuantized<sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
},
},
{
@@ -140,10 +144,12 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
"sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
qp_has_no_left_shift,
+ has_channel_multiplier,
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstWithMultiplierQuantized<sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+ auto strat = new sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
},
},
{
@@ -151,13 +157,15 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
"sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
constraint<Requantize32>(is_supported<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
qp_has_no_left_shift,
+ has_channel_multiplier,
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstWithMultiplierQuantized<sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+ auto strat = new sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
},
},
-#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
DepthwiseMethod::DEPTHFIRST,
"a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst",
@@ -168,7 +176,8 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
cpu_has_dot_product),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstQuantized<a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+ auto strat = new a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
},
},
{
@@ -180,7 +189,8 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
cpu_has_dot_product),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+ auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
},
},
{
@@ -191,7 +201,8 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
qp_has_no_left_shift),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
},
},
{
@@ -202,7 +213,8 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
qp_has_no_left_shift),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
},
},
{
@@ -213,7 +225,8 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
qp_has_no_left_shift),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstQuantized<a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<int8_t>(strat, args, qp);
},
},
{
@@ -222,7 +235,9 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
constraint<Requantize32>(has_no_channel_multiplier),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstGenericQuantized<a64_s8q_nhwc_generic_output9_mla_depthfirst, 3, 3>(args, qp);
+ auto kernel = new a64_s8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<int8_t>(kernel, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<int8_t>(strat, args, qp);
},
},
{
@@ -230,10 +245,12 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
"a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
qp_has_no_left_shift,
+ has_channel_multiplier,
cpu_has_dot_product),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstWithMultiplierQuantized<a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+ auto strat = new a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
},
},
{
@@ -241,19 +258,23 @@ static const DepthwiseImplementation<int8_t, int8_t, int8_t, Requantize32> depth
"a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
constraint<Requantize32>(is_supported<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
qp_has_no_left_shift,
+ has_channel_multiplier,
cpu_has_dot_product),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstWithMultiplierQuantized<a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+ auto strat = new a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, false>(strat, args, qp);
},
},
{
DepthwiseMethod::DEPTHFIRST,
"a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
- nullptr,
+ constraint<Requantize32>(has_channel_multiplier),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<int8_t, int8_t, int8_t> * {
- return new DepthwiseDepthfirstGenericWithMultiplierQuantized<a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args, qp);
+ auto kern = new a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<int8_t>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<int8_t, int8_t, int8_t, int32_t, true>(strat, args, qp);
},
},
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp
new file mode 100644
index 0000000000..33f2177efe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "depthwise_strategies_common.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+unsigned int DepthfirstStrategyUntyped::get_input_rows() const
+{
+ return this->get_kernel_rows() + (this->get_output_rows() - 1) * this->get_stride_rows();
+}
+
+unsigned int DepthfirstStrategyUntyped::get_input_cols() const
+{
+ return this->get_kernel_cols() + (this->get_output_cols() - 1) * this->get_stride_cols();
+}
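+
+// Worked example: a 3x3 kernel producing a 2x2 output tile at stride 1 must
+// read a (3 + (2 - 1) * 1) = 4x4 patch of the input.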
+
+unsigned int DepthfirstStrategyUntyped::get_n_input_points() const { return this->get_input_rows() * this->get_input_cols(); }
+unsigned int DepthfirstStrategyUntyped::get_n_output_points() const { return this->get_output_rows() * this->get_output_cols(); }
+unsigned int DepthfirstStrategyUntyped::get_n_kernel_points() const { return this->get_kernel_rows() * this->get_kernel_cols(); }
+
+unsigned int DepthfirstStrategyUntyped::get_accumulator_depth_vl() const { return 1; }
+
+bool DepthfirstStrategyUntyped::get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
+{
+ // Get the kernel point to pack at the given index; return false to
+ // indicate that this and all greater indices are out of range.
+ if (index < (this->get_kernel_cols() * this->get_kernel_rows()))
+ {
+ y = index % this->get_kernel_cols();
+ x = index / this->get_kernel_cols();
+ return true;
+ }
+ return false;
+}
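+
+// For a 3x3 kernel this visits (x, y) = (0, 0), (0, 1), (0, 2), (1, 0), ...,
+// (2, 2) for indices 0 through 8, and reports index 9 and above as out of
+// range.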
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
new file mode 100644
index 0000000000..99b91fb833
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "interleaves/generic.hpp"
+#include "depthfirst_driver.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+class DepthfirstStrategyUntyped : public IDepthfirstStrategy
+{
+ public:
+ virtual arm_gemm::VLType get_vl_type() const = 0;
+
+ virtual unsigned int get_kernel_rows() const = 0;
+ virtual unsigned int get_kernel_cols() const = 0;
+
+ virtual unsigned int get_stride_rows() const = 0;
+ virtual unsigned int get_stride_cols() const = 0;
+
+ virtual unsigned int get_input_rows() const override;
+ virtual unsigned int get_input_cols() const override;
+
+ virtual unsigned int get_n_input_points() const;
+ virtual unsigned int get_n_output_points() const;
+ virtual unsigned int get_n_kernel_points() const;
+
+ // Get the number of VLs used in the accumulator; this defaults to 1.
+ virtual unsigned int get_accumulator_depth_vl() const;
+
+ // Get the order in which to pack the weights; this defaults to a row-major
+ // sweep over the weight tensor.
+ virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const;
+};
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
+class DepthfirstStrategy : public DepthfirstStrategyUntyped
+{
+ public:
+ virtual size_t get_storage_size(const DepthwiseArgs &args) const
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ true, sizeof(TAccum),
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ return interleaves::get_storage_size_generic(packing_args, args);
+ }
+
+ virtual void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const
+ {
+ interleaves::PackingArguments packing_args(
+ this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight),
+ true, sizeof(TAccum),
+ this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(),
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ interleaves::pack_parameters_generic(
+ packing_args, args, buffer, biases, weights, ld_weight_col, ld_weight_row);
+ }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
index 67713c5bcc..b1489d0b59 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,23 +25,27 @@
#include "arm_gemm_local.hpp"
#include "depthwise_implementation.hpp"
-#include "depthwise_depthfirst_quantized.hpp"
-#include "depthwise_depthfirst_generic_quantized.hpp"
-#include "depthwise_depthfirst_multiplier_quantized.hpp"
-#include "depthwise_depthfirst_generic_multiplier_quantized.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
#include "depthwise_implementation_constraints.hpp"
#if defined(__aarch64__)
-#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
#include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
#include "kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
#include "kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
#include "kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
#include "kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
-#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp"
+
+#include "kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
+#include "kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
+
#include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
#include "kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
#include "kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
@@ -49,6 +53,7 @@
#include "kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp"
#include "kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp"
#include "kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp"
+
#endif // defined(__aarch64__)
#include <cstdint>
@@ -60,7 +65,7 @@ namespace depthwise {
static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
#if defined(__aarch64__)
-#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
DepthwiseMethod::DEPTHFIRST,
"sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
@@ -70,7 +75,8 @@ static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> de
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+ auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
},
},
{
@@ -82,7 +88,8 @@ static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> de
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
},
},
{
@@ -94,7 +101,8 @@ static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> de
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
},
},
{
@@ -106,7 +114,8 @@ static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> de
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
},
},
{
@@ -114,10 +123,12 @@ static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> de
"sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
qp_has_no_left_shift,
+ has_channel_multiplier,
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstWithMultiplierQuantized<sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+ auto strat = new sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
},
},
{
@@ -125,13 +136,15 @@ static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> de
"sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
qp_has_no_left_shift,
+ has_channel_multiplier,
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstWithMultiplierQuantized<sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+ auto strat = new sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
},
},
-#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
DepthwiseMethod::DEPTHFIRST,
"a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
@@ -141,18 +154,61 @@ static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> de
qp_has_no_left_shift),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>(args, qp);
+ auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_zero_a_offset,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
},
},
{
DepthwiseMethod::DEPTHFIRST,
+ "a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_zero_a_offset,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+ {
+ DepthwiseMethod::DEPTHFIRST,
+ "a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst",
+ constraint<Requantize32>(is_supported<a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst>,
+ has_no_channel_multiplier,
+ qp_zero_a_offset,
+ qp_has_no_left_shift),
+ nullptr,
+ [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
+ auto strat = new a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
+ },
+ },
+
+ {
+ DepthwiseMethod::DEPTHFIRST,
"a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
has_no_channel_multiplier,
qp_has_no_left_shift),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
},
},
{
@@ -163,7 +219,8 @@ static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> de
qp_has_no_left_shift),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
},
},
{
@@ -174,7 +231,8 @@ static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> de
qp_has_no_left_shift),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t>(strat, args, qp);
},
},
{
@@ -183,7 +241,9 @@ static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> de
constraint<Requantize32>(has_no_channel_multiplier),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstGenericQuantized<a64_u8q_nhwc_generic_output9_mla_depthfirst, 3, 3>(args, qp);
+ auto kernel = new a64_u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<uint8_t>(kernel, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<uint8_t>(strat, args, qp);
},
},
{
@@ -191,10 +251,12 @@ static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> de
"a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
cpu_has_dot_product,
+ has_channel_multiplier,
qp_has_no_left_shift),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstWithMultiplierQuantized<a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>(args, qp);
+ auto strat = new a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
},
},
{
@@ -202,21 +264,26 @@ static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> de
"a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
cpu_has_dot_product,
+ has_channel_multiplier,
qp_has_no_left_shift),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstWithMultiplierQuantized<a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>(args, qp);
+ auto strat = new a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat, args, qp);
},
},
{
DepthwiseMethod::DEPTHFIRST,
"a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
- nullptr,
+ constraint<Requantize32>(has_channel_multiplier),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
- return new DepthwiseDepthfirstGenericWithMultiplierQuantized<a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args, qp);
+ auto kern = new a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<uint8_t>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, true>(strat, args, qp);
},
},
+
#endif // defined(__aarch64__)
{ DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
};
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
index af4426b69f..9b989012b8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,19 +25,18 @@
#include "arm_gemm_local.hpp"
#include "depthwise_implementation.hpp"
-#include "depthwise_depthfirst_quantized.hpp"
-#include "depthwise_depthfirst_generic_quantized.hpp"
-#include "depthwise_depthfirst_multiplier_quantized.hpp"
-#include "depthwise_depthfirst_generic_multiplier_quantized.hpp"
+#include "depthwise_depthfirst.hpp"
+#include "depthwise_depthfirst_generic.hpp"
+#include "depthwise_depthfirst_multiplier.hpp"
#include "depthwise_implementation_constraints.hpp"
#if defined(__aarch64__)
-#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
#include "kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
#include "kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
-#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
#include "kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp"
#include "kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp"
#include "kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"
@@ -54,7 +53,7 @@ namespace depthwise {
static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
#if defined(__aarch64__)
-#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(ARM_COMPUTE_ENABLE_SVE)
{
DepthwiseMethod::DEPTHFIRST,
"sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
@@ -64,7 +63,8 @@ static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> dep
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
},
},
{
@@ -76,7 +76,8 @@ static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> dep
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
},
},
{
@@ -88,10 +89,11 @@ static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> dep
cpu_has_sve2),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
},
},
-#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(ARM_COMPUTE_ENABLE_SVE)
{
DepthwiseMethod::DEPTHFIRST,
"a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
@@ -100,7 +102,8 @@ static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> dep
qp_has_no_left_shift),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
},
},
{
@@ -111,7 +114,8 @@ static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> dep
qp_has_no_left_shift),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
},
},
{
@@ -122,7 +126,8 @@ static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> dep
qp_has_no_left_shift),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
- return new DepthwiseDepthfirstQuantized<a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>(args, qp);
+ auto strat = new a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
+ return new DepthwiseDepthfirst<uint8_t, int8_t>(strat, args, qp);
},
},
{
@@ -131,16 +136,20 @@ static const DepthwiseImplementation<uint8_t, int8_t, uint8_t, Requantize32> dep
constraint<Requantize32>(has_no_channel_multiplier),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
- return new DepthwiseDepthfirstGenericQuantized<a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst, 3, 3>(args, qp);
+ auto kernel = new a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstStrategy<uint8_t, int8_t>(kernel, 3, 3, args);
+ return new DepthwiseDepthfirstGeneric<uint8_t, int8_t>(strat, args, qp);
},
},
{
DepthwiseMethod::DEPTHFIRST,
"a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
- nullptr,
+ constraint<Requantize32>(has_channel_multiplier),
nullptr,
[] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon<uint8_t, int8_t, uint8_t> * {
- return new DepthwiseDepthfirstGenericWithMultiplierQuantized<a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst>(args, qp);
+ auto kern = new a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
+ auto strat = new GenericDepthfirstMultiplierStrategy<uint8_t, int8_t>(kern, args);
+ return new DepthwiseDepthfirstMultiplier<uint8_t, int8_t, uint8_t, int32_t, true>(strat, args, qp);
},
},
#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp
new file mode 100644
index 0000000000..056f08d037
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "generic.hpp"
+
+#include <algorithm>
+#include <cstring>
+#include <functional>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+
+PackingArguments::PackingArguments(
+ unsigned int kernel_rows, unsigned int kernel_cols, size_t weight_element_size,
+ bool include_bias, size_t bias_element_size,
+ arm_gemm::VLType vl_type, size_t accumulator_element_size, unsigned int accumulator_depth_vl,
+ std::function<bool(unsigned int, unsigned int &, unsigned int &)> get_weight_pos
+) : kernel_rows(kernel_rows), kernel_cols(kernel_cols), weight_element_size(weight_element_size),
+ include_bias(include_bias), bias_element_size(bias_element_size),
+ vl_type(vl_type), accumulator_element_size(accumulator_element_size), accumulator_depth_vl(accumulator_depth_vl),
+ get_weight_pos(get_weight_pos)
+{
+}
+
+size_t get_storage_size_generic(const PackingArguments &packing_args, const DepthwiseArgs &args)
+{
+ // If the channel multiplier is greater than one, then we treat this as a
+ // repeated packing of `channel_multiplier`-sized problems.
+ if (args.channel_multiplier > 1)
+ {
+ DepthwiseArgs args_per_input_channel(args);
+ args_per_input_channel.input_channels = args.channel_multiplier;
+ args_per_input_channel.channel_multiplier = 1;
+
+ return args.input_channels * get_storage_size_generic(packing_args, args_per_input_channel);
+ }
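+
+ // For instance, 8 input channels with channel_multiplier 3 are packed as 8
+ // independent 3-channel problems laid out back to back.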
+
+ const unsigned int vl =
+ packing_args.accumulator_depth_vl *
+ arm_gemm::utils::get_vector_length<uint8_t>(packing_args.vl_type) / packing_args.accumulator_element_size;
+ const unsigned int n_packs = arm_gemm::iceildiv(args.input_channels, vl);
+ const auto pack_size = (packing_args.include_bias ? packing_args.bias_element_size : 0) +
+ packing_args.kernel_points() * packing_args.weight_element_size;
+ return n_packs * pack_size * vl;
+}
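+
+// Worked example, assuming a 128-bit vector register: packing a 3x3 fp32
+// kernel with bias for 16 input channels gives vl = 16 / 4 = 4, n_packs = 4
+// and pack_size = 4 + 9 * 4 = 40 bytes, so 4 * 40 * 4 = 640 bytes in total.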
+
+void pack_parameters_generic(
+ const PackingArguments &packing_args,
+ const DepthwiseArgs &args,
+ void *buffer_raw,
+ const void *biases_raw,
+ const void *weights_raw,
+ size_t ld_weight_col,
+ size_t ld_weight_row
+)
+{
+ // Cast the pointers to byte pointers so that the arithmetic below is in bytes
+ auto *buffer = static_cast<uint8_t *>(buffer_raw);
+ auto *biases = static_cast<const uint8_t *>(biases_raw);
+ auto *weights = static_cast<const uint8_t *>(weights_raw);
+
+ // If the channel multiplier is greater than one, then we treat this as a
+ // repeated packing of `channel_multiplier`-sized problems.
+ if (args.channel_multiplier > 1)
+ {
+ // Get a modified copy of the depthwise arguments
+ DepthwiseArgs args_per_input_channel(args);
+ args_per_input_channel.input_channels = args.channel_multiplier;
+ args_per_input_channel.channel_multiplier = 1;
+
+ // Resolve the strides here
+ ld_weight_col = ld_weight_col ? ld_weight_col : args.input_channels * args.channel_multiplier;
+ ld_weight_row = ld_weight_row ? ld_weight_row : ld_weight_col * packing_args.kernel_cols;
+
+ auto per_input_channel_size = get_storage_size_generic(packing_args, args_per_input_channel);
+
+ for (unsigned int c = 0; c < args.input_channels; c++)
+ {
+ pack_parameters_generic(
+ packing_args, args_per_input_channel, buffer, biases, weights, ld_weight_col, ld_weight_row);
+
+ // Update the pointers
+ buffer += per_input_channel_size;
+ biases += (biases == nullptr) ? 0 : packing_args.bias_element_size * args.channel_multiplier;
+ weights += packing_args.weight_element_size * args.channel_multiplier;
+ }
+ return;
+ }
+
+ // Finalise the weight strides
+ ld_weight_col = (ld_weight_col == 0) ? args.input_channels : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? packing_args.kernel_cols * ld_weight_col : ld_weight_row;
+
+ const unsigned int vl =
+ packing_args.accumulator_depth_vl *
+ arm_gemm::utils::get_vector_length<uint8_t>(packing_args.vl_type) / packing_args.accumulator_element_size;
+
+ for (unsigned int n = 0; n < args.input_channels; n += vl)
+ {
+ const unsigned int todo = std::min(vl, args.input_channels - n);
+
+ if (packing_args.include_bias)
+ {
+ if (biases != nullptr)
+ {
+ memcpy(buffer, biases, todo * packing_args.bias_element_size);
+ biases += todo * packing_args.bias_element_size;
+ }
+ else
+ {
+ memset(buffer, 0, vl * packing_args.bias_element_size);
+ }
+
+ buffer += vl * packing_args.bias_element_size;
+ }
+
+ // Copy each of the weights in turn
+ unsigned int kx, ky;
+ for (int kindex = 0; packing_args.get_weight_pos(kindex, kx, ky); kindex++)
+ {
+ const auto src_ptr = weights + (kx*ld_weight_row + ky*ld_weight_col + n) * packing_args.weight_element_size;
+ memcpy(buffer, src_ptr, todo * packing_args.weight_element_size);
+ buffer += vl * packing_args.weight_element_size;
+ }
+ }
+}
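+
+// The packed layout is therefore a sequence of channel blocks, each holding
+// vl bias values followed by, for each kernel point in packing order, the
+// weights for the same vl channels.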
+
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
new file mode 100644
index 0000000000..5b5ae17806
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "depthwise.hpp"
+
+#include <functional>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+
+struct PackingArguments
+{
+ const unsigned int kernel_rows;
+ const unsigned int kernel_cols;
+ const size_t weight_element_size;
+ const bool include_bias;
+ const size_t bias_element_size;
+ arm_gemm::VLType vl_type;
+ const size_t accumulator_element_size;
+ const unsigned int accumulator_depth_vl;
+ std::function<bool(unsigned int, unsigned int &, unsigned int &)> get_weight_pos;
+
+ unsigned int kernel_points(void) const { return kernel_cols * kernel_rows; }
+
+ PackingArguments(
+ unsigned int kernel_rows,
+ unsigned int kernel_cols,
+ size_t weight_element_size,
+ bool include_bias,
+ size_t bias_element_size,
+ arm_gemm::VLType vl_type,
+ size_t accumulator_element_size,
+ unsigned int accumulator_depth_vl,
+ std::function<bool(unsigned int, unsigned int &, unsigned int &)> get_weight_pos
+ );
+};
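+
+// get_weight_pos maps a pack index to a kernel (row, column) position and
+// returns false once the index passes the last kernel point;
+// DepthfirstStrategyUntyped::get_kernel_packing_point provides the default
+// row-major implementation.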
+
+size_t get_storage_size_generic(
+ const PackingArguments &packing_args,
+ const DepthwiseArgs &args
+);
+
+void pack_parameters_generic(
+ const PackingArguments &packing_args,
+ const DepthwiseArgs &args,
+ void *buffer_raw,
+ const void *biases_raw,
+ const void *weights_raw,
+ size_t ld_weight_col,
+ size_t ld_weight_row
+);
+
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp
new file mode 100644
index 0000000000..a6389054d1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "generic_quantized_dot_product.hpp"
+#include <algorithm>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+namespace quantized {
+
+size_t get_storage_size(
+ const DepthwiseArgs &args,
+ const arm_gemm::VLType vl_type,
+ const unsigned int accumulator_depth_vl
+)
+{
+ // We produce VL<int32_t> channels at a time; for each of these blocks of
+ // channels we store a vector of biases, the interleaved weights, and the
+ // requantisation parameters.
+ const unsigned int iter_length = accumulator_depth_vl * arm_gemm::utils::get_vector_length<int32_t>(vl_type);
+ const unsigned int n_iters = args.input_channels * arm_gemm::iceildiv(args.channel_multiplier, iter_length);
+
+ // Compute the cost of storing the weights
+ const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(args.kernel_cols, 4u);
+
+ return n_iters * iter_length * (
+ sizeof(int32_t) + // Bias
+ 4 * n_dots_per_kernel_row * args.kernel_rows * sizeof(int8_t) + // Weights
+ 2 * sizeof(int32_t) // Requantisation parameters
+ );
+}
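+
+// Worked example: for a 3x3 int8 kernel, n_dots_per_kernel_row = 1 and each
+// channel slot therefore needs 4 (bias) + 4 * 1 * 3 (weights) + 8
+// (requantisation parameters) = 24 bytes.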
+
+template <typename T>
+void pack_parameters(
+ void *_buffer, const int32_t *biases,
+ const T *weights, size_t ld_weight_col, size_t ld_weight_row,
+ const DepthwiseArgs &args,
+ const arm_gemm::Requantize32 &qp,
+ const arm_gemm::VLType vl_type,
+ const unsigned int accumulator_depth_vl
+)
+{
+ auto buffer = static_cast<uint8_t *>(_buffer);
+ auto requant_muls = qp.per_channel_muls;
+ auto requant_shifts = qp.per_channel_right_shifts;
+
+ const unsigned int iter_length = accumulator_depth_vl * arm_gemm::utils::get_vector_length<int32_t>(vl_type);
+ const unsigned int n_iters_per_input_channel = arm_gemm::iceildiv(args.channel_multiplier, iter_length);
+ const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(args.kernel_cols, 4u);
+
+ const size_t iter_stride = iter_length * (
+ sizeof(int32_t) + // Bias
+ 4 * n_dots_per_kernel_row * args.kernel_rows * sizeof(T) + // Weights
+ 2 * sizeof(int32_t) // Requantisation parameters
+ );
+
+ ld_weight_col = (ld_weight_col == 0) ? args.input_channels * args.channel_multiplier : ld_weight_col;
+ ld_weight_row = (ld_weight_row == 0) ? args.kernel_cols * ld_weight_col : ld_weight_row;
+
+ for (unsigned int input_channel = 0; input_channel < args.input_channels; input_channel++)
+ {
+ auto buffer_input_channel = buffer + input_channel * n_iters_per_input_channel * iter_stride;
+ auto weights_input_channel = weights + input_channel * args.channel_multiplier;
+
+ for (unsigned int iter = 0; iter < n_iters_per_input_channel; iter++)
+ {
+ // Get a pointer to the start of this portion of the buffer, then derive
+ // pointers to the bias, weight and requantisation portions of this frame.
+ auto buffer_base = buffer_input_channel + iter_stride * iter;
+ auto buffer_biases = reinterpret_cast<int32_t *>(buffer_base);
+ auto buffer_weights = buffer_base + sizeof(int32_t) * iter_length;
+ auto buffer_requant_mul = reinterpret_cast<int32_t *>(
+ buffer_weights + args.kernel_rows * n_dots_per_kernel_row * 4 * iter_length);
+ auto buffer_requant_shift = buffer_requant_mul + iter_length;
+ auto weights_base = weights_input_channel + iter * iter_length;
+
+ // Now work through the data for this iteration on a channel-by-channel
+ // basis.
+ const auto this_iter_length = std::min<unsigned int>(
+ iter_length, args.channel_multiplier - iter * iter_length
+ );
+ for (unsigned int i = 0; i < this_iter_length; i++)
+ {
+ auto weights_channel = weights_base + i;
+
+ // Read the bias value; we modify it as we read the weights.
+ auto bias_value = biases == nullptr ? 0 : *(biases++);
+ int32_t elements_sum = 0;
+
+ // Read through the kernel; for each row, marshal together as many dot
+ // product terms as are required.
+ for (unsigned int ki = 0; ki < args.kernel_rows; ki++)
+ {
+ auto buffer_row = buffer_weights + i*4 + ki * 4 * n_dots_per_kernel_row * iter_length;
+ auto weights_row = weights_channel + ki * ld_weight_row;
+
+ unsigned int kj = 0;
+ for (; kj < args.kernel_cols; kj++)
+ {
+ // Determine the dot-product block and element to which we're writing
+ const auto dot = kj / 4;
+ const auto elem = kj % 4;
+
+ // Copy the value; include in the sum
+ const auto val = weights_row[kj * ld_weight_col];
+ buffer_row[dot * 4 * iter_length + elem] = val;
+ elements_sum += val;
+ }
+ for (; kj < 4 * n_dots_per_kernel_row; kj++)
+ {
+ const auto dot = kj / 4;
+ const auto elem = kj % 4;
+ buffer_row[dot * 4 * iter_length + elem] = 0;
+ }
+
+ buffer_row += 4 * n_dots_per_kernel_row * iter_length;
+ }
+
+ // Write back the bias value, corrected for the zero-point offsets
+ *(buffer_biases++) =
+ bias_value - qp.a_offset * elements_sum +
+ args.kernel_rows * args.kernel_cols * qp.a_offset * qp.b_offset;
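+ // This folds the input-independent terms of the requantised product into
+ // the bias: expanding sum((x - a_offset) * (w - b_offset)) gives
+ // sum(x*w) - b_offset*sum(x) - a_offset*sum(w) + N*a_offset*b_offset.
+ // The two terms which do not depend on the input x are absorbed here;
+ // the input-dependent sum(x) term cannot be precomputed and must be
+ // handled at run time.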
+
+ // Write out the requantisation parameters
+ *(buffer_requant_mul++) = qp.per_channel_requant ? *(requant_muls++) : qp.per_layer_mul;
+ *(buffer_requant_shift++) = qp.per_channel_requant ? *(requant_shifts++) : qp.per_layer_right_shift;
+ }
+ }
+ }
+}
+
+template void pack_parameters(void *, const int32_t *, const int8_t *, size_t, size_t, const DepthwiseArgs &, const arm_gemm::Requantize32 &, arm_gemm::VLType, unsigned int);
+template void pack_parameters(void *, const int32_t *, const uint8_t *, size_t, size_t, const DepthwiseArgs &, const arm_gemm::Requantize32 &, arm_gemm::VLType, unsigned int);
+
+} // namespace quantized
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp
new file mode 100644
index 0000000000..779d67d3f4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "generic.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+namespace quantized {
+
+size_t get_storage_size(
+ const DepthwiseArgs &args,
+ arm_gemm::VLType vl_type,
+ unsigned int accumulator_depth_vl=1
+);
+
+template <typename T>
+void pack_parameters(
+ void *buffer, const int32_t *biases,
+ const T *weights, size_t ld_weight_col, size_t ld_weight_row,
+ const DepthwiseArgs &args,
+ const arm_gemm::Requantize32 &qp,
+ arm_gemm::VLType vl_type,
+ unsigned int accumulator_depth_vl
+);
+
+} // namespace quantized
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
index cb49a243af..76f38eb335 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,90 +29,30 @@ namespace depthwise {
#if defined(ARM_COMPUTE_ENABLE_SVE)
-class interleave_sve_u8q_3x3_dot
+struct interleave_sve_u8q_3x3_dot
{
- public:
- static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
- static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
};
-class interleave_sve_s8q_3x3_dot
+struct interleave_sve_s8q_3x3_dot
{
- public:
- static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
- static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_sve_u8q_3x3_mla
-{
- public:
- static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
- static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_sve_s8q_3x3_mla
-{
- public:
- static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
- static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_sve_u8q_5x5_mla
-{
- public:
- static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
- static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_sve_s8q_5x5_mla
-{
- public:
- static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
- static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
};
#endif // defined(ARM_COMPUTE_ENABLE_SVE)
-class interleave_a64_u8q_3x3_dot
-{
- public:
- static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
- static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_a64_s8q_3x3_dot
-{
- public:
- static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
- static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_a64_u8q_3x3_mla
-{
- public:
- static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
- static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_a64_s8q_3x3_mla
-{
- public:
- static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
- static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_a64_u8q_5x5_mla
+struct interleave_a64_u8q_3x3_dot
{
- public:
- static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
- static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
};
-class interleave_a64_s8q_5x5_mla
+struct interleave_a64_s8q_3x3_dot
{
- public:
- static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
- static size_t get_packed_size(const DepthwiseArgs &);
+ static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+ static size_t get_packed_size(const DepthwiseArgs &);
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index be50d1c535..d2db12535f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,19 +36,16 @@ namespace depthwise {
void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,60 +56,13 @@ class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
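
This header shows the refactoring pattern the rest of the patch repeats for every kernel specialisation: the per-class indirect_kernel/direct_kernel wrappers and the get_* accessor boilerplate move into a typed base class, so each specialisation now only stores two function pointers and forwards them through get_indirect_kernel()/get_direct_kernel(), while the tile geometry is passed to the base constructor as (output_size, kernel_size, stride). Below is a sketch of the assumed shape of that base; the real DepthwiseDepthfirstStrategy lives in depthwise_depthfirst.hpp, which this diff view does not reproduce, so everything beyond the constructor arguments and the two kernel-pointer signatures is an assumption:

#include <cstdint>

template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
class DepthwiseDepthfirstStrategySketch
{
  unsigned int m_output_size, m_kernel_size, m_stride;

 public:
  // Signatures match the _indirect_impl/_direct_impl declarations above.
  using IndirectKernelType = void (*)(const TInput *const *, TOutput *const *,
                                      const void *, unsigned int, TAccum, TAccum);
  using DirectKernelType = void (*)(unsigned int, unsigned int,
                                    const TInput *, int64_t, int64_t,
                                    TOutput *, int64_t, int64_t,
                                    const void *, unsigned int, TAccum, TAccum);

  DepthwiseDepthfirstStrategySketch(unsigned int output_size,
                                    unsigned int kernel_size,
                                    unsigned int stride)
  : m_output_size(output_size), m_kernel_size(kernel_size), m_stride(stride) {}
  virtual ~DepthwiseDepthfirstStrategySketch() = default;

  // The per-class input_rows/input_cols constants were removable because the
  // input tile size follows from the constructor arguments, e.g.
  // output 2, kernel 3, stride 1 -> (2 - 1) * 1 + 3 = 4.
  unsigned int get_input_size() const
  {
    return (m_output_size - 1) * m_stride + m_kernel_size;
  }

  // Each concrete kernel header now only supplies these two pointers.
  virtual IndirectKernelType get_indirect_kernel() const = 0;
  virtual DirectKernelType get_direct_kernel() const = 0;
};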
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index 39fa7f62fd..75368dfcf9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,19 +36,16 @@ namespace depthwise {
void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,60 +56,13 @@ class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 3;
constexpr static unsigned int output_cols = 3;
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+ a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(3, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
index e0abca91a2..faf6c91181 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -1283,7 +1283,8 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(
:
: [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (&params_struct)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" );
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
}
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 1e0d922be4..4f0de6b61c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,19 +36,16 @@ namespace depthwise {
void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,60 +56,13 @@ class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 4;
constexpr static unsigned int output_cols = 4;
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
-
- a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+ a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(4, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index d89ae0cd6d..d52f48064f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,19 +36,16 @@ namespace depthwise {
void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,60 +56,13 @@ class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+ a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 2) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 6b5f91fa64..81a608e349 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,19 +36,16 @@ namespace depthwise {
void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
@@ -59,60 +56,13 @@ class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
-
- a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 5, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
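
A quick cross-check over the five fp16 variants above: the deleted input_rows/input_cols constants are exactly what the new constructor arguments imply under input = (output - 1) * stride + kernel, so no information is lost by removing them. As compile-time arithmetic:

// Compile-time cross-check (sketch): the deleted constants match the new
// constructor arguments under input = (output - 1) * stride + kernel.
constexpr unsigned int input_size(unsigned int out, unsigned int k, unsigned int s)
{
  return (out - 1) * s + k;
}
static_assert(input_size(2, 3, 1) == 4, "3x3 s1, output 2x2 -> input 4x4");
static_assert(input_size(3, 3, 1) == 5, "3x3 s1, output 3x3 -> input 5x5");
static_assert(input_size(4, 3, 1) == 6, "3x3 s1, output 4x4 -> input 6x6");
static_assert(input_size(2, 3, 2) == 5, "3x3 s2, output 2x2 -> input 5x5");
static_assert(input_size(2, 5, 1) == 6, "5x5 s1, output 2x2 -> input 6x6");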
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
index 3468b70f29..1ccd3408e2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,32 +28,24 @@
#pragma once
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16);
-struct a64_fp16_nhwc_generic_output9_mla_depthfirst
+class a64_fp16_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<__fp16, __fp16, __fp16, __fp16>
{
- typedef __fp16 bias_type;
- typedef __fp16 input_type;
- typedef __fp16 weight_type;
- typedef __fp16 return_type;
+ KernelType kernel = a64_fp16_nhwc_generic_output9_mla_depthfirst_impl;
- typedef void (*kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16);
+ public:
+ a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<__fp16, __fp16, __fp16, __fp16>(9, arm_gemm::VLType::None) {}
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- constexpr static unsigned int n_output_points = 9;
-
- kern_type kernel = a64_fp16_nhwc_generic_output9_mla_depthfirst_impl;
-
- a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+  KernelType get_kernel() const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
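
The generic-output kernel is ported to a different base, GenericDepthfirstKernelStrategy, which absorbs the old bias_type/input_type/weight_type/return_type typedefs and turns the n_output_points and vl_type constants into constructor arguments. A sketch of the assumed base shape; only the constructor arguments and the KernelType signature are taken from the diff above, and the roles of the opaque pointer and count arguments are guesses:

template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
class GenericDepthfirstKernelStrategySketch
{
  unsigned int m_n_output_points;
  arm_gemm::VLType m_vl_type;

 public:
  using KernelType = void (*)(const TInput *const *, TOutput *const *,
                              const void *,      // presumably packed parameters
                              const void *,      // presumably bias
                              unsigned int,      // presumably kernel points
                              unsigned int,      // presumably channel count
                              TAccum, TAccum);   // activation min / max

  GenericDepthfirstKernelStrategySketch(unsigned int n_output_points,
                                        arm_gemm::VLType vl_type)
  : m_n_output_points(n_output_points), m_vl_type(vl_type) {}
  virtual ~GenericDepthfirstKernelStrategySketch() = default;

  unsigned int get_n_output_points() const { return m_n_output_points; }
  arm_gemm::VLType get_vl_type() const { return m_vl_type; }
  virtual KernelType get_kernel() const = 0;
};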
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
index 8ac79f82fa..423ee4190c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -524,4 +524,4 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index a02a2b2984..8fcbce2cfe 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,35 +28,25 @@
#pragma once
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16);
-struct a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+struct a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<__fp16, __fp16, __fp16, __fp16>
{
- typedef __fp16 bias_type;
- typedef __fp16 input_type;
- typedef __fp16 weight_type;
- typedef __fp16 return_type;
-
- typedef void (*kern_type)(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- constexpr static unsigned int output_rows(void) { return 2; };
- constexpr static unsigned int output_cols(void) { return 8; };
-
- constexpr static unsigned int output_col_regs(void) { return 1; };
-
- kern_type kernel = a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
-
- a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<__fp16, __fp16, __fp16, __fp16>;
+ a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
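
For the multiplier kernel, the Parent(2, 8, arm_gemm::VLType::None) call replaces the deleted output_rows()/output_cols()/vl_type members. The removed output_col_regs() == 1 also falls out of the geometry, assuming one 128-bit NEON register holds eight fp16 lanes (float16x8_t):

// Sketch: 8 output columns fit in exactly one float16x8_t register.
constexpr unsigned int output_cols = 8;
constexpr unsigned int fp16_lanes_per_neon_reg = 8;
static_assert(output_cols / fp16_lanes_per_neon_reg == 1,
              "matches the removed output_col_regs()");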
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
index 7ed7c52db2..d9fc1403b2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
namespace arm_conv {
namespace depthwise {
@@ -1046,4 +1046,4 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index a888eb5776..420e95384d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
private:
- typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
- indirect_kern_type m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
- direct_kern_type m_direct_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
public:
- typedef float return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const float *const *>(input_ptrs),
- reinterpret_cast<float *const *>(outptrs),
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const float *>(inptr), ld_input_row, ld_input_col,
- static_cast<float *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
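
The guard rewrite from #if __aarch64__ to #if defined(__aarch64__) in the fp32 headers is behaviour-preserving but more robust: a bare #if silently evaluates an undefined macro as 0, whereas defined() states the intent explicitly and does not trip -Wundef. For example:

// Both forms compile the block only on AArch64, but only the second is
// immune to -Wundef when the macro is absent:
#if __aarch64__            // an undefined macro quietly becomes 0
#endif
#if defined(__aarch64__)   // explicit presence test
#endif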
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index 01bb06a561..0e9a3ba3fc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
private:
- typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
- indirect_kern_type m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
- direct_kern_type m_direct_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
public:
- typedef float return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 3;
constexpr static unsigned int output_cols = 3;
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+ a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<float, float, float, float>(3, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const float *const *>(input_ptrs),
- reinterpret_cast<float *const *>(outptrs),
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const float *>(inptr), ld_input_row, ld_input_col,
- static_cast<float *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 17084b57d5..6c897d6eaa 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
private:
- typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
- indirect_kern_type m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
- direct_kern_type m_direct_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
public:
- typedef float return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 4;
constexpr static unsigned int output_cols = 4;
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
-
- a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+ a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<float, float, float, float>(4, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const float *const *>(input_ptrs),
- reinterpret_cast<float *const *>(outptrs),
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const float *>(inptr), ld_input_row, ld_input_col,
- static_cast<float *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index f23862b699..ff521fb2ca 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__
+#if defined(__aarch64__)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
private:
- typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
- indirect_kern_type m_indirect_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
- direct_kern_type m_direct_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
public:
- typedef float return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+ a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 2) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const float *const *>(input_ptrs),
- reinterpret_cast<float *const *>(outptrs),
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const float *>(inptr), ld_input_row, ld_input_col,
- static_cast<float *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index e4bfbe6783..c88a7d57ce 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,19 +36,16 @@ namespace depthwise {
void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
private:
- typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
- indirect_kern_type m_indirect_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
- direct_kern_type m_direct_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
public:
- typedef float return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::None;
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
@@ -59,60 +56,13 @@ class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
-
- a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 5, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const float *const *>(input_ptrs),
- reinterpret_cast<float *const *>(outptrs),
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const float *>(inptr), ld_input_row, ld_input_col,
- static_cast<float *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
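
The hunk above is the template for most of the header edits in this patch: a duck-typed struct, whose constexpr members and function-pointer typedefs were read by template code, becomes a class deriving from DepthwiseDepthfirstStrategy, which owns the geometry and hands out the kernels through virtual getters. Below is a minimal sketch of that shape, assuming a simplified base class; StrategyBase, KernelFn and example_kernel_impl are illustrative names, not the library's interface, and the input-size arithmetic assumes the usual relation input = kernel + (output - 1) * stride, which reproduces the removed input_rows = 6 for this 5x5, stride-1, output-2 kernel.

// A minimal sketch of the new shape, assuming a simplified base class; the
// names StrategyBase, KernelFn and example_kernel_impl are illustrative and
// not the library's actual interface.
#include <iostream>

using KernelFn = void (*)(const float *in, float *out, unsigned int n_channels);

// Stand-in for a generated kernel entry point.
void example_kernel_impl(const float *in, float *out, unsigned int n_channels)
{
    for (unsigned int i = 0; i < n_channels; i++)
    {
        out[i] = in[i];  // placeholder body
    }
}

class StrategyBase
{
    unsigned int m_output_size, m_kernel_size, m_stride;

    public:
    StrategyBase(unsigned int output_size, unsigned int kernel_size, unsigned int stride)
    : m_output_size(output_size), m_kernel_size(kernel_size), m_stride(stride) {}
    virtual ~StrategyBase() = default;

    // Geometry is computed once in the base instead of being re-declared as
    // constexpr members (plus get_*() overrides) in every kernel class.
    unsigned int get_output_rows() const { return m_output_size; }
    unsigned int get_input_rows() const
    {
        return m_kernel_size + (m_output_size - 1) * m_stride;
    }

    virtual KernelFn get_kernel() const = 0;
};

class ExampleStrategy : public StrategyBase
{
    KernelFn m_kernel = example_kernel_impl;

    public:
    ExampleStrategy() : StrategyBase(2, 5, 1) {}  // output 2, kernel 5, stride 1
    KernelFn get_kernel() const override { return m_kernel; }
};

int main()
{
    ExampleStrategy s;
    std::cout << s.get_input_rows() << "\n";  // prints 6, the removed input_rows constant
}

Folding the geometry into a constructed base is presumably what lets the driver handle every kernel through one interface rather than re-instantiating template code per strategy, which fits the commit's goal of reducing duplication.
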
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
index 0f6cecdc56..6fa02b781e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,28 +28,24 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
-struct a64_fp32_nhwc_generic_output9_mla_depthfirst
+class a64_fp32_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<float, float, float, float>
{
- typedef float bias_type;
- typedef float input_type;
- typedef float weight_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
- constexpr static unsigned int n_output_points = 9;
- kern_type kernel = a64_fp32_nhwc_generic_output9_mla_depthfirst_impl;
-
- a64_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+ KernelType kernel = a64_fp32_nhwc_generic_output9_mla_depthfirst_impl;
+ public:
+ a64_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<float, float, float, float>(9, arm_gemm::VLType::None) {}
+ KernelType get_kernel() const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
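
Two things happen to the generic kernels: the header gains an `#if defined(__aarch64__)` guard so it only compiles for AArch64 targets, and the per-class constants (n_output_points, vl_type) move into arguments of the GenericDepthfirstKernelStrategy constructor. A hypothetical, simplified stand-in for that base class, showing only the constructor-based wiring:

// Hypothetical, simplified stand-in for GenericDepthfirstKernelStrategy;
// only the constructor-based wiring is the point here.
enum class VLType { None, SVE };

class GenericKernelBase
{
    unsigned int m_n_output_points;
    VLType m_vl_type;

    public:
    GenericKernelBase(unsigned int n_output_points, VLType vl_type)
    : m_n_output_points(n_output_points), m_vl_type(vl_type) {}
    virtual ~GenericKernelBase() = default;

    // Read by the driver instead of per-class constexpr members.
    unsigned int get_n_output_points() const { return m_n_output_points; }
    VLType get_vl_type() const { return m_vl_type; }
};

class ExampleGenericKernel : public GenericKernelBase
{
    public:
    ExampleGenericKernel() : GenericKernelBase(9, VLType::None) {}  // nine output points, fixed-length NEON
};
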
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
index 60f5ddd68f..2ec0525226 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,39 +28,34 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
-struct a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst
+struct a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
{
- typedef float bias_type;
- typedef float input_type;
- typedef float weight_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
+ using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- constexpr static unsigned int output_rows = 3;
- constexpr static unsigned int output_cols = 3;
-
- constexpr static unsigned int input_rows = 7;
- constexpr static unsigned int input_cols = 7;
- constexpr static unsigned int input_col_quads = 2;
- kern_type kernel = a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
- a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *) {}
+ a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(3, 3, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+ Parent::KernelType kernel = a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
\ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
index 92d6a757f2..5ae8dd3653 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,39 +28,34 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
-struct a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst
+struct a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
{
- typedef float bias_type;
- typedef float input_type;
- typedef float weight_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
+ using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 4;
-
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 8;
- constexpr static unsigned int input_col_quads = 2;
- kern_type kernel = a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
- a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *) {}
+ a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
+ Parent::KernelType kernel = a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 2cc2f7c103..d60e15ec84 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,31 +28,25 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
-struct a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+struct a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>
{
- typedef float bias_type;
- typedef float input_type;
- typedef float weight_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- constexpr static unsigned int output_rows(void) { return 2; };
- constexpr static unsigned int output_cols(void) { return 8; };
-
- constexpr static unsigned int output_col_regs(void) { return 2; };
-
- kern_type kernel = a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
-
- a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>;
+ a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index c76cb9906f..62e4a82fbf 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,39 +34,40 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
-struct a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+class a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_dot::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_dot::get_packed_size;
-
- kern_type kernel = a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
-
- a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+ a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_a64_s8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_a64_s8q_3x3_dot::pack_parameters(
+ args.input_channels, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
};
} // namespace depthwise
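
For the quantized strategies, the constexpr function-pointer members pack_parameters and get_packed_size become virtual methods that forward to the same interleave_a64_s8q_3x3_dot free functions. A sketch of the shape of that forwarding, with a simplified Args standing in for DepthwiseArgs and an illustrative interleave_example namespace (both ours, not the library's):

// Sketch only: Args, interleave_example and the argument lists are simplified
// stand-ins. The point is that sizing and packing are now reachable through a
// base-class pointer instead of per-class constexpr function pointers.
#include <cstddef>
#include <cstdint>

struct Args
{
    unsigned int input_channels;
};

namespace interleave_example {
size_t get_packed_size(const Args &args)
{
    // Placeholder arithmetic: one bias word plus nine weight bytes per channel.
    return args.input_channels * (sizeof(int32_t) + 9 * sizeof(int8_t));
}

void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *)
{
    // Would interleave bias words and weight bytes into the buffer.
}
}  // namespace interleave_example

class QuantStrategyBase
{
    public:
    virtual ~QuantStrategyBase() = default;
    virtual size_t get_storage_size(const Args &) const = 0;
    virtual void pack_parameters(const Args &, void *buffer, const void *biases, const void *weights) const = 0;
};

class ExampleQuantStrategy : public QuantStrategyBase
{
    public:
    size_t get_storage_size(const Args &args) const override
    {
        return interleave_example::get_packed_size(args);
    }

    void pack_parameters(const Args &args, void *buffer, const void *biases, const void *weights) const override
    {
        interleave_example::pack_parameters(
            args.input_channels, buffer,
            static_cast<const int32_t *>(biases),
            static_cast<const int8_t *>(weights));
    }
};
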
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index ed8cd4861e..f8245fc5d9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,15 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *params,
+ const int32_t *, // Bias, should be wrapped into the parameters
+ const arm_gemm::Requantize32& qp,
+ const int32_t *, const int32_t *, // Requant parameters, also wrapped
+ int8_t *const *const outptrs
+)
{
__asm__ __volatile__(
"ldp x13, x12, [%x[inptrs], #0x0]\n"
@@ -1307,7 +1315,7 @@ void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *cons
"34:" // End
"add SP, SP, #0x80\n"
: [params] "+&r" (params)
- : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
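
The kernel entry point itself changes ABI: the channel count moves to the front, the output pointers to the back, and the bias and per-channel requantisation pointers become unnamed placeholders because their data is expected to be pre-packed into `params`. The two shapes side by side, as stub declarations (the names kernel_old and kernel_new are ours, not the library's):

// Signature sketch only, no bodies. Under the new convention the bias and
// per-channel requant pointers still appear in the argument list but are
// assumed to have been folded into the packed parameter block, which is why
// the real kernel leaves them unnamed.
#include <cstdint>

namespace arm_gemm { struct Requantize32; }

// Old shape: channel count last, everything reached through params and qp.
void kernel_old(const int8_t *const *inptrs, int8_t *const *outptrs,
                const void *params, uint64_t n_channels,
                const arm_gemm::Requantize32 &qp);

// New shape: channel count first, outputs last, packed parameters typed as a
// byte stream the kernel walks through.
void kernel_new(unsigned int n_channels,
                const int8_t *const *inptrs,
                const int8_t *params,
                const int32_t *bias_unused,
                const arm_gemm::Requantize32 &qp,
                const int32_t *requant_muls_unused,
                const int32_t *requant_shifts_unused,
                int8_t *const *outptrs);
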
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 76c927abcb..c1baab43e5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,37 +36,24 @@ namespace depthwise {
void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
-struct a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+class a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size;
- kern_type kernel = a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
- a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+ Parent::KernelType kernel = a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
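
get_accumulator_depth_vl() is a new hint; this kernel returns 2 where the base presumably defaults to 1. One plausible reading, consistent with the commit's aim of sharing more data in lower-level caches, is that callers scale their channel blocking by this factor. The helper below is a hypothetical illustration of that arithmetic, not code from the patch:

// Illustrative only: channel_block and neon_vector_bytes are our assumptions,
// not the library's API.
constexpr unsigned int neon_vector_bytes = 16;  // fixed-width NEON registers

unsigned int channel_block(unsigned int accumulator_depth_vl, unsigned int bytes_per_element)
{
    // A hint of 2 keeps two vector-lengths of accumulators live per step,
    // e.g. channel_block(2, 4) == 8 int32 accumulator lanes, matching the
    // eight channels per iteration visible in the assembly below.
    return accumulator_depth_vl * (neon_vector_bytes / bytes_per_element);
}
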
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 3001276fb5..0e8d16fc40 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const int8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -91,513 +91,497 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
+ "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
"ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "mov x17, #0x0\n"
- "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x24, x19, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x19, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x19, %[offsetof_Requantize32_minval]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x19, x19, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v22.16b }, [x24]\n"
+ "ld1r { v12.16b }, [x23]\n"
+ "lsr x16, x8, #0x3\n"
+ "ld1r { v14.8h }, [x21]\n"
+ "ld1r { v17.8h }, [x20]\n"
"mov x15, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "add x14, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x14, #0x0\n"
+ "ld1r { v15.8h }, [x19]\n"
"ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "lsr x12, x8, #0x3\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
"ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v14.16b }, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v9.16b }, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1r { v15.4s }, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v24.4s }, [x20]\n"
- "ld1r { v12.4s }, [x19]\n"
- "ldp x10, x9, [x21, #0x0]\n"
- "ldp x28, x27, [x21, #0x10]\n"
- "cbz x12, 3f\n"
- "subs x12, x12, #0x1\n"
+ "ldp x10, x9, [x22, #0x0]\n"
+ "ldp x28, x27, [x22, #0x10]\n"
+ "cbz x16, 3f\n"
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
"ldr q13, [x19, #0x0]\n"
- "mov v17.16b, v13.16b\n"
- "ldr q19, [x19, #0x10]\n"
+ "subs x16, x16, #0x1\n"
+ "mov v19.16b, v13.16b\n"
+ "ldr q26, [x19, #0x10]\n"
"add x19, x19, #0x20\n"
- "mov v16.16b, v13.16b\n"
"str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v23.16b, v13.16b\n"
- "ldr d0, [x16, #0x0]\n"
- "ssubl v0.8h, v0.8b, v9.8b\n"
- "mov v25.16b, v19.16b\n"
- "ldr d1, [x16, #0x8]\n"
- "mov v21.16b, v19.16b\n"
- "ldr d2, [x16, #0x10]\n"
- "ssubl v1.8h, v1.8b, v9.8b\n"
- "mov v20.16b, v19.16b\n"
- "ldr d3, [x16, #0x18]\n"
- "ldr d4, [x16, #0x20]\n"
- "ssubl v2.8h, v2.8b, v9.8b\n"
- "ldr d5, [x16, #0x28]\n"
- "ssubl v3.8h, v3.8b, v9.8b\n"
- "ldr d6, [x16, #0x30]\n"
- "ldr d7, [x16, #0x38]\n"
- "ssubl v4.8h, v4.8b, v9.8b\n"
- "ldr d8, [x16, #0x40]\n"
- "ssubl v5.8h, v5.8b, v9.8b\n"
- "ldp x23, x22, [x14, #0x0]\n"
- "ssubl v6.8h, v6.8b, v9.8b\n"
- "ldp x21, x20, [x14, #0x10]\n"
- "ssubl v7.8h, v7.8b, v9.8b\n"
- "ssubl v8.8h, v8.8b, v9.8b\n"
- "ldr x19, [x14, #0x20]\n"
- "ldr d31, [x23, x17]\n"
- "ssubl v31.8h, v31.8b, v14.8b\n"
- "ldr d30, [x22, x17]\n"
- "ldr d29, [x21, x17]\n"
- "ssubl v30.8h, v30.8b, v14.8b\n"
- "ldr d28, [x20, x17]\n"
- "ldr d27, [x19, x17]\n"
- "ssubl v29.8h, v29.8b, v14.8b\n"
- "ssubl v28.8h, v28.8b, v14.8b\n"
- "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v11.16b, v26.16b\n"
+ "mov v18.16b, v13.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v24.16b, v26.16b\n"
+ "mov v9.16b, v13.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v23.16b, v26.16b\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ssubl v4.8h, v4.8b, v12.8b\n"
+ "ldr x19, [x12, #0x20]\n"
+ "ldr d31, [x23, x15]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ldr d30, [x22, x15]\n"
+ "ldr d29, [x21, x15]\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "ldr d28, [x20, x15]\n"
+ "ldr d27, [x19, x15]\n"
+ "ssubl v31.8h, v31.8b, v22.8b\n"
+ "ssubl v30.8h, v30.8b, v22.8b\n"
+ "ssubl v29.8h, v29.8b, v22.8b\n"
+ "ssubl v28.8h, v28.8b, v22.8b\n"
+ "ssubl v27.8h, v27.8b, v22.8b\n"
"beq 2f\n"
"1:" // Loop
"smlal v13.4s, v31.4h, v4.4h\n"
- "ldr x21, [x14, #0x28]\n"
- "add x16, x16, #0x48\n"
- "smlal2 v19.4s, v31.8h, v4.8h\n"
- "ldr x20, [x14, #0x30]\n"
- "subs x12, x12, #0x1\n"
- "smlal v17.4s, v31.4h, v3.4h\n"
- "ldr x26, [x14, #0x38]\n"
- "smlal2 v25.4s, v31.8h, v3.8h\n"
- "ldr x25, [x14, #0x40]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "ldr x19, [x14, #0x48]\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr x24, [x14, #0x50]\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "ldr x23, [x14, #0x58]\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x17]\n"
- "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal2 v26.4s, v31.8h, v4.8h\n"
+ "ldr x21, [x12, #0x28]\n"
+ "ldr x26, [x12, #0x38]\n"
+ "smlal v19.4s, v31.4h, v3.4h\n"
+ "smlal2 v11.4s, v31.8h, v3.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr x25, [x12, #0x40]\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "ldr x22, [x14, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "ldr d30, [x19, x17]\n"
- "ssubl v30.8h, v30.8b, v14.8b\n"
- "smlal v17.4s, v29.4h, v2.4h\n"
- "ldr x21, [x14, #0x68]\n"
- "smlal2 v25.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x17]\n"
- "ssubl v29.8h, v29.8b, v14.8b\n"
+ "smlal2 v26.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x12, #0x48]\n"
+ "ldr d30, [x19, x15]\n"
+ "smlal v19.4s, v29.4h, v2.4h\n"
+ "smlal2 v11.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "ssubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v24.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x12, #0x50]\n"
+ "ldr x23, [x12, #0x58]\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "smlal2 v23.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x15]\n"
+ "ssubl v31.8h, v31.8b, v22.8b\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "ldr x20, [x14, #0x70]\n"
- "smlal2 v19.4s, v28.8h, v5.8h\n"
- "ldr x19, [x14, #0x78]\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "ldr q26, [x13, #0x0]\n"
- "smlal2 v25.4s, v28.8h, v4.8h\n"
- "ldr q10, [x11, #0x0]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "ldr q11, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "smlal2 v21.4s, v28.8h, v2.8h\n"
- "ldr q18, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x17]\n"
- "ssubl v28.8h, v28.8b, v14.8b\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v21.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x17]\n"
- "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal2 v26.4s, v28.8h, v5.8h\n"
+ "ssubl v30.8h, v30.8b, v22.8b\n"
+ "ldr x22, [x12, #0x60]\n"
+ "smlal v19.4s, v28.4h, v4.4h\n"
+ "smlal2 v11.4s, v28.8h, v4.8h\n"
+ "ldr x21, [x12, #0x68]\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v18.4s, v28.4h, v2.4h\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "ldr x19, [x12, #0x78]\n"
+ "ldr q21, [x13, #0x0]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x15]\n"
+ "ssubl v28.8h, v28.8b, v22.8b\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v19.4s, v27.8h, v7.8h\n"
- "smlal v17.4s, v27.4h, v6.4h\n"
- "smlal2 v25.4s, v27.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "smlal v23.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal2 v26.4s, v27.8h, v7.8h\n"
+ "ldr q25, [x11, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "smlal v19.4s, v27.4h, v6.4h\n"
+ "smlal2 v11.4s, v27.8h, v6.8h\n"
+ "ldr q16, [x11, #0x10]\n"
+ "add x17, x17, #0x48\n"
+ "smlal v18.4s, v31.4h, v6.4h\n"
+ "smlal2 v24.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x15]\n"
+ "ssubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
+ "subs x16, x16, #0x1\n"
+ "add x13, x13, #0x20\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x17]\n"
- "ssubl v29.8h, v29.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v25.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x17]\n"
- "ssubl v28.8h, v28.8b, v14.8b\n"
+ "smlal2 v26.4s, v28.8h, v1.8h\n"
+ "add x11, x11, #0x20\n"
+ "smlal v19.4s, v28.4h, v0.4h\n"
+ "smlal2 v11.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x15]\n"
+ "ssubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v24.4s, v27.8h, v4.8h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "ssubl v29.8h, v29.8b, v22.8b\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v25.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x17]\n"
- "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal2 v26.4s, v31.8h, v2.8h\n"
+ "smlal v19.4s, v31.4h, v1.4h\n"
+ "smlal2 v11.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x15]\n"
+ "ssubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v18.4s, v30.4h, v5.4h\n"
+ "smlal v9.4s, v30.4h, v4.4h\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v19.4s, v30.8h, v8.8h\n"
- "smlal v17.4s, v30.4h, v7.4h\n"
- "smlal2 v25.4s, v30.8h, v7.8h\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v21.4s, v30.8h, v5.8h\n"
- "smlal v23.4s, v30.4h, v4.4h\n"
- "smlal2 v20.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x17]\n"
- "ssubl v30.8h, v30.8b, v14.8b\n"
+ "smlal2 v26.4s, v30.8h, v8.8h\n"
+ "smlal v19.4s, v30.4h, v7.4h\n"
+ "smlal2 v11.4s, v30.8h, v7.8h\n"
+ "smlal2 v24.4s, v30.8h, v5.8h\n"
+ "smlal2 v23.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x15]\n"
+ "ssubl v30.8h, v30.8b, v22.8b\n"
+ "smlal v18.4s, v29.4h, v0.4h\n"
+ "smlal v9.4s, v28.4h, v2.4h\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v19.4s, v29.8h, v3.8h\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v21.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x17]\n"
- "ssubl v29.8h, v29.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v5.4h\n"
- "smlal2 v25.4s, v28.8h, v5.8h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal2 v20.4s, v28.8h, v2.8h\n"
- "ldr d28, [x19, x17]\n"
- "add x17, x17, #0x8\n"
+ "smlal2 v26.4s, v29.8h, v3.8h\n"
+ "smlal2 v24.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "smlal2 v23.4s, v28.8h, v2.8h\n"
+ "ssubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v3.4h\n"
+ "smlal v9.4s, v30.4h, v5.4h\n"
+ "smlal v19.4s, v28.4h, v5.4h\n"
+ "smlal2 v11.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x19, x15]\n"
+ "ssubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v24.4s, v31.8h, v3.8h\n"
+ "smlal2 v23.4s, v30.8h, v5.8h\n"
+ "add x15, x15, #0x8\n"
+ "smlal v18.4s, v29.4h, v7.4h\n"
+ "smlal v9.4s, v29.4h, v6.4h\n"
+ "smlal2 v24.4s, v29.8h, v7.8h\n"
+ "smlal2 v23.4s, v29.8h, v6.8h\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "ssubl v28.8h, v28.8b, v14.8b\n"
- "smlal2 v19.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v3.8h\n"
- "smlal v17.4s, v30.4h, v8.4h\n"
- "smlal2 v25.4s, v30.8h, v8.8h\n"
- "smlal v23.4s, v30.4h, v5.4h\n"
- "smlal2 v20.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v21.4s, v29.8h, v7.8h\n"
- "smlal v23.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v29.8h, v6.8h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v21.4s, v28.8h, v8.8h\n"
- "smlal v23.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v13.4s, v13.4s, v26.4s\n"
- "sqrdmulh v19.4s, v19.4s, v11.4s\n"
- "sqrdmulh v17.4s, v17.4s, v26.4s\n"
- "sqrdmulh v25.4s, v25.4s, v11.4s\n"
- "and v22.16b, v13.16b, v10.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v28.16b, v19.16b, v18.16b\n"
- "and v3.16b, v17.16b, v10.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v6.16b, v25.16b, v18.16b\n"
- "sqrdmulh v16.4s, v16.4s, v26.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqrdmulh v21.4s, v21.4s, v11.4s\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v26.4s\n"
- "and v0.16b, v16.16b, v10.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v28.4s\n"
- "sqadd v17.4s, v17.4s, v3.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "and v29.16b, v21.16b, v18.16b\n"
+ "smlal v19.4s, v30.4h, v8.4h\n"
+ "sqdmulh v13.4s, v13.4s, v21.4s\n"
+ "smlal v18.4s, v28.4h, v8.4h\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "sqdmulh v19.4s, v19.4s, v21.4s\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "smlal2 v11.4s, v30.8h, v8.8h\n"
+ "sqdmulh v18.4s, v18.4s, v21.4s\n"
+ "smlal2 v24.4s, v28.8h, v8.8h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
+ "sqdmulh v9.4s, v9.4s, v21.4s\n"
+ "and v7.16b, v13.16b, v25.16b\n"
+ "sqdmulh v26.4s, v26.4s, v10.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqdmulh v11.4s, v11.4s, v10.4s\n"
+ "and v21.16b, v18.16b, v25.16b\n"
+ "sqdmulh v24.4s, v24.4s, v10.4s\n"
+ "and v20.16b, v9.16b, v25.16b\n"
+ "sqdmulh v23.4s, v23.4s, v10.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v29.16b, v26.16b, v16.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v10.16b, v11.16b, v16.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v31.16b, v24.16b, v16.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v23.16b, v16.16b\n"
+ "sqadd v13.4s, v13.4s, v7.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "add v13.4s, v13.4s, v15.4s\n"
- "srshl v19.4s, v19.4s, v18.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v18.4s\n"
- "smin v13.4s, v13.4s, v12.4s\n"
- "add v19.4s, v19.4s, v15.4s\n"
- "add v17.4s, v17.4s, v15.4s\n"
- "smax v13.4s, v13.4s, v24.4s\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "smin v17.4s, v17.4s, v12.4s\n"
- "add v25.4s, v25.4s, v15.4s\n"
- "smax v19.4s, v19.4s, v24.4s\n"
- "smax v17.4s, v17.4s, v24.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "uzp1 v13.16b, v13.16b, v19.16b\n"
- "sqadd v16.4s, v16.4s, v0.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v25.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v25.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "srshl v9.4s, v9.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "srshl v26.4s, v26.4s, v16.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "srshl v24.4s, v24.4s, v16.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "srshl v23.4s, v23.4s, v16.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "sqxtn2 v13.8h, v26.4s\n"
+ "sqxtn2 v19.8h, v11.4s\n"
+ "sqxtn2 v18.8h, v24.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v19.8h, v19.8h, v14.8h\n"
+ "sqadd v18.8h, v18.8h, v14.8h\n"
+ "sqadd v9.8h, v9.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smax v18.8h, v18.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smin v13.8h, v13.8h, v15.8h\n"
+ "smin v19.8h, v19.8h, v15.8h\n"
+ "smin v18.8h, v18.8h, v15.8h\n"
+ "smin v9.8h, v9.8h, v15.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x10, x15]\n"
- "smax v25.4s, v25.4s, v24.4s\n"
- "sqadd v21.4s, v21.4s, v29.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "and v3.16b, v23.16b, v10.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "uzp1 v17.16b, v17.16b, v25.16b\n"
- "add v16.4s, v16.4s, v15.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d17, [x9, x15]\n"
- "smin v16.4s, v16.4s, v12.4s\n"
- "sqrdmulh v20.4s, v20.4s, v11.4s\n"
- "add v21.4s, v21.4s, v15.4s\n"
- "sqadd v23.4s, v23.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v24.4s\n"
- "smin v21.4s, v21.4s, v12.4s\n"
- "and v25.16b, v20.16b, v18.16b\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "smax v21.4s, v21.4s, v24.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "uzp1 v16.16b, v16.16b, v21.16b\n"
- "add v23.4s, v23.4s, v15.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d16, [x28, x15]\n"
- "smin v23.4s, v23.4s, v12.4s\n"
- "sqadd v20.4s, v20.4s, v25.4s\n"
- "smax v23.4s, v23.4s, v24.4s\n"
- "srshl v20.4s, v20.4s, v18.4s\n"
- "add v20.4s, v20.4s, v15.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v24.4s\n"
- "uzp1 v23.16b, v23.16b, v20.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d23, [x27, x15]\n"
- "add x15, x15, #0x8\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d13, [x10, x14]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d19, [x9, x14]\n"
+ "str d18, [x28, x14]\n"
+ "str d9, [x27, x14]\n"
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
"ldr q13, [x19, #0x0]\n"
- "mov v17.16b, v13.16b\n"
- "ldr q19, [x19, #0x10]\n"
+ "add x14, x14, #0x8\n"
+ "ldr q26, [x19, #0x10]\n"
"add x19, x19, #0x20\n"
- "mov v16.16b, v13.16b\n"
"str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v23.16b, v13.16b\n"
- "ldr d0, [x16, #0x0]\n"
- "ssubl v0.8h, v0.8b, v9.8b\n"
- "mov v25.16b, v19.16b\n"
- "ldr d1, [x16, #0x8]\n"
- "mov v21.16b, v19.16b\n"
- "ldr d2, [x16, #0x10]\n"
- "ssubl v1.8h, v1.8b, v9.8b\n"
- "mov v20.16b, v19.16b\n"
- "ldr d3, [x16, #0x18]\n"
- "ldr d4, [x16, #0x20]\n"
- "ssubl v2.8h, v2.8b, v9.8b\n"
- "ldr d5, [x16, #0x28]\n"
- "ssubl v3.8h, v3.8b, v9.8b\n"
- "ldr d6, [x16, #0x30]\n"
- "ldr d7, [x16, #0x38]\n"
- "ssubl v4.8h, v4.8b, v9.8b\n"
- "ldr d8, [x16, #0x40]\n"
- "ssubl v5.8h, v5.8b, v9.8b\n"
- "ldp x23, x22, [x14, #0x0]\n"
- "ssubl v6.8h, v6.8b, v9.8b\n"
- "ldp x21, x20, [x14, #0x10]\n"
- "ssubl v7.8h, v7.8b, v9.8b\n"
- "ssubl v8.8h, v8.8b, v9.8b\n"
- "ldr x19, [x14, #0x20]\n"
- "ldr d31, [x23, x17]\n"
- "ssubl v31.8h, v31.8b, v14.8b\n"
- "ldr d30, [x22, x17]\n"
- "ldr d29, [x21, x17]\n"
- "ssubl v30.8h, v30.8b, v14.8b\n"
- "ldr d28, [x20, x17]\n"
- "ldr d27, [x19, x17]\n"
- "ssubl v29.8h, v29.8b, v14.8b\n"
- "ssubl v28.8h, v28.8b, v14.8b\n"
- "ssubl v27.8h, v27.8b, v14.8b\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v11.16b, v26.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v18.16b, v13.16b\n"
+ "mov v24.16b, v26.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v9.16b, v13.16b\n"
+ "mov v23.16b, v26.16b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldr x19, [x12, #0x20]\n"
+ "ldr d31, [x23, x15]\n"
+ "ssubl v4.8h, v4.8b, v12.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "ldr d30, [x22, x15]\n"
+ "ldr d29, [x21, x15]\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d28, [x20, x15]\n"
+ "ldr d27, [x19, x15]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "ssubl v31.8h, v31.8b, v22.8b\n"
+ "ssubl v30.8h, v30.8b, v22.8b\n"
+ "ssubl v29.8h, v29.8b, v22.8b\n"
+ "ssubl v28.8h, v28.8b, v22.8b\n"
+ "ssubl v27.8h, v27.8b, v22.8b\n"
"bgt 1b\n"
"2:" // Tail
"smlal v13.4s, v31.4h, v4.4h\n"
- "ldr x21, [x14, #0x28]\n"
- "tst x8, #0x7\n"
- "smlal2 v19.4s, v31.8h, v4.8h\n"
- "ldr x20, [x14, #0x30]\n"
- "smlal v17.4s, v31.4h, v3.4h\n"
- "ldr x26, [x14, #0x38]\n"
- "smlal2 v25.4s, v31.8h, v3.8h\n"
- "ldr x25, [x14, #0x40]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "ldr x19, [x14, #0x48]\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr x24, [x14, #0x50]\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "ldr x23, [x14, #0x58]\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x17]\n"
- "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal2 v26.4s, v31.8h, v4.8h\n"
+ "ldr x21, [x12, #0x28]\n"
+ "ldr x26, [x12, #0x38]\n"
+ "smlal v19.4s, v31.4h, v3.4h\n"
+ "smlal2 v11.4s, v31.8h, v3.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr x25, [x12, #0x40]\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "ldr x22, [x14, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "ldr d30, [x19, x17]\n"
- "ssubl v30.8h, v30.8b, v14.8b\n"
- "smlal v17.4s, v29.4h, v2.4h\n"
- "ldr x21, [x14, #0x68]\n"
- "smlal2 v25.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x17]\n"
- "ssubl v29.8h, v29.8b, v14.8b\n"
+ "smlal2 v26.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x12, #0x48]\n"
+ "ldr d30, [x19, x15]\n"
+ "smlal v19.4s, v29.4h, v2.4h\n"
+ "smlal2 v11.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "ssubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v24.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x12, #0x50]\n"
+ "ldr x23, [x12, #0x58]\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "smlal2 v23.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x15]\n"
+ "ssubl v31.8h, v31.8b, v22.8b\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "ldr x20, [x14, #0x70]\n"
- "smlal2 v19.4s, v28.8h, v5.8h\n"
- "ldr x19, [x14, #0x78]\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "ldr q26, [x13, #0x0]\n"
- "smlal2 v25.4s, v28.8h, v4.8h\n"
- "ldr q10, [x11, #0x0]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "ldr q11, [x13, #0x10]\n"
+ "smlal2 v26.4s, v28.8h, v5.8h\n"
+ "ssubl v30.8h, v30.8b, v22.8b\n"
+ "ldr x22, [x12, #0x60]\n"
+ "smlal v19.4s, v28.4h, v4.4h\n"
+ "smlal2 v11.4s, v28.8h, v4.8h\n"
+ "ldr x21, [x12, #0x68]\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v18.4s, v28.4h, v2.4h\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "ldr x19, [x12, #0x78]\n"
+ "ldr q21, [x13, #0x0]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x15]\n"
+ "ssubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v13.4s, v27.4h, v7.4h\n"
+ "smlal2 v26.4s, v27.8h, v7.8h\n"
+ "ldr q25, [x11, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "smlal v19.4s, v27.4h, v6.4h\n"
+ "smlal2 v11.4s, v27.8h, v6.8h\n"
+ "ldr q16, [x11, #0x10]\n"
+ "tst x8, #0x7\n"
+ "smlal v18.4s, v31.4h, v6.4h\n"
+ "smlal2 v24.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x15]\n"
+ "ssubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
"add x13, x13, #0x20\n"
- "smlal2 v21.4s, v28.8h, v2.8h\n"
- "ldr q18, [x11, #0x10]\n"
"add x11, x11, #0x20\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x17]\n"
- "ssubl v28.8h, v28.8b, v14.8b\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v21.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x17]\n"
- "ssubl v31.8h, v31.8b, v14.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v19.4s, v27.8h, v7.8h\n"
- "smlal v17.4s, v27.4h, v6.4h\n"
- "smlal2 v25.4s, v27.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "smlal v23.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x17]\n"
- "ssubl v29.8h, v29.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v25.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x17]\n"
- "ssubl v28.8h, v28.8b, v14.8b\n"
+ "smlal2 v26.4s, v28.8h, v1.8h\n"
+ "smlal v19.4s, v28.4h, v0.4h\n"
+ "smlal2 v11.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x15]\n"
+ "ssubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v24.4s, v27.8h, v4.8h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "ssubl v29.8h, v29.8b, v22.8b\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v25.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x17]\n"
- "ssubl v31.8h, v31.8b, v14.8b\n"
+ "smlal2 v26.4s, v31.8h, v2.8h\n"
+ "smlal v19.4s, v31.4h, v1.4h\n"
+ "smlal2 v11.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x15]\n"
+ "ssubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v18.4s, v30.4h, v5.4h\n"
+ "smlal v9.4s, v30.4h, v4.4h\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v19.4s, v30.8h, v8.8h\n"
- "smlal v17.4s, v30.4h, v7.4h\n"
- "smlal2 v25.4s, v30.8h, v7.8h\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v21.4s, v30.8h, v5.8h\n"
- "smlal v23.4s, v30.4h, v4.4h\n"
- "smlal2 v20.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x17]\n"
- "ssubl v30.8h, v30.8b, v14.8b\n"
+ "smlal2 v26.4s, v30.8h, v8.8h\n"
+ "smlal v19.4s, v30.4h, v7.4h\n"
+ "smlal2 v11.4s, v30.8h, v7.8h\n"
+ "smlal2 v24.4s, v30.8h, v5.8h\n"
+ "smlal2 v23.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x15]\n"
+ "ssubl v30.8h, v30.8b, v22.8b\n"
+ "smlal v18.4s, v29.4h, v0.4h\n"
+ "smlal v9.4s, v28.4h, v2.4h\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v19.4s, v29.8h, v3.8h\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v21.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x17]\n"
- "ssubl v29.8h, v29.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v5.4h\n"
- "smlal2 v25.4s, v28.8h, v5.8h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal2 v20.4s, v28.8h, v2.8h\n"
- "ldr d28, [x19, x17]\n"
- "add x17, x17, #0x8\n"
+ "smlal2 v26.4s, v29.8h, v3.8h\n"
+ "smlal2 v24.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "smlal2 v23.4s, v28.8h, v2.8h\n"
+ "ssubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v3.4h\n"
+ "smlal v9.4s, v30.4h, v5.4h\n"
+ "smlal v19.4s, v28.4h, v5.4h\n"
+ "smlal2 v11.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x19, x15]\n"
+ "ssubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v24.4s, v31.8h, v3.8h\n"
+ "smlal2 v23.4s, v30.8h, v5.8h\n"
+ "add x15, x15, #0x8\n"
+ "smlal v18.4s, v29.4h, v7.4h\n"
+ "smlal v9.4s, v29.4h, v6.4h\n"
+ "smlal2 v24.4s, v29.8h, v7.8h\n"
+ "smlal2 v23.4s, v29.8h, v6.8h\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "ssubl v28.8h, v28.8b, v14.8b\n"
- "smlal2 v19.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v3.8h\n"
- "smlal v17.4s, v30.4h, v8.4h\n"
- "smlal2 v25.4s, v30.8h, v8.8h\n"
- "smlal v23.4s, v30.4h, v5.4h\n"
- "smlal2 v20.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v21.4s, v29.8h, v7.8h\n"
- "smlal v23.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v29.8h, v6.8h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v21.4s, v28.8h, v8.8h\n"
- "smlal v23.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v13.4s, v13.4s, v26.4s\n"
- "sqrdmulh v19.4s, v19.4s, v11.4s\n"
- "sqrdmulh v17.4s, v17.4s, v26.4s\n"
- "sqrdmulh v25.4s, v25.4s, v11.4s\n"
- "and v22.16b, v13.16b, v10.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v28.16b, v19.16b, v18.16b\n"
- "and v3.16b, v17.16b, v10.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v6.16b, v25.16b, v18.16b\n"
- "sqrdmulh v16.4s, v16.4s, v26.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqrdmulh v21.4s, v21.4s, v11.4s\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v26.4s\n"
- "and v0.16b, v16.16b, v10.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v28.4s\n"
- "sqadd v17.4s, v17.4s, v3.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "and v29.16b, v21.16b, v18.16b\n"
+ "smlal v19.4s, v30.4h, v8.4h\n"
+ "sqdmulh v13.4s, v13.4s, v21.4s\n"
+ "smlal v18.4s, v28.4h, v8.4h\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "sqdmulh v19.4s, v19.4s, v21.4s\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "smlal2 v11.4s, v30.8h, v8.8h\n"
+ "sqdmulh v18.4s, v18.4s, v21.4s\n"
+ "smlal2 v24.4s, v28.8h, v8.8h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
+ "sqdmulh v9.4s, v9.4s, v21.4s\n"
+ "and v7.16b, v13.16b, v25.16b\n"
+ "sqdmulh v26.4s, v26.4s, v10.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqdmulh v11.4s, v11.4s, v10.4s\n"
+ "and v21.16b, v18.16b, v25.16b\n"
+ "sqdmulh v24.4s, v24.4s, v10.4s\n"
+ "and v20.16b, v9.16b, v25.16b\n"
+ "sqdmulh v23.4s, v23.4s, v10.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v29.16b, v26.16b, v16.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v10.16b, v11.16b, v16.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v31.16b, v24.16b, v16.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v23.16b, v16.16b\n"
+ "sqadd v13.4s, v13.4s, v7.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "add v13.4s, v13.4s, v15.4s\n"
- "srshl v19.4s, v19.4s, v18.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v18.4s\n"
- "smin v13.4s, v13.4s, v12.4s\n"
- "add v19.4s, v19.4s, v15.4s\n"
- "add v17.4s, v17.4s, v15.4s\n"
- "smax v13.4s, v13.4s, v24.4s\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "smin v17.4s, v17.4s, v12.4s\n"
- "add v25.4s, v25.4s, v15.4s\n"
- "smax v19.4s, v19.4s, v24.4s\n"
- "smax v17.4s, v17.4s, v24.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "uzp1 v13.16b, v13.16b, v19.16b\n"
- "sqadd v16.4s, v16.4s, v0.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v25.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v25.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "srshl v9.4s, v9.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "srshl v26.4s, v26.4s, v16.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "srshl v24.4s, v24.4s, v16.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "srshl v23.4s, v23.4s, v16.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "sqxtn2 v13.8h, v26.4s\n"
+ "sqxtn2 v19.8h, v11.4s\n"
+ "sqxtn2 v18.8h, v24.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v19.8h, v19.8h, v14.8h\n"
+ "sqadd v18.8h, v18.8h, v14.8h\n"
+ "sqadd v9.8h, v9.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smax v18.8h, v18.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smin v13.8h, v13.8h, v15.8h\n"
+ "smin v19.8h, v19.8h, v15.8h\n"
+ "smin v18.8h, v18.8h, v15.8h\n"
+ "smin v9.8h, v9.8h, v15.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x10, x15]\n"
- "smax v25.4s, v25.4s, v24.4s\n"
- "sqadd v21.4s, v21.4s, v29.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "and v3.16b, v23.16b, v10.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "uzp1 v17.16b, v17.16b, v25.16b\n"
- "add v16.4s, v16.4s, v15.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d17, [x9, x15]\n"
- "smin v16.4s, v16.4s, v12.4s\n"
- "sqrdmulh v20.4s, v20.4s, v11.4s\n"
- "add v21.4s, v21.4s, v15.4s\n"
- "sqadd v23.4s, v23.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v24.4s\n"
- "smin v21.4s, v21.4s, v12.4s\n"
- "and v25.16b, v20.16b, v18.16b\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "smax v21.4s, v21.4s, v24.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "uzp1 v16.16b, v16.16b, v21.16b\n"
- "add v23.4s, v23.4s, v15.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d16, [x28, x15]\n"
- "smin v23.4s, v23.4s, v12.4s\n"
- "sqadd v20.4s, v20.4s, v25.4s\n"
- "smax v23.4s, v23.4s, v24.4s\n"
- "srshl v20.4s, v20.4s, v18.4s\n"
- "add v20.4s, v20.4s, v15.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v24.4s\n"
- "uzp1 v23.16b, v23.16b, v20.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d23, [x27, x15]\n"
- "add x15, x15, #0x8\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d13, [x10, x14]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d19, [x9, x14]\n"
+ "str d18, [x28, x14]\n"
+ "str d9, [x27, x14]\n"
+ "add x14, x14, #0x8\n"
"beq 64f\n"
- "add x16, x16, #0x48\n"
+ "add x17, x17, #0x48\n"
"3:" // Oddments
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x8, #2, 5f\n"
"ld1 { v13.4s }, [x19], #0x10\n"
"tbz x8, #1, 4f\n"
- "ld1 { v19.d }[0], [x19], #0x8\n"
+ "ld1 { v26.d }[0], [x19], #0x8\n"
"tbz x8, #0, 7f\n"
- "ld1 { v19.s }[2], [x19]\n"
+ "ld1 { v26.s }[2], [x19]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x8, #0, 7f\n"
- "ld1 { v19.s }[0], [x19]\n"
+ "ld1 { v26.s }[0], [x19]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x8, #1, 6f\n"
@@ -609,38 +593,38 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 7f\n"
"ld1 { v13.s }[0], [x19]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "mov v17.16b, v13.16b\n"
- "ldr d0, [x16, #0x0]\n"
- "mov v25.16b, v19.16b\n"
- "ldr d1, [x16, #0x8]\n"
- "mov v16.16b, v13.16b\n"
- "ldr d2, [x16, #0x10]\n"
- "mov v21.16b, v19.16b\n"
- "ldr d3, [x16, #0x18]\n"
- "mov v23.16b, v13.16b\n"
- "ldr d4, [x16, #0x20]\n"
- "ssubl v0.8h, v0.8b, v9.8b\n"
- "mov v20.16b, v19.16b\n"
- "ldr d5, [x16, #0x28]\n"
- "ssubl v1.8h, v1.8b, v9.8b\n"
- "ldr d6, [x16, #0x30]\n"
- "ssubl v2.8h, v2.8b, v9.8b\n"
- "ldr d7, [x16, #0x38]\n"
- "ssubl v3.8h, v3.8b, v9.8b\n"
- "ldr d8, [x16, #0x40]\n"
- "ssubl v4.8h, v4.8b, v9.8b\n"
- "ldp x23, x22, [x14, #0x0]\n"
- "ssubl v5.8h, v5.8b, v9.8b\n"
- "ldp x21, x20, [x14, #0x10]\n"
- "ssubl v6.8h, v6.8b, v9.8b\n"
- "ssubl v7.8h, v7.8b, v9.8b\n"
- "ldr x19, [x14, #0x20]\n"
- "ssubl v8.8h, v8.8b, v9.8b\n"
- "add x23, x23, x17\n"
- "add x22, x22, x17\n"
- "add x21, x21, x17\n"
- "add x20, x20, x17\n"
- "add x19, x19, x17\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v11.16b, v26.16b\n"
+ "ldr d2, [x17, #0x10]\n"
+ "ldr d3, [x17, #0x18]\n"
+ "mov v18.16b, v13.16b\n"
+ "mov v24.16b, v26.16b\n"
+ "ldr d4, [x17, #0x20]\n"
+ "ldr d5, [x17, #0x28]\n"
+ "mov v9.16b, v13.16b\n"
+ "mov v23.16b, v26.16b\n"
+ "ldr d6, [x17, #0x30]\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "ldr x19, [x12, #0x20]\n"
+ "ssubl v4.8h, v4.8b, v12.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "add x19, x19, x15\n"
"tbz x8, #2, 9f\n"
"ld1 { v31.s }[0], [x23], #0x4\n"
"ld1 { v30.s }[0], [x22], #0x4\n"
@@ -690,33 +674,33 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"ld1 { v28.b }[0], [x20]\n"
"ld1 { v27.b }[0], [x19]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ldr x21, [x14, #0x28]\n"
- "ssubl v31.8h, v31.8b, v14.8b\n"
+ "ssubl v31.8h, v31.8b, v22.8b\n"
"smlal v13.4s, v31.4h, v4.4h\n"
- "ssubl v30.8h, v30.8b, v14.8b\n"
- "smlal2 v19.4s, v31.8h, v4.8h\n"
- "ssubl v29.8h, v29.8b, v14.8b\n"
- "smlal v17.4s, v31.4h, v3.4h\n"
- "ssubl v28.8h, v28.8b, v14.8b\n"
- "smlal2 v25.4s, v31.8h, v3.8h\n"
- "ssubl v27.8h, v27.8b, v14.8b\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "add x21, x21, x17\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "smlal2 v26.4s, v31.8h, v4.8h\n"
+ "ldr x21, [x12, #0x28]\n"
+ "smlal v19.4s, v31.4h, v3.4h\n"
+ "smlal2 v11.4s, v31.8h, v3.8h\n"
+ "ssubl v30.8h, v30.8b, v22.8b\n"
+ "add x21, x21, x15\n"
+ "ssubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v24.4s, v31.8h, v1.8h\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "smlal2 v23.4s, v31.8h, v0.8h\n"
+ "ssubl v28.8h, v28.8b, v22.8b\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "smlal v17.4s, v29.4h, v2.4h\n"
- "smlal2 v25.4s, v29.8h, v2.8h\n"
+ "smlal2 v26.4s, v30.8h, v0.8h\n"
+ "ssubl v27.8h, v27.8b, v22.8b\n"
+ "smlal v19.4s, v29.4h, v2.4h\n"
+ "smlal2 v11.4s, v29.8h, v2.8h\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v19.4s, v28.8h, v5.8h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v25.4s, v28.8h, v4.8h\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v21.4s, v28.8h, v2.8h\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal2 v26.4s, v28.8h, v5.8h\n"
+ "smlal v19.4s, v28.4h, v4.4h\n"
+ "smlal2 v11.4s, v28.8h, v4.8h\n"
+ "smlal v18.4s, v28.4h, v2.4h\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
"tbz x8, #2, 13f\n"
"ld1 { v31.s }[0], [x21], #0x4\n"
"tbz x8, #1, 12f\n"
@@ -738,19 +722,19 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 15f\n"
"ld1 { v31.b }[0], [x21]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
+ "ssubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v6.4h\n"
+ "smlal2 v24.4s, v31.8h, v6.8h\n"
+ "ldr x20, [x12, #0x30]\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "ldr x20, [x14, #0x30]\n"
- "ssubl v31.8h, v31.8b, v14.8b\n"
- "smlal2 v19.4s, v27.8h, v7.8h\n"
- "smlal v17.4s, v27.4h, v6.4h\n"
- "add x20, x20, x17\n"
- "smlal2 v25.4s, v27.8h, v6.8h\n"
- "smlal v23.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v21.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "smlal2 v26.4s, v27.8h, v7.8h\n"
+ "add x20, x20, x15\n"
+ "smlal v19.4s, v27.4h, v6.4h\n"
+ "smlal2 v11.4s, v27.8h, v6.8h\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal2 v24.4s, v27.8h, v4.8h\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
"tbz x8, #2, 17f\n"
"ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x8, #1, 16f\n"
@@ -772,11 +756,11 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 19f\n"
"ld1 { v29.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "ldr x26, [x14, #0x38]\n"
- "ssubl v29.8h, v29.8b, v14.8b\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "add x26, x26, x17\n"
+ "ssubl v29.8h, v29.8b, v22.8b\n"
+ "ldr x26, [x12, #0x38]\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "add x26, x26, x15\n"
"tbz x8, #2, 21f\n"
"ld1 { v28.s }[0], [x26], #0x4\n"
"tbz x8, #1, 20f\n"
@@ -798,13 +782,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 23f\n"
"ld1 { v28.b }[0], [x26]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "ldr x25, [x14, #0x40]\n"
- "ssubl v28.8h, v28.8b, v14.8b\n"
+ "ssubl v28.8h, v28.8b, v22.8b\n"
+ "ldr x25, [x12, #0x40]\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "add x25, x25, x17\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v25.4s, v28.8h, v0.8h\n"
+ "smlal2 v26.4s, v28.8h, v1.8h\n"
+ "smlal v19.4s, v28.4h, v0.4h\n"
+ "smlal2 v11.4s, v28.8h, v0.8h\n"
+ "add x25, x25, x15\n"
"tbz x8, #2, 25f\n"
"ld1 { v31.s }[0], [x25], #0x4\n"
"tbz x8, #1, 24f\n"
@@ -826,13 +810,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 27f\n"
"ld1 { v31.b }[0], [x25]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "ldr x19, [x14, #0x48]\n"
- "ssubl v31.8h, v31.8b, v14.8b\n"
+ "ssubl v31.8h, v31.8b, v22.8b\n"
+ "ldr x19, [x12, #0x48]\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "add x19, x19, x17\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v25.4s, v31.8h, v1.8h\n"
+ "smlal2 v26.4s, v31.8h, v2.8h\n"
+ "smlal v19.4s, v31.4h, v1.4h\n"
+ "smlal2 v11.4s, v31.8h, v1.8h\n"
+ "add x19, x19, x15\n"
"tbz x8, #2, 29f\n"
"ld1 { v30.s }[0], [x19], #0x4\n"
"tbz x8, #1, 28f\n"
@@ -854,17 +838,17 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 31f\n"
"ld1 { v30.b }[0], [x19]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr x24, [x14, #0x50]\n"
- "ssubl v30.8h, v30.8b, v14.8b\n"
+ "ssubl v30.8h, v30.8b, v22.8b\n"
+ "ldr x24, [x12, #0x50]\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v19.4s, v30.8h, v8.8h\n"
- "add x24, x24, x17\n"
- "smlal v17.4s, v30.4h, v7.4h\n"
- "smlal2 v25.4s, v30.8h, v7.8h\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v21.4s, v30.8h, v5.8h\n"
- "smlal v23.4s, v30.4h, v4.4h\n"
- "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "smlal2 v26.4s, v30.8h, v8.8h\n"
+ "smlal v19.4s, v30.4h, v7.4h\n"
+ "smlal2 v11.4s, v30.8h, v7.8h\n"
+ "add x24, x24, x15\n"
+ "smlal v18.4s, v30.4h, v5.4h\n"
+ "smlal2 v24.4s, v30.8h, v5.8h\n"
+ "smlal v9.4s, v30.4h, v4.4h\n"
+ "smlal2 v23.4s, v30.8h, v4.8h\n"
"tbz x8, #2, 33f\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
"tbz x8, #1, 32f\n"
@@ -886,13 +870,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 35f\n"
"ld1 { v29.b }[0], [x24]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "ldr x23, [x14, #0x58]\n"
- "ssubl v29.8h, v29.8b, v14.8b\n"
+ "ssubl v29.8h, v29.8b, v22.8b\n"
+ "ldr x23, [x12, #0x58]\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v19.4s, v29.8h, v3.8h\n"
- "add x23, x23, x17\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v21.4s, v29.8h, v0.8h\n"
+ "smlal2 v26.4s, v29.8h, v3.8h\n"
+ "smlal v18.4s, v29.4h, v0.4h\n"
+ "smlal2 v24.4s, v29.8h, v0.8h\n"
+ "add x23, x23, x15\n"
"tbz x8, #2, 37f\n"
"ld1 { v28.s }[0], [x23], #0x4\n"
"tbz x8, #1, 36f\n"
@@ -914,13 +898,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 39f\n"
"ld1 { v28.b }[0], [x23]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "ldr x22, [x14, #0x60]\n"
- "ssubl v28.8h, v28.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v5.4h\n"
- "smlal2 v25.4s, v28.8h, v5.8h\n"
- "add x22, x22, x17\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal2 v20.4s, v28.8h, v2.8h\n"
+ "ssubl v28.8h, v28.8b, v22.8b\n"
+ "ldr x22, [x12, #0x60]\n"
+ "smlal v19.4s, v28.4h, v5.4h\n"
+ "smlal2 v11.4s, v28.8h, v5.8h\n"
+ "smlal v9.4s, v28.4h, v2.4h\n"
+ "smlal2 v23.4s, v28.8h, v2.8h\n"
+ "add x22, x22, x15\n"
"tbz x8, #2, 41f\n"
"ld1 { v31.s }[0], [x22], #0x4\n"
"tbz x8, #1, 40f\n"
@@ -942,13 +926,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 43f\n"
"ld1 { v31.b }[0], [x22]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "ldr x21, [x14, #0x68]\n"
- "ssubl v31.8h, v31.8b, v14.8b\n"
+ "ssubl v31.8h, v31.8b, v22.8b\n"
+ "ldr x21, [x12, #0x68]\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v19.4s, v31.8h, v6.8h\n"
- "add x21, x21, x17\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v3.8h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "smlal v18.4s, v31.4h, v3.4h\n"
+ "smlal2 v24.4s, v31.8h, v3.8h\n"
+ "add x21, x21, x15\n"
"tbz x8, #2, 45f\n"
"ld1 { v30.s }[0], [x21], #0x4\n"
"tbz x8, #1, 44f\n"
@@ -970,13 +954,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 47f\n"
"ld1 { v30.b }[0], [x21]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ldr x20, [x14, #0x70]\n"
- "ssubl v30.8h, v30.8b, v14.8b\n"
- "smlal v17.4s, v30.4h, v8.4h\n"
- "smlal2 v25.4s, v30.8h, v8.8h\n"
- "add x20, x20, x17\n"
- "smlal v23.4s, v30.4h, v5.4h\n"
- "smlal2 v20.4s, v30.8h, v5.8h\n"
+ "ssubl v30.8h, v30.8b, v22.8b\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v19.4s, v30.4h, v8.4h\n"
+ "smlal2 v11.4s, v30.8h, v8.8h\n"
+ "smlal v9.4s, v30.4h, v5.4h\n"
+ "smlal2 v23.4s, v30.8h, v5.8h\n"
+ "add x20, x20, x15\n"
"tbz x8, #2, 49f\n"
"ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x8, #1, 48f\n"
@@ -998,13 +982,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 51f\n"
"ld1 { v29.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr x19, [x14, #0x78]\n"
- "ssubl v29.8h, v29.8b, v14.8b\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v21.4s, v29.8h, v7.8h\n"
- "add x19, x19, x17\n"
- "smlal v23.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v29.8h, v6.8h\n"
+ "ssubl v29.8h, v29.8b, v22.8b\n"
+ "ldr x19, [x12, #0x78]\n"
+ "smlal v18.4s, v29.4h, v7.4h\n"
+ "smlal2 v24.4s, v29.8h, v7.8h\n"
+ "smlal v9.4s, v29.4h, v6.4h\n"
+ "smlal2 v23.4s, v29.8h, v6.8h\n"
+ "add x19, x19, x15\n"
"tbz x8, #2, 53f\n"
"ld1 { v28.s }[0], [x19], #0x4\n"
"tbz x8, #1, 52f\n"
@@ -1026,160 +1010,150 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 55f\n"
"ld1 { v28.b }[0], [x19]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ssubl v28.8h, v28.8b, v14.8b\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v21.4s, v28.8h, v8.8h\n"
- "smlal v23.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "ssubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v18.4s, v28.4h, v8.4h\n"
+ "smlal2 v24.4s, v28.8h, v8.8h\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
"tbz x8, #2, 57f\n"
- "ld1 { v26.4s }, [x13], #0x10\n"
- "ld1 { v10.4s }, [x11], #0x10\n"
+ "ld1 { v21.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x11], #0x10\n"
"tbz x8, #1, 56f\n"
- "ld1 { v11.d }[0], [x13], #0x8\n"
- "ld1 { v18.d }[0], [x11], #0x8\n"
+ "ld1 { v10.d }[0], [x13], #0x8\n"
+ "ld1 { v16.d }[0], [x11], #0x8\n"
"tbz x8, #0, 59f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v18.s }[2], [x11]\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v16.s }[2], [x11]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x8, #0, 59f\n"
- "ld1 { v11.s }[0], [x13]\n"
- "ld1 { v18.s }[0], [x11]\n"
+ "ld1 { v10.s }[0], [x13]\n"
+ "ld1 { v16.s }[0], [x11]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
"tbz x8, #1, 58f\n"
- "ld1 { v26.d }[0], [x13], #0x8\n"
- "ld1 { v10.d }[0], [x11], #0x8\n"
+ "ld1 { v21.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x11], #0x8\n"
"tbz x8, #0, 59f\n"
- "ld1 { v26.s }[2], [x13]\n"
- "ld1 { v10.s }[2], [x11]\n"
+ "ld1 { v21.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x11]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x8, #0, 59f\n"
- "ld1 { v26.s }[0], [x13]\n"
- "ld1 { v10.s }[0], [x11]\n"
+ "ld1 { v21.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x11]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v26.4s\n"
- "add x10, x10, x15\n"
- "sqrdmulh v19.4s, v19.4s, v11.4s\n"
- "add x9, x9, x15\n"
- "sqrdmulh v17.4s, v17.4s, v26.4s\n"
- "add x28, x28, x15\n"
- "sqrdmulh v25.4s, v25.4s, v11.4s\n"
- "add x27, x27, x15\n"
- "sqrdmulh v16.4s, v16.4s, v26.4s\n"
- "and v22.16b, v13.16b, v10.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v28.16b, v19.16b, v18.16b\n"
- "and v3.16b, v17.16b, v10.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v6.16b, v25.16b, v18.16b\n"
- "and v0.16b, v16.16b, v10.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqrdmulh v21.4s, v21.4s, v11.4s\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v26.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v20.4s, v20.4s, v11.4s\n"
- "sqadd v19.4s, v19.4s, v28.4s\n"
- "sqadd v17.4s, v17.4s, v3.4s\n"
- "srshl v13.4s, v13.4s, v10.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "srshl v19.4s, v19.4s, v18.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "add v13.4s, v13.4s, v15.4s\n"
- "srshl v25.4s, v25.4s, v18.4s\n"
- "add v19.4s, v19.4s, v15.4s\n"
- "smin v13.4s, v13.4s, v12.4s\n"
- "add v17.4s, v17.4s, v15.4s\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "smax v13.4s, v13.4s, v24.4s\n"
- "smin v17.4s, v17.4s, v12.4s\n"
- "smax v19.4s, v19.4s, v24.4s\n"
- "add v25.4s, v25.4s, v15.4s\n"
- "smax v17.4s, v17.4s, v24.4s\n"
- "uzp1 v13.16b, v13.16b, v19.16b\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "sqadd v16.4s, v16.4s, v0.4s\n"
- "smax v25.4s, v25.4s, v24.4s\n"
- "and v29.16b, v21.16b, v18.16b\n"
+ "sqdmulh v13.4s, v13.4s, v21.4s\n"
+ "sqdmulh v19.4s, v19.4s, v21.4s\n"
+ "add x10, x10, x14\n"
+ "add x9, x9, x14\n"
+ "sqdmulh v18.4s, v18.4s, v21.4s\n"
+ "sqdmulh v9.4s, v9.4s, v21.4s\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "and v7.16b, v13.16b, v25.16b\n"
+ "sqdmulh v26.4s, v26.4s, v10.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqdmulh v11.4s, v11.4s, v10.4s\n"
+ "and v21.16b, v18.16b, v25.16b\n"
+ "sqdmulh v24.4s, v24.4s, v10.4s\n"
+ "and v20.16b, v9.16b, v25.16b\n"
+ "sqdmulh v23.4s, v23.4s, v10.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v29.16b, v26.16b, v16.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v10.16b, v11.16b, v16.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v31.16b, v24.16b, v16.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v23.16b, v16.16b\n"
+ "sqadd v13.4s, v13.4s, v7.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "uzp1 v17.16b, v17.16b, v25.16b\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "and v3.16b, v23.16b, v10.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "add v16.4s, v16.4s, v15.4s\n"
- "sqadd v21.4s, v21.4s, v29.4s\n"
- "and v25.16b, v20.16b, v18.16b\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "smin v16.4s, v16.4s, v12.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v24.4s\n"
- "add v21.4s, v21.4s, v15.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v20.4s, v20.4s, v25.4s\n"
- "smin v21.4s, v21.4s, v12.4s\n"
- "add v23.4s, v23.4s, v15.4s\n"
- "srshl v20.4s, v20.4s, v18.4s\n"
- "smax v21.4s, v21.4s, v24.4s\n"
- "smin v23.4s, v23.4s, v12.4s\n"
- "uzp1 v16.16b, v16.16b, v21.16b\n"
- "add v20.4s, v20.4s, v15.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "smax v23.4s, v23.4s, v24.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v24.4s\n"
- "uzp1 v23.16b, v23.16b, v20.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v25.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v25.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "srshl v9.4s, v9.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "srshl v26.4s, v26.4s, v16.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "srshl v24.4s, v24.4s, v16.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "srshl v23.4s, v23.4s, v16.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "sqxtn2 v13.8h, v26.4s\n"
+ "sqxtn2 v19.8h, v11.4s\n"
+ "sqxtn2 v18.8h, v24.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v19.8h, v19.8h, v14.8h\n"
+ "sqadd v18.8h, v18.8h, v14.8h\n"
+ "sqadd v9.8h, v9.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smax v18.8h, v18.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smin v13.8h, v13.8h, v15.8h\n"
+ "smin v19.8h, v19.8h, v15.8h\n"
+ "smin v18.8h, v18.8h, v15.8h\n"
+ "smin v9.8h, v9.8h, v15.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
"tbz x8, #2, 61f\n"
"st1 { v13.s }[0], [x10], #0x4\n"
- "st1 { v17.s }[0], [x9], #0x4\n"
- "st1 { v16.s }[0], [x28], #0x4\n"
- "st1 { v23.s }[0], [x27], #0x4\n"
+ "st1 { v19.s }[0], [x9], #0x4\n"
+ "st1 { v18.s }[0], [x28], #0x4\n"
+ "st1 { v9.s }[0], [x27], #0x4\n"
"tbz x8, #1, 60f\n"
"st1 { v13.h }[2], [x10], #0x2\n"
- "st1 { v17.h }[2], [x9], #0x2\n"
- "st1 { v16.h }[2], [x28], #0x2\n"
- "st1 { v23.h }[2], [x27], #0x2\n"
+ "st1 { v19.h }[2], [x9], #0x2\n"
+ "st1 { v18.h }[2], [x28], #0x2\n"
+ "st1 { v9.h }[2], [x27], #0x2\n"
"tbz x8, #0, 63f\n"
"st1 { v13.b }[6], [x10], #0x1\n"
- "st1 { v17.b }[6], [x9], #0x1\n"
- "st1 { v16.b }[6], [x28], #0x1\n"
- "st1 { v23.b }[6], [x27], #0x1\n"
+ "st1 { v19.b }[6], [x9], #0x1\n"
+ "st1 { v18.b }[6], [x28], #0x1\n"
+ "st1 { v9.b }[6], [x27], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
"tbz x8, #0, 63f\n"
"st1 { v13.b }[4], [x10], #0x1\n"
- "st1 { v17.b }[4], [x9], #0x1\n"
- "st1 { v16.b }[4], [x28], #0x1\n"
- "st1 { v23.b }[4], [x27], #0x1\n"
+ "st1 { v19.b }[4], [x9], #0x1\n"
+ "st1 { v18.b }[4], [x28], #0x1\n"
+ "st1 { v9.b }[4], [x27], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
"tbz x8, #1, 62f\n"
"st1 { v13.h }[0], [x10], #0x2\n"
- "st1 { v17.h }[0], [x9], #0x2\n"
- "st1 { v16.h }[0], [x28], #0x2\n"
- "st1 { v23.h }[0], [x27], #0x2\n"
+ "st1 { v19.h }[0], [x9], #0x2\n"
+ "st1 { v18.h }[0], [x28], #0x2\n"
+ "st1 { v9.h }[0], [x27], #0x2\n"
"tbz x8, #0, 63f\n"
"st1 { v13.b }[2], [x10], #0x1\n"
- "st1 { v17.b }[2], [x9], #0x1\n"
- "st1 { v16.b }[2], [x28], #0x1\n"
- "st1 { v23.b }[2], [x27], #0x1\n"
+ "st1 { v19.b }[2], [x9], #0x1\n"
+ "st1 { v18.b }[2], [x28], #0x1\n"
+ "st1 { v9.b }[2], [x27], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x8, #0, 63f\n"
"st1 { v13.b }[0], [x10], #0x1\n"
- "st1 { v17.b }[0], [x9], #0x1\n"
- "st1 { v16.b }[0], [x28], #0x1\n"
- "st1 { v23.b }[0], [x27], #0x1\n"
+ "st1 { v19.b }[0], [x9], #0x1\n"
+ "st1 { v18.b }[0], [x28], #0x1\n"
+ "st1 { v9.b }[0], [x27], #0x1\n"
"63:" // Oddments: Bit 2: End
-
"64:" // End
-
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index b20759eec4..6032f8f2fb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,37 +36,24 @@ namespace depthwise {
void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
-struct a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+class a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size;
+ a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
- kern_type kernel = a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
- a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
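
The header change above is the shape of the refactor applied to every kernel descriptor in this patch: the plain struct of typedefs, constexpr geometry and raw function pointers becomes a class derived from DepthwiseDepthfirstStrategy, which takes the geometry once through its constructor and exposes the kernel through virtual getters, so the shared depthfirst driver can work against a single interface. A toy, self-contained analogue of the before/after (the Strategy base below is an illustrative stand-in, not the library's DepthwiseDepthfirstStrategy):

  #include <cstdio>

  using KernelFn = void (*)(unsigned int n_channels);

  // Illustrative stand-in for the strategy base class: geometry that the old
  // structs repeated as constexpr statics is captured once, so derived
  // quantities such as the input tile size need not be spelled out per kernel.
  class Strategy
  {
    unsigned int m_out_rows, m_out_cols;
    unsigned int m_kern_rows, m_kern_cols;
    unsigned int m_stride_rows, m_stride_cols;

    public:
    Strategy(unsigned int out_rows, unsigned int out_cols,
             unsigned int kern_rows, unsigned int kern_cols,
             unsigned int stride_rows, unsigned int stride_cols)
    : m_out_rows(out_rows), m_out_cols(out_cols),
      m_kern_rows(kern_rows), m_kern_cols(kern_cols),
      m_stride_rows(stride_rows), m_stride_cols(stride_cols)
    {
    }

    virtual ~Strategy() = default;
    virtual KernelFn get_kernel(void) const = 0;

    unsigned int input_rows(void) const { return m_kern_rows + (m_out_rows - 1) * m_stride_rows; }
    unsigned int input_cols(void) const { return m_kern_cols + (m_out_cols - 1) * m_stride_cols; }
  };

  static void example_kernel(unsigned int n_channels)
  {
    std::printf("kernel invoked for %u channels\n", n_channels);
  }

  // Analogue of the class in the hunk above: geometry goes to the parent,
  // and the kernel function is reached through a virtual getter.
  class ExampleStrategy : public Strategy
  {
    public:
    ExampleStrategy() : Strategy(2, 2, 3, 3, 2, 2) {}
    KernelFn get_kernel(void) const override { return example_kernel; }
  };

  int main(void)
  {
    ExampleStrategy strat;
    std::printf("input tile: %u x %u\n", strat.input_rows(), strat.input_cols());  // 5 x 5
    strat.get_kernel()(16);
    return 0;
  }

Note how the 5x5 input tile that the old struct spelled out as input_rows/input_cols now falls out of the geometry held by the base, which is one of the duplications the patch removes.
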
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 3b3d9c8946..5499392d6d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const int8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -100,75 +100,75 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
- "mov x5, #0x0\n"
- "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x7, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "add x8, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "lsr x16, x4, #0x3\n"
- "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v12.16b }, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1r { v11.4s }, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v19.4s }, [x20]\n"
- "ld1r { v14.4s }, [x19]\n"
- "ldp x14, x13, [x21, #0x0]\n"
- "ldp x12, x11, [x21, #0x10]\n"
+ "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "add x24, x19, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x19, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x19, %[offsetof_Requantize32_minval]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x19, x19, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.16b }, [x24]\n"
+ "ld1r { v13.16b }, [x23]\n"
+ "lsr x16, x8, #0x3\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "mov x15, #0x0\n"
+ "mov x14, #0x0\n"
+ "ld1r { v14.8h }, [x19]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x10, x9, [x22, #0x0]\n"
+ "ldp x28, x27, [x22, #0x10]\n"
"cbz x16, 3f\n"
- "subs x16, x16, #0x1\n"
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
"ldr q15, [x19, #0x0]\n"
- "mov v20.16b, v15.16b\n"
+ "subs x16, x16, #0x1\n"
+ "mov v9.16b, v15.16b\n"
"ldr q10, [x19, #0x10]\n"
"add x19, x19, #0x20\n"
- "mov v16.16b, v15.16b\n"
"str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v17.16b, v15.16b\n"
- "ldr d0, [x6, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v16.16b, v10.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v21.16b, v10.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v18.16b, v10.16b\n"
"ssubl v0.8h, v0.8b, v13.8b\n"
- "mov v23.16b, v10.16b\n"
- "ldr d1, [x6, #0x8]\n"
- "mov v22.16b, v10.16b\n"
- "ldr d2, [x6, #0x10]\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
"ssubl v1.8h, v1.8b, v13.8b\n"
- "mov v18.16b, v10.16b\n"
- "ldr d3, [x6, #0x18]\n"
- "ldr d4, [x6, #0x20]\n"
"ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr d5, [x6, #0x28]\n"
+ "ldp x26, x25, [x12, #0x0]\n"
+ "ldp x24, x23, [x12, #0x10]\n"
"ssubl v3.8h, v3.8b, v13.8b\n"
- "ldr d6, [x6, #0x30]\n"
- "ldr d7, [x6, #0x38]\n"
"ssubl v4.8h, v4.8b, v13.8b\n"
- "ldr d8, [x6, #0x40]\n"
+ "ldp x22, x21, [x12, #0x20]\n"
+ "ldp x20, x19, [x12, #0x30]\n"
"ssubl v5.8h, v5.8b, v13.8b\n"
- "ldp x26, x25, [x8, #0x0]\n"
"ssubl v6.8h, v6.8b, v13.8b\n"
- "ldp x24, x23, [x8, #0x10]\n"
+ "ldr d31, [x26, x15]\n"
+ "ldr d30, [x25, x15]\n"
"ssubl v7.8h, v7.8b, v13.8b\n"
"ssubl v8.8h, v8.8b, v13.8b\n"
- "ldp x22, x21, [x8, #0x20]\n"
- "ldp x20, x19, [x8, #0x30]\n"
- "ldr d31, [x26, x5]\n"
+ "ldr d29, [x24, x15]\n"
+ "ldr d28, [x23, x15]\n"
"ssubl v31.8h, v31.8b, v12.8b\n"
- "ldr d30, [x25, x5]\n"
- "ldr d29, [x24, x5]\n"
"ssubl v30.8h, v30.8b, v12.8b\n"
- "ldr d28, [x23, x5]\n"
- "ldr d27, [x22, x5]\n"
+ "ldr d27, [x22, x15]\n"
+ "ldr d26, [x21, x15]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr d26, [x21, x5]\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x20, x5]\n"
- "ldr d24, [x19, x5]\n"
+ "ldr d25, [x20, x15]\n"
+ "ldr d24, [x19, x15]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
@@ -176,259 +176,251 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"beq 2f\n"
"1:" // Loop
"smlal v15.4s, v31.4h, v8.4h\n"
- "ldr x23, [x8, #0x40]\n"
- "add x6, x6, #0x48\n"
"smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x22, [x8, #0x48]\n"
- "subs x16, x16, #0x1\n"
- "smlal v20.4s, v31.4h, v6.4h\n"
- "ldr x21, [x8, #0x50]\n"
- "smlal2 v23.4s, v31.8h, v6.8h\n"
- "ldr x20, [x8, #0x58]\n"
- "smlal v16.4s, v31.4h, v2.4h\n"
- "ldr x19, [x8, #0x60]\n"
- "smlal2 v22.4s, v31.8h, v2.8h\n"
- "ldr x10, [x8, #0x68]\n"
- "smlal v17.4s, v31.4h, v0.4h\n"
- "ldr x9, [x8, #0x70]\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr x28, [x8, #0x78]\n"
+ "ldr x24, [x12, #0x40]\n"
+ "ldr x23, [x12, #0x48]\n"
+ "smlal v9.4s, v31.4h, v6.4h\n"
+ "smlal2 v16.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x12, #0x50]\n"
+ "ldr x19, [x12, #0x58]\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "ldr x27, [x8, #0x80]\n"
"smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x26, [x8, #0x88]\n"
- "smlal v20.4s, v28.4h, v1.4h\n"
- "ldr x25, [x8, #0x90]\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x5]\n"
+ "ldr x22, [x12, #0x78]\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x23, x15]\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "ldr x24, [x8, #0x98]\n"
"smlal2 v10.4s, v29.8h, v1.8h\n"
- "ldr d29, [x23, x5]\n"
+ "ldr d29, [x24, x15]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "ldr x23, [x8, #0xa0]\n"
- "smlal2 v23.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x5]\n"
+ "smlal v9.4s, v27.4h, v2.4h\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x15]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "ldr x22, [x8, #0xa8]\n"
"smlal2 v10.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x5]\n"
+ "ldr d26, [x19, x15]\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v9.4s, v24.4h, v0.4h\n"
+ "smlal2 v16.4s, v24.8h, v0.8h\n"
+ "ldr x21, [x12, #0x80]\n"
+ "ldr x19, [x12, #0x68]\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "ldr x21, [x8, #0xb0]\n"
"smlal2 v10.4s, v25.8h, v4.8h\n"
- "ldr d25, [x19, x5]\n"
+ "ldr d25, [x20, x15]\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
+ "smlal2 v16.4s, v29.8h, v4.8h\n"
+ "ldr x20, [x12, #0x88]\n"
+ "ldr d29, [x19, x15]\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "ldr x20, [x8, #0xb8]\n"
"smlal2 v10.4s, v24.8h, v2.8h\n"
- "ldr x19, [x8, #0xc0]\n"
- "smlal v20.4s, v24.4h, v0.4h\n"
- "ldr q21, [x17, #0x0]\n"
- "smlal2 v23.4s, v24.8h, v0.8h\n"
- "ldr d24, [x9, x5]\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal v20.4s, v29.4h, v4.4h\n"
- "ldr q30, [x15, #0x0]\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x10, x5]\n"
+ "ldr x19, [x12, #0x70]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v28.4h, v5.4h\n"
- "ldr q31, [x17, #0x10]\n"
- "smlal2 v23.4s, v28.8h, v5.8h\n"
- "ldr d28, [x27, x5]\n"
- "add x17, x17, #0x20\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v16.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x15]\n"
+ "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v22.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ldr x24, [x12, #0x98]\n"
+ "ldr d24, [x19, x15]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "ldr q9, [x15, #0x10]\n"
- "add x15, x15, #0x20\n"
"smlal2 v10.4s, v27.8h, v5.8h\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "smlal v20.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
- "ldr d27, [x28, x5]\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x23, [x12, #0x90]\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x22, x15]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v16.4s, v26.4h, v3.4h\n"
- "smlal2 v22.4s, v26.8h, v3.8h\n"
- "ldr d26, [x26, x5]\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal v22.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x12, #0xa8]\n"
+ "ldr x19, [x12, #0xa0]\n"
+ "smlal2 v21.4s, v26.8h, v3.8h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr d26, [x20, x15]\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v23.4s, v27.4h, v4.4h\n"
+ "smlal v22.4s, v25.4h, v0.4h\n"
+ "ldr x21, [x12, #0xb0]\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x19, x15]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
"smlal v15.4s, v25.4h, v6.4h\n"
+ "ldr x19, [x12, #0xc0]\n"
+ "ldr q19, [x13, #0x0]\n"
"smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v16.4s, v25.4h, v0.4h\n"
- "smlal2 v22.4s, v25.8h, v0.8h\n"
- "ldr d25, [x25, x5]\n"
+ "smlal v22.4s, v29.4h, v4.4h\n"
+ "ldr d25, [x23, x15]\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v16.4s, v29.4h, v4.4h\n"
- "smlal2 v22.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x5]\n"
+ "smlal2 v21.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v23.4s, v26.4h, v5.4h\n"
"smlal v15.4s, v24.4h, v7.4h\n"
+ "ldr q0, [x11, #0x0]\n"
+ "ldr q4, [x13, #0x10]\n"
"smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v16.4s, v24.4h, v1.4h\n"
- "smlal2 v22.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x5]\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "ldr d27, [x23, x5]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v20.4s, v28.4h, v7.4h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "smlal v17.4s, v28.4h, v1.4h\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
- "smlal v16.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x5]\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v17.4s, v26.4h, v5.4h\n"
+ "smlal v22.4s, v24.4h, v1.4h\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "ldr q31, [x11, #0x10]\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x15]\n"
"smlal2 v18.4s, v26.8h, v5.8h\n"
- "ldr d26, [x21, x5]\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v20.4s, v29.4h, v8.4h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "smlal v17.4s, v29.4h, v2.4h\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "ldr d26, [x21, x15]\n"
"smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr d29, [x19, x5]\n"
- "add x5, x5, #0x8\n"
- "smlal v16.4s, v27.4h, v7.4h\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v22.4s, v27.8h, v7.8h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v16.4s, v24.4h, v5.4h\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v22.4s, v25.4h, v6.4h\n"
+ "smlal v23.4s, v24.4h, v3.4h\n"
+ "and v30.16b, v15.16b, v0.16b\n"
+ "add x17, x17, #0x48\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "smlal2 v16.4s, v28.8h, v7.8h\n"
+ "sqdmulh v10.4s, v10.4s, v4.4s\n"
+ "subs x16, x16, #0x1\n"
+ "smlal2 v21.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x20, x15]\n"
"smlal2 v18.4s, v24.8h, v3.8h\n"
- "sqrdmulh v15.4s, v15.4s, v21.4s\n"
- "smlal2 v22.4s, v24.8h, v5.8h\n"
- "smlal v17.4s, v26.4h, v7.4h\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v22.4s, v27.4h, v7.4h\n"
+ "smlal v23.4s, v26.4h, v7.4h\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "add x13, x13, #0x20\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v16.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x19, x15]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal2 v21.4s, v27.8h, v7.8h\n"
"smlal2 v18.4s, v26.8h, v7.8h\n"
- "smlal v16.4s, v25.4h, v8.4h\n"
- "smlal2 v22.4s, v25.8h, v8.8h\n"
- "smlal v17.4s, v25.4h, v6.4h\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "add x15, x15, #0x8\n"
+ "smlal v22.4s, v24.4h, v5.4h\n"
+ "smlal v23.4s, v25.4h, v6.4h\n"
+ "and v28.16b, v9.16b, v0.16b\n"
+ "add x11, x11, #0x20\n"
+ "smlal2 v21.4s, v24.8h, v5.8h\n"
"smlal2 v18.4s, v25.8h, v6.8h\n"
- "and v26.16b, v15.16b, v30.16b\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "smlal v17.4s, v29.4h, v8.4h\n"
+ "sqdmulh v16.4s, v16.4s, v4.4s\n"
+ "smlal v22.4s, v25.4h, v8.4h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "sqdmulh v22.4s, v22.4s, v19.4s\n"
+ "smlal2 v21.4s, v25.8h, v8.8h\n"
"smlal2 v18.4s, v29.8h, v8.8h\n"
- "sqrdmulh v10.4s, v10.4s, v31.4s\n"
- "sqrdmulh v20.4s, v20.4s, v21.4s\n"
- "sqrdmulh v23.4s, v23.4s, v31.4s\n"
- "sqrdmulh v16.4s, v16.4s, v21.4s\n"
- "sqadd v15.4s, v15.4s, v26.4s\n"
- "and v8.16b, v10.16b, v9.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v30.4s\n"
- "and v4.16b, v20.16b, v30.16b\n"
+ "sqdmulh v23.4s, v23.4s, v19.4s\n"
+ "and v29.16b, v22.16b, v0.16b\n"
+ "sqdmulh v21.4s, v21.4s, v4.4s\n"
+ "and v20.16b, v23.16b, v0.16b\n"
+ "sqdmulh v18.4s, v18.4s, v4.4s\n"
+ "and v19.16b, v10.16b, v31.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v31.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v5.16b, v21.16b, v31.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v26.16b, v18.16b, v31.16b\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v28.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "and v2.16b, v23.16b, v9.16b\n"
- "and v1.16b, v16.16b, v30.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "add v15.4s, v15.4s, v11.4s\n"
- "sqadd v10.4s, v10.4s, v8.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "smin v15.4s, v15.4s, v14.4s\n"
- "srshl v10.4s, v10.4s, v9.4s\n"
- "sqadd v23.4s, v23.4s, v2.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
- "srshl v20.4s, v20.4s, v30.4s\n"
- "add v10.4s, v10.4s, v11.4s\n"
- "srshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v1.4s\n"
- "smin v10.4s, v10.4s, v14.4s\n"
- "add v20.4s, v20.4s, v11.4s\n"
- "add v23.4s, v23.4s, v11.4s\n"
- "smax v10.4s, v10.4s, v19.4s\n"
- "smin v20.4s, v20.4s, v14.4s\n"
- "smin v23.4s, v23.4s, v14.4s\n"
- "uzp1 v15.16b, v15.16b, v10.16b\n"
- "smax v20.4s, v20.4s, v19.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v20.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v0.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "sqadd v18.4s, v18.4s, v26.4s\n"
+ "srshl v10.4s, v10.4s, v31.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v22.4h, v22.4s\n"
+ "srshl v18.4s, v18.4s, v31.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v15.8h, v10.4s\n"
+ "sqxtn2 v9.8h, v16.4s\n"
+ "sqxtn2 v22.8h, v21.4s\n"
+ "sqxtn2 v23.8h, v18.4s\n"
+ "sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v9.8h, v9.8h, v11.8h\n"
+ "sqadd v22.8h, v22.8h, v11.8h\n"
+ "sqadd v23.8h, v23.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smax v22.8h, v22.8h, v17.8h\n"
+ "smax v23.8h, v23.8h, v17.8h\n"
+ "smin v15.8h, v15.8h, v14.8h\n"
+ "smin v9.8h, v9.8h, v14.8h\n"
+ "smin v22.8h, v22.8h, v14.8h\n"
+ "smin v23.8h, v23.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x14, x7]\n"
- "smax v23.4s, v23.4s, v19.4s\n"
- "srshl v16.4s, v16.4s, v30.4s\n"
- "and v24.16b, v22.16b, v9.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "uzp1 v20.16b, v20.16b, v23.16b\n"
- "add v16.4s, v16.4s, v11.4s\n"
- "sqrdmulh v17.4s, v17.4s, v21.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d20, [x13, x7]\n"
- "smin v16.4s, v16.4s, v14.4s\n"
- "sqrdmulh v18.4s, v18.4s, v31.4s\n"
- "sqadd v22.4s, v22.4s, v24.4s\n"
- "and v2.16b, v17.16b, v30.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "smax v16.4s, v16.4s, v19.4s\n"
- "srshl v22.4s, v22.4s, v9.4s\n"
- "and v31.16b, v18.16b, v9.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "add v22.4s, v22.4s, v11.4s\n"
- "sqadd v17.4s, v17.4s, v2.4s\n"
- "smin v22.4s, v22.4s, v14.4s\n"
- "srshl v17.4s, v17.4s, v30.4s\n"
- "sqadd v18.4s, v18.4s, v31.4s\n"
- "smax v22.4s, v22.4s, v19.4s\n"
- "uzp1 v16.16b, v16.16b, v22.16b\n"
- "add v17.4s, v17.4s, v11.4s\n"
- "srshl v18.4s, v18.4s, v9.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d16, [x12, x7]\n"
- "smin v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v11.4s\n"
- "smax v17.4s, v17.4s, v19.4s\n"
- "smin v18.4s, v18.4s, v14.4s\n"
- "smax v18.4s, v18.4s, v19.4s\n"
- "uzp1 v17.16b, v17.16b, v18.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d17, [x11, x7]\n"
- "add x7, x7, #0x8\n"
+ "str d15, [x10, x14]\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str d9, [x9, x14]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d22, [x28, x14]\n"
+ "str d23, [x27, x14]\n"
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
"ldr q15, [x19, #0x0]\n"
- "mov v20.16b, v15.16b\n"
+ "add x14, x14, #0x8\n"
"ldr q10, [x19, #0x10]\n"
"add x19, x19, #0x20\n"
- "mov v16.16b, v15.16b\n"
"str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v17.16b, v15.16b\n"
- "ldr d0, [x6, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v9.16b, v15.16b\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v22.16b, v15.16b\n"
+ "mov v21.16b, v10.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
"ssubl v0.8h, v0.8b, v13.8b\n"
- "mov v23.16b, v10.16b\n"
- "ldr d1, [x6, #0x8]\n"
- "mov v22.16b, v10.16b\n"
- "ldr d2, [x6, #0x10]\n"
"ssubl v1.8h, v1.8b, v13.8b\n"
- "mov v18.16b, v10.16b\n"
- "ldr d3, [x6, #0x18]\n"
- "ldr d4, [x6, #0x20]\n"
+ "ldp x26, x25, [x12, #0x0]\n"
+ "ldp x24, x23, [x12, #0x10]\n"
"ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr d5, [x6, #0x28]\n"
"ssubl v3.8h, v3.8b, v13.8b\n"
- "ldr d6, [x6, #0x30]\n"
- "ldr d7, [x6, #0x38]\n"
+ "ldp x22, x21, [x12, #0x20]\n"
+ "ldp x20, x19, [x12, #0x30]\n"
"ssubl v4.8h, v4.8b, v13.8b\n"
- "ldr d8, [x6, #0x40]\n"
"ssubl v5.8h, v5.8b, v13.8b\n"
- "ldp x26, x25, [x8, #0x0]\n"
+ "ldr d31, [x26, x15]\n"
+ "ldr d30, [x25, x15]\n"
"ssubl v6.8h, v6.8b, v13.8b\n"
- "ldp x24, x23, [x8, #0x10]\n"
"ssubl v7.8h, v7.8b, v13.8b\n"
+ "ldr d29, [x24, x15]\n"
+ "ldr d28, [x23, x15]\n"
"ssubl v8.8h, v8.8b, v13.8b\n"
- "ldp x22, x21, [x8, #0x20]\n"
- "ldp x20, x19, [x8, #0x30]\n"
- "ldr d31, [x26, x5]\n"
"ssubl v31.8h, v31.8b, v12.8b\n"
- "ldr d30, [x25, x5]\n"
- "ldr d29, [x24, x5]\n"
+ "ldr d27, [x22, x15]\n"
+ "ldr d26, [x21, x15]\n"
"ssubl v30.8h, v30.8b, v12.8b\n"
- "ldr d28, [x23, x5]\n"
- "ldr d27, [x22, x5]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "ldr d26, [x21, x5]\n"
+ "ldr d25, [x20, x15]\n"
+ "ldr d24, [x19, x15]\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x20, x5]\n"
- "ldr d24, [x19, x5]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
@@ -436,275 +428,267 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"bgt 1b\n"
"2:" // Tail
"smlal v15.4s, v31.4h, v8.4h\n"
- "ldr x23, [x8, #0x40]\n"
- "tst x4, #0x7\n"
"smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x22, [x8, #0x48]\n"
- "smlal v20.4s, v31.4h, v6.4h\n"
- "ldr x21, [x8, #0x50]\n"
- "smlal2 v23.4s, v31.8h, v6.8h\n"
- "ldr x20, [x8, #0x58]\n"
- "smlal v16.4s, v31.4h, v2.4h\n"
- "ldr x19, [x8, #0x60]\n"
- "smlal2 v22.4s, v31.8h, v2.8h\n"
- "ldr x10, [x8, #0x68]\n"
- "smlal v17.4s, v31.4h, v0.4h\n"
- "ldr x9, [x8, #0x70]\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr x28, [x8, #0x78]\n"
+ "ldr x24, [x12, #0x40]\n"
+ "ldr x23, [x12, #0x48]\n"
+ "smlal v9.4s, v31.4h, v6.4h\n"
+ "smlal2 v16.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x12, #0x50]\n"
+ "ldr x19, [x12, #0x58]\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "ldr x27, [x8, #0x80]\n"
"smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x26, [x8, #0x88]\n"
- "smlal v20.4s, v28.4h, v1.4h\n"
- "ldr x25, [x8, #0x90]\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x5]\n"
+ "ldr x22, [x12, #0x78]\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x23, x15]\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "ldr x24, [x8, #0x98]\n"
"smlal2 v10.4s, v29.8h, v1.8h\n"
- "ldr d29, [x23, x5]\n"
+ "ldr d29, [x24, x15]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "ldr x23, [x8, #0xa0]\n"
- "smlal2 v23.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x5]\n"
+ "smlal v9.4s, v27.4h, v2.4h\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x15]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "ldr x22, [x8, #0xa8]\n"
"smlal2 v10.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x5]\n"
+ "ldr d26, [x19, x15]\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v9.4s, v24.4h, v0.4h\n"
+ "smlal2 v16.4s, v24.8h, v0.8h\n"
+ "ldr x21, [x12, #0x80]\n"
+ "ldr x19, [x12, #0x68]\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "ldr x21, [x8, #0xb0]\n"
"smlal2 v10.4s, v25.8h, v4.8h\n"
- "ldr d25, [x19, x5]\n"
+ "ldr d25, [x20, x15]\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
+ "smlal2 v16.4s, v29.8h, v4.8h\n"
+ "ldr x20, [x12, #0x88]\n"
+ "ldr d29, [x19, x15]\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "ldr x20, [x8, #0xb8]\n"
"smlal2 v10.4s, v24.8h, v2.8h\n"
- "ldr x19, [x8, #0xc0]\n"
- "smlal v20.4s, v24.4h, v0.4h\n"
- "ldr q21, [x17, #0x0]\n"
- "smlal2 v23.4s, v24.8h, v0.8h\n"
- "ldr d24, [x9, x5]\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal v20.4s, v29.4h, v4.4h\n"
- "ldr q30, [x15, #0x0]\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x10, x5]\n"
+ "ldr x19, [x12, #0x70]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v28.4h, v5.4h\n"
- "ldr q31, [x17, #0x10]\n"
- "smlal2 v23.4s, v28.8h, v5.8h\n"
- "ldr d28, [x27, x5]\n"
- "add x17, x17, #0x20\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v16.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x15]\n"
+ "ssubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v22.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ldr x24, [x12, #0x98]\n"
+ "ldr d24, [x19, x15]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "ldr q9, [x15, #0x10]\n"
- "add x15, x15, #0x20\n"
"smlal2 v10.4s, v27.8h, v5.8h\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "smlal v20.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
- "ldr d27, [x28, x5]\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x23, [x12, #0x90]\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x22, x15]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v16.4s, v26.4h, v3.4h\n"
- "smlal2 v22.4s, v26.8h, v3.8h\n"
- "ldr d26, [x26, x5]\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal v22.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x12, #0xa8]\n"
+ "ldr x19, [x12, #0xa0]\n"
+ "smlal2 v21.4s, v26.8h, v3.8h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr d26, [x20, x15]\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v23.4s, v27.4h, v4.4h\n"
+ "smlal v22.4s, v25.4h, v0.4h\n"
+ "ldr x21, [x12, #0xb0]\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x19, x15]\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
"smlal v15.4s, v25.4h, v6.4h\n"
+ "ldr x19, [x12, #0xc0]\n"
+ "ldr q19, [x13, #0x0]\n"
"smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v16.4s, v25.4h, v0.4h\n"
- "smlal2 v22.4s, v25.8h, v0.8h\n"
- "ldr d25, [x25, x5]\n"
+ "smlal v22.4s, v29.4h, v4.4h\n"
+ "ldr d25, [x23, x15]\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v16.4s, v29.4h, v4.4h\n"
- "smlal2 v22.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x5]\n"
+ "smlal2 v21.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v23.4s, v26.4h, v5.4h\n"
"smlal v15.4s, v24.4h, v7.4h\n"
+ "ldr q0, [x11, #0x0]\n"
+ "ldr q4, [x13, #0x10]\n"
"smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v16.4s, v24.4h, v1.4h\n"
- "smlal2 v22.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x5]\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "ldr d27, [x23, x5]\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v20.4s, v28.4h, v7.4h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "smlal v17.4s, v28.4h, v1.4h\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
- "smlal v16.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x5]\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v17.4s, v26.4h, v5.4h\n"
+ "smlal v22.4s, v24.4h, v1.4h\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "ldr q31, [x11, #0x10]\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x15]\n"
"smlal2 v18.4s, v26.8h, v5.8h\n"
- "ldr d26, [x21, x5]\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v20.4s, v29.4h, v8.4h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "smlal v17.4s, v29.4h, v2.4h\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "ldr d26, [x21, x15]\n"
"smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr d29, [x19, x5]\n"
- "add x5, x5, #0x8\n"
- "smlal v16.4s, v27.4h, v7.4h\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v22.4s, v27.8h, v7.8h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v16.4s, v24.4h, v5.4h\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v22.4s, v25.4h, v6.4h\n"
+ "smlal v23.4s, v24.4h, v3.4h\n"
+ "and v30.16b, v15.16b, v0.16b\n"
+ "tst x8, #0x7\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "smlal2 v16.4s, v28.8h, v7.8h\n"
+ "sqdmulh v10.4s, v10.4s, v4.4s\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v21.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x20, x15]\n"
"smlal2 v18.4s, v24.8h, v3.8h\n"
- "sqrdmulh v15.4s, v15.4s, v21.4s\n"
- "smlal2 v22.4s, v24.8h, v5.8h\n"
- "smlal v17.4s, v26.4h, v7.4h\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v22.4s, v27.4h, v7.4h\n"
+ "smlal v23.4s, v26.4h, v7.4h\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "add x11, x11, #0x20\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v16.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x19, x15]\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal2 v21.4s, v27.8h, v7.8h\n"
"smlal2 v18.4s, v26.8h, v7.8h\n"
- "smlal v16.4s, v25.4h, v8.4h\n"
- "smlal2 v22.4s, v25.8h, v8.8h\n"
- "smlal v17.4s, v25.4h, v6.4h\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "add x15, x15, #0x8\n"
+ "smlal v22.4s, v24.4h, v5.4h\n"
+ "smlal v23.4s, v25.4h, v6.4h\n"
+ "and v28.16b, v9.16b, v0.16b\n"
+ "smlal2 v21.4s, v24.8h, v5.8h\n"
"smlal2 v18.4s, v25.8h, v6.8h\n"
- "and v26.16b, v15.16b, v30.16b\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "smlal v17.4s, v29.4h, v8.4h\n"
+ "sqdmulh v16.4s, v16.4s, v4.4s\n"
+ "smlal v22.4s, v25.4h, v8.4h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "sqdmulh v22.4s, v22.4s, v19.4s\n"
+ "smlal2 v21.4s, v25.8h, v8.8h\n"
"smlal2 v18.4s, v29.8h, v8.8h\n"
- "sqrdmulh v10.4s, v10.4s, v31.4s\n"
- "sqrdmulh v20.4s, v20.4s, v21.4s\n"
- "sqrdmulh v23.4s, v23.4s, v31.4s\n"
- "sqrdmulh v16.4s, v16.4s, v21.4s\n"
- "sqadd v15.4s, v15.4s, v26.4s\n"
- "and v8.16b, v10.16b, v9.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v30.4s\n"
- "and v4.16b, v20.16b, v30.16b\n"
+ "sqdmulh v23.4s, v23.4s, v19.4s\n"
+ "and v29.16b, v22.16b, v0.16b\n"
+ "sqdmulh v21.4s, v21.4s, v4.4s\n"
+ "and v20.16b, v23.16b, v0.16b\n"
+ "sqdmulh v18.4s, v18.4s, v4.4s\n"
+ "and v19.16b, v10.16b, v31.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v31.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v5.16b, v21.16b, v31.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v26.16b, v18.16b, v31.16b\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v28.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "and v2.16b, v23.16b, v9.16b\n"
- "and v1.16b, v16.16b, v30.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "add v15.4s, v15.4s, v11.4s\n"
- "sqadd v10.4s, v10.4s, v8.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "smin v15.4s, v15.4s, v14.4s\n"
- "srshl v10.4s, v10.4s, v9.4s\n"
- "sqadd v23.4s, v23.4s, v2.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
- "srshl v20.4s, v20.4s, v30.4s\n"
- "add v10.4s, v10.4s, v11.4s\n"
- "srshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v1.4s\n"
- "smin v10.4s, v10.4s, v14.4s\n"
- "add v20.4s, v20.4s, v11.4s\n"
- "add v23.4s, v23.4s, v11.4s\n"
- "smax v10.4s, v10.4s, v19.4s\n"
- "smin v20.4s, v20.4s, v14.4s\n"
- "smin v23.4s, v23.4s, v14.4s\n"
- "uzp1 v15.16b, v15.16b, v10.16b\n"
- "smax v20.4s, v20.4s, v19.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v20.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v0.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "sqadd v18.4s, v18.4s, v26.4s\n"
+ "srshl v10.4s, v10.4s, v31.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v22.4h, v22.4s\n"
+ "srshl v18.4s, v18.4s, v31.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v15.8h, v10.4s\n"
+ "sqxtn2 v9.8h, v16.4s\n"
+ "sqxtn2 v22.8h, v21.4s\n"
+ "sqxtn2 v23.8h, v18.4s\n"
+ "sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v9.8h, v9.8h, v11.8h\n"
+ "sqadd v22.8h, v22.8h, v11.8h\n"
+ "sqadd v23.8h, v23.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smax v22.8h, v22.8h, v17.8h\n"
+ "smax v23.8h, v23.8h, v17.8h\n"
+ "smin v15.8h, v15.8h, v14.8h\n"
+ "smin v9.8h, v9.8h, v14.8h\n"
+ "smin v22.8h, v22.8h, v14.8h\n"
+ "smin v23.8h, v23.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x14, x7]\n"
- "smax v23.4s, v23.4s, v19.4s\n"
- "srshl v16.4s, v16.4s, v30.4s\n"
- "and v24.16b, v22.16b, v9.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "uzp1 v20.16b, v20.16b, v23.16b\n"
- "add v16.4s, v16.4s, v11.4s\n"
- "sqrdmulh v17.4s, v17.4s, v21.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d20, [x13, x7]\n"
- "smin v16.4s, v16.4s, v14.4s\n"
- "sqrdmulh v18.4s, v18.4s, v31.4s\n"
- "sqadd v22.4s, v22.4s, v24.4s\n"
- "and v2.16b, v17.16b, v30.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "smax v16.4s, v16.4s, v19.4s\n"
- "srshl v22.4s, v22.4s, v9.4s\n"
- "and v31.16b, v18.16b, v9.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "add v22.4s, v22.4s, v11.4s\n"
- "sqadd v17.4s, v17.4s, v2.4s\n"
- "smin v22.4s, v22.4s, v14.4s\n"
- "srshl v17.4s, v17.4s, v30.4s\n"
- "sqadd v18.4s, v18.4s, v31.4s\n"
- "smax v22.4s, v22.4s, v19.4s\n"
- "uzp1 v16.16b, v16.16b, v22.16b\n"
- "add v17.4s, v17.4s, v11.4s\n"
- "srshl v18.4s, v18.4s, v9.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d16, [x12, x7]\n"
- "smin v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v11.4s\n"
- "smax v17.4s, v17.4s, v19.4s\n"
- "smin v18.4s, v18.4s, v14.4s\n"
- "smax v18.4s, v18.4s, v19.4s\n"
- "uzp1 v17.16b, v17.16b, v18.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d17, [x11, x7]\n"
- "add x7, x7, #0x8\n"
+ "str d15, [x10, x14]\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str d9, [x9, x14]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d22, [x28, x14]\n"
+ "str d23, [x27, x14]\n"
+ "add x14, x14, #0x8\n"
"beq 88f\n"
- "add x6, x6, #0x48\n"
+ "add x17, x17, #0x48\n"
"3:" // Oddments
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x4, #2, 5f\n"
+ "tbz x8, #2, 5f\n"
"ld1 { v15.4s }, [x19], #0x10\n"
- "tbz x4, #1, 4f\n"
+ "tbz x8, #1, 4f\n"
"ld1 { v10.d }[0], [x19], #0x8\n"
- "tbz x4, #0, 7f\n"
+ "tbz x8, #0, 7f\n"
"ld1 { v10.s }[2], [x19]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x4, #0, 7f\n"
+ "tbz x8, #0, 7f\n"
"ld1 { v10.s }[0], [x19]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x4, #1, 6f\n"
+ "tbz x8, #1, 6f\n"
"ld1 { v15.d }[0], [x19], #0x8\n"
- "tbz x4, #0, 7f\n"
+ "tbz x8, #0, 7f\n"
"ld1 { v15.s }[2], [x19]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 7f\n"
+ "tbz x8, #0, 7f\n"
"ld1 { v15.s }[0], [x19]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "mov v20.16b, v15.16b\n"
- "ldr d0, [x6, #0x0]\n"
- "mov v23.16b, v10.16b\n"
- "ldr d1, [x6, #0x8]\n"
- "mov v16.16b, v15.16b\n"
- "ldr d2, [x6, #0x10]\n"
- "mov v22.16b, v10.16b\n"
- "ldr d3, [x6, #0x18]\n"
- "mov v17.16b, v15.16b\n"
- "ldr d4, [x6, #0x20]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "mov v9.16b, v15.16b\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr d2, [x17, #0x10]\n"
+ "ldr d3, [x17, #0x18]\n"
+ "mov v22.16b, v15.16b\n"
+ "mov v21.16b, v10.16b\n"
+ "ldr d4, [x17, #0x20]\n"
+ "ldr d5, [x17, #0x28]\n"
+ "mov v23.16b, v15.16b\n"
"mov v18.16b, v10.16b\n"
- "ldr d5, [x6, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
"ssubl v1.8h, v1.8b, v13.8b\n"
- "ldr d6, [x6, #0x30]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ldp x26, x25, [x12, #0x0]\n"
"ssubl v2.8h, v2.8b, v13.8b\n"
- "ldr d7, [x6, #0x38]\n"
"ssubl v3.8h, v3.8b, v13.8b\n"
- "ldr d8, [x6, #0x40]\n"
+ "ldp x24, x23, [x12, #0x10]\n"
+ "ldp x22, x21, [x12, #0x20]\n"
"ssubl v4.8h, v4.8b, v13.8b\n"
- "ldp x26, x25, [x8, #0x0]\n"
"ssubl v5.8h, v5.8b, v13.8b\n"
- "ldp x24, x23, [x8, #0x10]\n"
+ "ldp x20, x19, [x12, #0x30]\n"
"ssubl v6.8h, v6.8b, v13.8b\n"
"ssubl v7.8h, v7.8b, v13.8b\n"
- "ldp x22, x21, [x8, #0x20]\n"
"ssubl v8.8h, v8.8b, v13.8b\n"
- "ldp x20, x19, [x8, #0x30]\n"
- "add x26, x26, x5\n"
- "add x25, x25, x5\n"
- "add x24, x24, x5\n"
- "add x23, x23, x5\n"
- "add x22, x22, x5\n"
- "add x21, x21, x5\n"
- "add x20, x20, x5\n"
- "add x19, x19, x5\n"
- "tbz x4, #2, 9f\n"
+ "add x26, x26, x15\n"
+ "add x25, x25, x15\n"
+ "add x24, x24, x15\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 9f\n"
"ld1 { v31.s }[0], [x26], #0x4\n"
"ld1 { v30.s }[0], [x25], #0x4\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
@@ -713,7 +697,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v26.s }[0], [x21], #0x4\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
"ld1 { v24.s }[0], [x19], #0x4\n"
- "tbz x4, #1, 8f\n"
+ "tbz x8, #1, 8f\n"
"ld1 { v31.h }[2], [x26], #0x2\n"
"ld1 { v30.h }[2], [x25], #0x2\n"
"ld1 { v29.h }[2], [x24], #0x2\n"
@@ -722,7 +706,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v26.h }[2], [x21], #0x2\n"
"ld1 { v25.h }[2], [x20], #0x2\n"
"ld1 { v24.h }[2], [x19], #0x2\n"
- "tbz x4, #0, 11f\n"
+ "tbz x8, #0, 11f\n"
"ld1 { v31.b }[6], [x26]\n"
"ld1 { v30.b }[6], [x25]\n"
"ld1 { v29.b }[6], [x24]\n"
@@ -733,7 +717,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v24.b }[6], [x19]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x4, #0, 11f\n"
+ "tbz x8, #0, 11f\n"
"ld1 { v31.b }[4], [x26]\n"
"ld1 { v30.b }[4], [x25]\n"
"ld1 { v29.b }[4], [x24]\n"
@@ -744,7 +728,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v24.b }[4], [x19]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x4, #1, 10f\n"
+ "tbz x8, #1, 10f\n"
"ld1 { v31.h }[0], [x26], #0x2\n"
"ld1 { v30.h }[0], [x25], #0x2\n"
"ld1 { v29.h }[0], [x24], #0x2\n"
@@ -753,7 +737,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v26.h }[0], [x21], #0x2\n"
"ld1 { v25.h }[0], [x20], #0x2\n"
"ld1 { v24.h }[0], [x19], #0x2\n"
- "tbz x4, #0, 11f\n"
+ "tbz x8, #0, 11f\n"
"ld1 { v31.b }[2], [x26]\n"
"ld1 { v30.b }[2], [x25]\n"
"ld1 { v29.b }[2], [x24]\n"
@@ -764,7 +748,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v24.b }[2], [x19]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 11f\n"
+ "tbz x8, #0, 11f\n"
"ld1 { v31.b }[0], [x26]\n"
"ld1 { v30.b }[0], [x25]\n"
"ld1 { v29.b }[0], [x24]\n"
@@ -774,646 +758,636 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v25.b }[0], [x20]\n"
"ld1 { v24.b }[0], [x19]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ldr x23, [x8, #0x40]\n"
"ssubl v31.8h, v31.8b, v12.8b\n"
"smlal v15.4s, v31.4h, v8.4h\n"
- "ssubl v30.8h, v30.8b, v12.8b\n"
"smlal2 v10.4s, v31.8h, v8.8h\n"
- "ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v31.4h, v6.4h\n"
- "ssubl v28.8h, v28.8b, v12.8b\n"
- "smlal2 v23.4s, v31.8h, v6.8h\n"
- "ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v16.4s, v31.4h, v2.4h\n"
- "ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal2 v22.4s, v31.8h, v2.8h\n"
- "ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v17.4s, v31.4h, v0.4h\n"
- "ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "add x23, x23, x5\n"
+ "ldr x24, [x12, #0x40]\n"
+ "ssubl v30.8h, v30.8b, v12.8b\n"
"smlal v15.4s, v30.4h, v0.4h\n"
"smlal2 v10.4s, v30.8h, v0.8h\n"
- "smlal v20.4s, v28.4h, v1.4h\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "add x24, x24, x15\n"
+ "ssubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v9.4s, v31.4h, v6.4h\n"
+ "smlal2 v16.4s, v31.8h, v6.8h\n"
"smlal v15.4s, v29.4h, v1.4h\n"
"smlal2 v10.4s, v29.8h, v1.8h\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "smlal2 v23.4s, v27.8h, v2.8h\n"
+ "ssubl v28.8h, v28.8b, v12.8b\n"
+ "ssubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
"smlal v15.4s, v26.4h, v3.4h\n"
"smlal2 v10.4s, v26.8h, v3.8h\n"
- "smlal v20.4s, v24.4h, v0.4h\n"
- "smlal2 v23.4s, v24.8h, v0.8h\n"
+ "ssubl v27.8h, v27.8b, v12.8b\n"
+ "ssubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v9.4s, v27.4h, v2.4h\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v4.4h\n"
"smlal2 v10.4s, v25.8h, v4.8h\n"
+ "ssubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v22.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
"smlal v15.4s, v24.4h, v2.4h\n"
"smlal2 v10.4s, v24.8h, v2.8h\n"
- "tbz x4, #2, 13f\n"
- "ld1 { v29.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 12f\n"
- "ld1 { v29.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v29.b }[6], [x23]\n"
+ "smlal v9.4s, v24.4h, v0.4h\n"
+ "smlal2 v16.4s, v24.8h, v0.8h\n"
+ "tbz x8, #2, 13f\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "tbz x8, #1, 12f\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[6], [x24]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v29.b }[4], [x23]\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[4], [x24]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x4, #1, 14f\n"
- "ld1 { v29.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v29.b }[2], [x23]\n"
+ "tbz x8, #1, 14f\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[2], [x24]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v29.b }[0], [x23]\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[0], [x24]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ldr x22, [x8, #0x48]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v29.4h, v4.4h\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "add x22, x22, x5\n"
- "tbz x4, #2, 17f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 16f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "ldr x23, [x12, #0x48]\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
+ "smlal2 v16.4s, v29.8h, v4.8h\n"
+ "add x23, x23, x15\n"
+ "tbz x8, #2, 17f\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "tbz x8, #1, 16f\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[6], [x23]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[4], [x23]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x4, #1, 18f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "tbz x8, #1, 18f\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[2], [x23]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[0], [x23]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "ldr x21, [x8, #0x50]\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
- "smlal v20.4s, v28.4h, v5.4h\n"
- "smlal2 v23.4s, v28.8h, v5.8h\n"
- "add x21, x21, x5\n"
- "tbz x4, #2, 21f\n"
+ "ldr x21, [x12, #0x50]\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v16.4s, v28.8h, v5.8h\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 21f\n"
"ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 20f\n"
+ "tbz x8, #1, 20f\n"
"ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 23f\n"
+ "tbz x8, #0, 23f\n"
"ld1 { v27.b }[6], [x21]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 23f\n"
+ "tbz x8, #0, 23f\n"
"ld1 { v27.b }[4], [x21]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
- "tbz x4, #1, 22f\n"
+ "tbz x8, #1, 22f\n"
"ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 23f\n"
+ "tbz x8, #0, 23f\n"
"ld1 { v27.b }[2], [x21]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 23f\n"
+ "tbz x8, #0, 23f\n"
"ld1 { v27.b }[0], [x21]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
- "ldr x20, [x8, #0x58]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
+ "ldr x19, [x12, #0x58]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
"smlal2 v10.4s, v27.8h, v5.8h\n"
- "add x20, x20, x5\n"
- "smlal v20.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
- "tbz x4, #2, 25f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 24f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 25f\n"
+ "ld1 { v26.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 24f\n"
+ "ld1 { v26.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v26.b }[6], [x19]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v26.b }[4], [x19]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x4, #1, 26f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "tbz x8, #1, 26f\n"
+ "ld1 { v26.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v26.b }[2], [x19]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v26.b }[0], [x19]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "ldr x19, [x8, #0x60]\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v16.4s, v26.4h, v3.4h\n"
- "smlal2 v22.4s, v26.8h, v3.8h\n"
- "add x19, x19, x5\n"
- "tbz x4, #2, 29f\n"
- "ld1 { v25.s }[0], [x19], #0x4\n"
- "tbz x4, #1, 28f\n"
- "ld1 { v25.h }[2], [x19], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v25.b }[6], [x19]\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v22.4s, v26.4h, v3.4h\n"
+ "smlal2 v21.4s, v26.8h, v3.8h\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 29f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 28f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v25.b }[4], [x19]\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x4, #1, 30f\n"
- "ld1 { v25.h }[0], [x19], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v25.b }[2], [x19]\n"
+ "tbz x8, #1, 30f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v25.b }[0], [x19]\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "ldr x10, [x8, #0x68]\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
+ "ldr x19, [x12, #0x68]\n"
"smlal v15.4s, v25.4h, v6.4h\n"
"smlal2 v10.4s, v25.8h, v6.8h\n"
- "add x10, x10, x5\n"
- "smlal v16.4s, v25.4h, v0.4h\n"
- "smlal2 v22.4s, v25.8h, v0.8h\n"
- "tbz x4, #2, 33f\n"
- "ld1 { v29.s }[0], [x10], #0x4\n"
- "tbz x4, #1, 32f\n"
- "ld1 { v29.h }[2], [x10], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v29.b }[6], [x10]\n"
+ "smlal v22.4s, v25.4h, v0.4h\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 33f\n"
+ "ld1 { v29.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 32f\n"
+ "ld1 { v29.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[6], [x19]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v29.b }[4], [x10]\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[4], [x19]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x4, #1, 34f\n"
- "ld1 { v29.h }[0], [x10], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v29.b }[2], [x10]\n"
+ "tbz x8, #1, 34f\n"
+ "ld1 { v29.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[2], [x19]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v29.b }[0], [x10]\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[0], [x19]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "ldr x9, [x8, #0x70]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v16.4s, v29.4h, v4.4h\n"
- "smlal2 v22.4s, v29.8h, v4.8h\n"
- "add x9, x9, x5\n"
- "tbz x4, #2, 37f\n"
- "ld1 { v24.s }[0], [x9], #0x4\n"
- "tbz x4, #1, 36f\n"
- "ld1 { v24.h }[2], [x9], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v24.b }[6], [x9]\n"
+ "ldr x19, [x12, #0x70]\n"
+ "smlal v22.4s, v29.4h, v4.4h\n"
+ "smlal2 v21.4s, v29.8h, v4.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 37f\n"
+ "ld1 { v24.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 36f\n"
+ "ld1 { v24.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[6], [x19]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v24.b }[4], [x9]\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[4], [x19]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x4, #1, 38f\n"
- "ld1 { v24.h }[0], [x9], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v24.b }[2], [x9]\n"
+ "tbz x8, #1, 38f\n"
+ "ld1 { v24.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[2], [x19]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v24.b }[0], [x9]\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[0], [x19]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "ldr x28, [x8, #0x78]\n"
"ssubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x22, [x12, #0x78]\n"
"smlal v15.4s, v24.4h, v7.4h\n"
"smlal2 v10.4s, v24.8h, v7.8h\n"
- "add x28, x28, x5\n"
- "smlal v16.4s, v24.4h, v1.4h\n"
- "smlal2 v22.4s, v24.8h, v1.8h\n"
- "tbz x4, #2, 41f\n"
- "ld1 { v27.s }[0], [x28], #0x4\n"
- "tbz x4, #1, 40f\n"
- "ld1 { v27.h }[2], [x28], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v27.b }[6], [x28]\n"
+ "smlal v22.4s, v24.4h, v1.4h\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "add x22, x22, x15\n"
+ "tbz x8, #2, 41f\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "tbz x8, #1, 40f\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[6], [x22]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v27.b }[4], [x28]\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[4], [x22]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x4, #1, 42f\n"
- "ld1 { v27.h }[0], [x28], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v27.b }[2], [x28]\n"
+ "tbz x8, #1, 42f\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[2], [x22]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v27.b }[0], [x28]\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[0], [x22]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "ldr x27, [x8, #0x80]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
+ "ldr x21, [x12, #0x80]\n"
+ "smlal v23.4s, v27.4h, v4.4h\n"
"smlal2 v18.4s, v27.8h, v4.8h\n"
- "add x27, x27, x5\n"
- "tbz x4, #2, 45f\n"
- "ld1 { v28.s }[0], [x27], #0x4\n"
- "tbz x4, #1, 44f\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v28.b }[6], [x27]\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 45f\n"
+ "ld1 { v28.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 44f\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[6], [x21]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v28.b }[4], [x27]\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[4], [x21]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x4, #1, 46f\n"
- "ld1 { v28.h }[0], [x27], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v28.b }[2], [x27]\n"
+ "tbz x8, #1, 46f\n"
+ "ld1 { v28.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[2], [x21]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v28.b }[0], [x27]\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[0], [x21]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ldr x26, [x8, #0x88]\n"
"ssubl v28.8h, v28.8b, v12.8b\n"
- "smlal v20.4s, v28.4h, v7.4h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "add x26, x26, x5\n"
- "smlal v17.4s, v28.4h, v1.4h\n"
+ "ldr x20, [x12, #0x88]\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "smlal2 v16.4s, v28.8h, v7.8h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
"smlal2 v18.4s, v28.8h, v1.8h\n"
- "tbz x4, #2, 49f\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
- "tbz x4, #1, 48f\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v26.b }[6], [x26]\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 49f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 48f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v26.b }[4], [x26]\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x4, #1, 50f\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v26.b }[2], [x26]\n"
+ "tbz x8, #1, 50f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v26.b }[0], [x26]\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "ldr x25, [x8, #0x90]\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v17.4s, v26.4h, v5.4h\n"
+ "ldr x23, [x12, #0x90]\n"
+ "smlal v23.4s, v26.4h, v5.4h\n"
"smlal2 v18.4s, v26.8h, v5.8h\n"
- "add x25, x25, x5\n"
- "tbz x4, #2, 53f\n"
- "ld1 { v25.s }[0], [x25], #0x4\n"
- "tbz x4, #1, 52f\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v25.b }[6], [x25]\n"
+ "add x23, x23, x15\n"
+ "tbz x8, #2, 53f\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "tbz x8, #1, 52f\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[6], [x23]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v25.b }[4], [x25]\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[4], [x23]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x4, #1, 54f\n"
- "ld1 { v25.h }[0], [x25], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v25.b }[2], [x25]\n"
+ "tbz x8, #1, 54f\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[2], [x23]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v25.b }[0], [x25]\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[0], [x23]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "ldr x24, [x8, #0x98]\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v16.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "add x24, x24, x5\n"
- "tbz x4, #2, 57f\n"
+ "ldr x24, [x12, #0x98]\n"
+ "smlal v22.4s, v25.4h, v6.4h\n"
+ "smlal2 v21.4s, v25.8h, v6.8h\n"
+ "add x24, x24, x15\n"
+ "tbz x8, #2, 57f\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x4, #1, 56f\n"
+ "tbz x8, #1, 56f\n"
"ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x4, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v29.b }[6], [x24]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v29.b }[4], [x24]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x4, #1, 58f\n"
+ "tbz x8, #1, 58f\n"
"ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x4, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v29.b }[2], [x24]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v29.b }[0], [x24]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "ldr x23, [x8, #0xa0]\n"
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v29.4h, v8.4h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "add x23, x23, x5\n"
- "smlal v17.4s, v29.4h, v2.4h\n"
+ "ldr x19, [x12, #0xa0]\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v16.4s, v29.8h, v8.8h\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
"smlal2 v18.4s, v29.8h, v2.8h\n"
- "tbz x4, #2, 61f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 60f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 61f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 60f\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[6], [x19]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[4], [x19]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x4, #1, 62f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "tbz x8, #1, 62f\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[2], [x19]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[0], [x19]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "ldr x22, [x8, #0xa8]\n"
"ssubl v27.8h, v27.8b, v12.8b\n"
- "smlal v16.4s, v27.4h, v7.4h\n"
- "smlal2 v22.4s, v27.8h, v7.8h\n"
- "add x22, x22, x5\n"
- "tbz x4, #2, 65f\n"
+ "ldr x22, [x12, #0xa8]\n"
+ "smlal v22.4s, v27.4h, v7.4h\n"
+ "smlal2 v21.4s, v27.8h, v7.8h\n"
+ "add x22, x22, x15\n"
+ "tbz x8, #2, 65f\n"
"ld1 { v24.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 64f\n"
+ "tbz x8, #1, 64f\n"
"ld1 { v24.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 67f\n"
+ "tbz x8, #0, 67f\n"
"ld1 { v24.b }[6], [x22]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 67f\n"
+ "tbz x8, #0, 67f\n"
"ld1 { v24.b }[4], [x22]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x4, #1, 66f\n"
+ "tbz x8, #1, 66f\n"
"ld1 { v24.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 67f\n"
+ "tbz x8, #0, 67f\n"
"ld1 { v24.b }[2], [x22]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 67f\n"
+ "tbz x8, #0, 67f\n"
"ld1 { v24.b }[0], [x22]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "ldr x21, [x8, #0xb0]\n"
"ssubl v24.8h, v24.8b, v12.8b\n"
- "smlal v16.4s, v24.4h, v5.4h\n"
- "smlal2 v22.4s, v24.8h, v5.8h\n"
- "add x21, x21, x5\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
+ "ldr x21, [x12, #0xb0]\n"
+ "smlal v22.4s, v24.4h, v5.4h\n"
+ "smlal2 v21.4s, v24.8h, v5.8h\n"
+ "smlal v23.4s, v24.4h, v3.4h\n"
"smlal2 v18.4s, v24.8h, v3.8h\n"
- "tbz x4, #2, 69f\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 69f\n"
"ld1 { v26.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 68f\n"
+ "tbz x8, #1, 68f\n"
"ld1 { v26.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 71f\n"
+ "tbz x8, #0, 71f\n"
"ld1 { v26.b }[6], [x21]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 71f\n"
+ "tbz x8, #0, 71f\n"
"ld1 { v26.b }[4], [x21]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x4, #1, 70f\n"
+ "tbz x8, #1, 70f\n"
"ld1 { v26.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 71f\n"
+ "tbz x8, #0, 71f\n"
"ld1 { v26.b }[2], [x21]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 71f\n"
+ "tbz x8, #0, 71f\n"
"ld1 { v26.b }[0], [x21]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "ldr x20, [x8, #0xb8]\n"
"ssubl v26.8h, v26.8b, v12.8b\n"
- "smlal v17.4s, v26.4h, v7.4h\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "smlal v23.4s, v26.4h, v7.4h\n"
"smlal2 v18.4s, v26.8h, v7.8h\n"
- "add x20, x20, x5\n"
- "tbz x4, #2, 73f\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 73f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 72f\n"
+ "tbz x8, #1, 72f\n"
"ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 75f\n"
+ "tbz x8, #0, 75f\n"
"ld1 { v25.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 75f\n"
+ "tbz x8, #0, 75f\n"
"ld1 { v25.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x4, #1, 74f\n"
+ "tbz x8, #1, 74f\n"
"ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 75f\n"
+ "tbz x8, #0, 75f\n"
"ld1 { v25.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 75f\n"
+ "tbz x8, #0, 75f\n"
"ld1 { v25.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "ldr x19, [x8, #0xc0]\n"
"ssubl v25.8h, v25.8b, v12.8b\n"
- "smlal v16.4s, v25.4h, v8.4h\n"
- "smlal2 v22.4s, v25.8h, v8.8h\n"
- "add x19, x19, x5\n"
- "smlal v17.4s, v25.4h, v6.4h\n"
+ "ldr x19, [x12, #0xc0]\n"
+ "smlal v22.4s, v25.4h, v8.4h\n"
+ "smlal2 v21.4s, v25.8h, v8.8h\n"
+ "smlal v23.4s, v25.4h, v6.4h\n"
"smlal2 v18.4s, v25.8h, v6.8h\n"
- "tbz x4, #2, 77f\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 77f\n"
"ld1 { v29.s }[0], [x19], #0x4\n"
- "tbz x4, #1, 76f\n"
+ "tbz x8, #1, 76f\n"
"ld1 { v29.h }[2], [x19], #0x2\n"
- "tbz x4, #0, 79f\n"
+ "tbz x8, #0, 79f\n"
"ld1 { v29.b }[6], [x19]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 79f\n"
+ "tbz x8, #0, 79f\n"
"ld1 { v29.b }[4], [x19]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x4, #1, 78f\n"
+ "tbz x8, #1, 78f\n"
"ld1 { v29.h }[0], [x19], #0x2\n"
- "tbz x4, #0, 79f\n"
+ "tbz x8, #0, 79f\n"
"ld1 { v29.b }[2], [x19]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 79f\n"
+ "tbz x8, #0, 79f\n"
"ld1 { v29.b }[0], [x19]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
"ssubl v29.8h, v29.8b, v12.8b\n"
- "smlal v17.4s, v29.4h, v8.4h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
"smlal2 v18.4s, v29.8h, v8.8h\n"
- "tbz x4, #2, 81f\n"
- "ld1 { v21.4s }, [x17], #0x10\n"
- "ld1 { v30.4s }, [x15], #0x10\n"
- "tbz x4, #1, 80f\n"
- "ld1 { v31.d }[0], [x17], #0x8\n"
- "ld1 { v9.d }[0], [x15], #0x8\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v31.s }[2], [x17]\n"
- "ld1 { v9.s }[2], [x15]\n"
+ "tbz x8, #2, 81f\n"
+ "ld1 { v19.4s }, [x13], #0x10\n"
+ "ld1 { v0.4s }, [x11], #0x10\n"
+ "tbz x8, #1, 80f\n"
+ "ld1 { v4.d }[0], [x13], #0x8\n"
+ "ld1 { v31.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v4.s }[2], [x13]\n"
+ "ld1 { v31.s }[2], [x11]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v31.s }[0], [x17]\n"
- "ld1 { v9.s }[0], [x15]\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v4.s }[0], [x13]\n"
+ "ld1 { v31.s }[0], [x11]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x4, #1, 82f\n"
- "ld1 { v21.d }[0], [x17], #0x8\n"
- "ld1 { v30.d }[0], [x15], #0x8\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v21.s }[2], [x17]\n"
- "ld1 { v30.s }[2], [x15]\n"
+ "tbz x8, #1, 82f\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v0.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v0.s }[2], [x11]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v21.s }[0], [x17]\n"
- "ld1 { v30.s }[0], [x15]\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v0.s }[0], [x11]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v21.4s\n"
- "add x14, x14, x7\n"
- "sqrdmulh v10.4s, v10.4s, v31.4s\n"
- "add x13, x13, x7\n"
- "sqrdmulh v20.4s, v20.4s, v21.4s\n"
- "add x12, x12, x7\n"
- "sqrdmulh v23.4s, v23.4s, v31.4s\n"
- "add x11, x11, x7\n"
- "sqrdmulh v16.4s, v16.4s, v21.4s\n"
- "and v26.16b, v15.16b, v30.16b\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "and v8.16b, v10.16b, v9.16b\n"
- "and v4.16b, v20.16b, v30.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "and v2.16b, v23.16b, v9.16b\n"
- "and v1.16b, v16.16b, v30.16b\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "add x10, x10, x14\n"
+ "add x9, x9, x14\n"
+ "sqdmulh v22.4s, v22.4s, v19.4s\n"
+ "sqdmulh v23.4s, v23.4s, v19.4s\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "and v30.16b, v15.16b, v0.16b\n"
+ "sqdmulh v10.4s, v10.4s, v4.4s\n"
+ "and v28.16b, v9.16b, v0.16b\n"
+ "sqdmulh v16.4s, v16.4s, v4.4s\n"
+ "and v29.16b, v22.16b, v0.16b\n"
+ "sqdmulh v21.4s, v21.4s, v4.4s\n"
+ "and v20.16b, v23.16b, v0.16b\n"
+ "sqdmulh v18.4s, v18.4s, v4.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v19.16b, v10.16b, v31.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v31.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v5.16b, v21.16b, v31.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v26.16b, v18.16b, v31.16b\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v28.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v31.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v26.4s\n"
- "sqrdmulh v17.4s, v17.4s, v21.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sqrdmulh v18.4s, v18.4s, v31.4s\n"
- "sqadd v10.4s, v10.4s, v8.4s\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "srshl v15.4s, v15.4s, v30.4s\n"
- "sqadd v23.4s, v23.4s, v2.4s\n"
- "srshl v10.4s, v10.4s, v9.4s\n"
- "srshl v20.4s, v20.4s, v30.4s\n"
- "add v15.4s, v15.4s, v11.4s\n"
- "srshl v23.4s, v23.4s, v9.4s\n"
- "add v10.4s, v10.4s, v11.4s\n"
- "smin v15.4s, v15.4s, v14.4s\n"
- "add v20.4s, v20.4s, v11.4s\n"
- "smin v10.4s, v10.4s, v14.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
- "smin v20.4s, v20.4s, v14.4s\n"
- "smax v10.4s, v10.4s, v19.4s\n"
- "add v23.4s, v23.4s, v11.4s\n"
- "smax v20.4s, v20.4s, v19.4s\n"
- "uzp1 v15.16b, v15.16b, v10.16b\n"
- "smin v23.4s, v23.4s, v14.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v20.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v0.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "sqadd v18.4s, v18.4s, v26.4s\n"
+ "srshl v10.4s, v10.4s, v31.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v22.4h, v22.4s\n"
+ "srshl v18.4s, v18.4s, v31.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v15.8h, v10.4s\n"
+ "sqxtn2 v9.8h, v16.4s\n"
+ "sqxtn2 v22.8h, v21.4s\n"
+ "sqxtn2 v23.8h, v18.4s\n"
+ "sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v9.8h, v9.8h, v11.8h\n"
+ "sqadd v22.8h, v22.8h, v11.8h\n"
+ "sqadd v23.8h, v23.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smax v22.8h, v22.8h, v17.8h\n"
+ "smax v23.8h, v23.8h, v17.8h\n"
+ "smin v15.8h, v15.8h, v14.8h\n"
+ "smin v9.8h, v9.8h, v14.8h\n"
+ "smin v22.8h, v22.8h, v14.8h\n"
+ "smin v23.8h, v23.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "sqadd v16.4s, v16.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v19.4s\n"
- "and v24.16b, v22.16b, v9.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "uzp1 v20.16b, v20.16b, v23.16b\n"
- "srshl v16.4s, v16.4s, v30.4s\n"
- "and v2.16b, v17.16b, v30.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "add v16.4s, v16.4s, v11.4s\n"
- "sqadd v22.4s, v22.4s, v24.4s\n"
- "and v31.16b, v18.16b, v9.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "smin v16.4s, v16.4s, v14.4s\n"
- "srshl v22.4s, v22.4s, v9.4s\n"
- "sqadd v17.4s, v17.4s, v2.4s\n"
- "smax v16.4s, v16.4s, v19.4s\n"
- "add v22.4s, v22.4s, v11.4s\n"
- "srshl v17.4s, v17.4s, v30.4s\n"
- "sqadd v18.4s, v18.4s, v31.4s\n"
- "smin v22.4s, v22.4s, v14.4s\n"
- "add v17.4s, v17.4s, v11.4s\n"
- "srshl v18.4s, v18.4s, v9.4s\n"
- "smax v22.4s, v22.4s, v19.4s\n"
- "smin v17.4s, v17.4s, v14.4s\n"
- "uzp1 v16.16b, v16.16b, v22.16b\n"
- "add v18.4s, v18.4s, v11.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "smax v17.4s, v17.4s, v19.4s\n"
- "smin v18.4s, v18.4s, v14.4s\n"
- "smax v18.4s, v18.4s, v19.4s\n"
- "uzp1 v17.16b, v17.16b, v18.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "tbz x4, #2, 85f\n"
- "st1 { v15.s }[0], [x14], #0x4\n"
- "st1 { v20.s }[0], [x13], #0x4\n"
- "st1 { v16.s }[0], [x12], #0x4\n"
- "st1 { v17.s }[0], [x11], #0x4\n"
- "tbz x4, #1, 84f\n"
- "st1 { v15.h }[2], [x14], #0x2\n"
- "st1 { v20.h }[2], [x13], #0x2\n"
- "st1 { v16.h }[2], [x12], #0x2\n"
- "st1 { v17.h }[2], [x11], #0x2\n"
- "tbz x4, #0, 87f\n"
- "st1 { v15.b }[6], [x14], #0x1\n"
- "st1 { v20.b }[6], [x13], #0x1\n"
- "st1 { v16.b }[6], [x12], #0x1\n"
- "st1 { v17.b }[6], [x11], #0x1\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x8, #2, 85f\n"
+ "st1 { v15.s }[0], [x10], #0x4\n"
+ "st1 { v9.s }[0], [x9], #0x4\n"
+ "st1 { v22.s }[0], [x28], #0x4\n"
+ "st1 { v23.s }[0], [x27], #0x4\n"
+ "tbz x8, #1, 84f\n"
+ "st1 { v15.h }[2], [x10], #0x2\n"
+ "st1 { v9.h }[2], [x9], #0x2\n"
+ "st1 { v22.h }[2], [x28], #0x2\n"
+ "st1 { v23.h }[2], [x27], #0x2\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[6], [x10], #0x1\n"
+ "st1 { v9.b }[6], [x9], #0x1\n"
+ "st1 { v22.b }[6], [x28], #0x1\n"
+ "st1 { v23.b }[6], [x27], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "st1 { v15.b }[4], [x14], #0x1\n"
- "st1 { v20.b }[4], [x13], #0x1\n"
- "st1 { v16.b }[4], [x12], #0x1\n"
- "st1 { v17.b }[4], [x11], #0x1\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[4], [x10], #0x1\n"
+ "st1 { v9.b }[4], [x9], #0x1\n"
+ "st1 { v22.b }[4], [x28], #0x1\n"
+ "st1 { v23.b }[4], [x27], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
- "tbz x4, #1, 86f\n"
- "st1 { v15.h }[0], [x14], #0x2\n"
- "st1 { v20.h }[0], [x13], #0x2\n"
- "st1 { v16.h }[0], [x12], #0x2\n"
- "st1 { v17.h }[0], [x11], #0x2\n"
- "tbz x4, #0, 87f\n"
- "st1 { v15.b }[2], [x14], #0x1\n"
- "st1 { v20.b }[2], [x13], #0x1\n"
- "st1 { v16.b }[2], [x12], #0x1\n"
- "st1 { v17.b }[2], [x11], #0x1\n"
+ "tbz x8, #1, 86f\n"
+ "st1 { v15.h }[0], [x10], #0x2\n"
+ "st1 { v9.h }[0], [x9], #0x2\n"
+ "st1 { v22.h }[0], [x28], #0x2\n"
+ "st1 { v23.h }[0], [x27], #0x2\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[2], [x10], #0x1\n"
+ "st1 { v9.b }[2], [x9], #0x1\n"
+ "st1 { v22.b }[2], [x28], #0x1\n"
+ "st1 { v23.b }[2], [x27], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "st1 { v15.b }[0], [x14], #0x1\n"
- "st1 { v20.b }[0], [x13], #0x1\n"
- "st1 { v16.b }[0], [x12], #0x1\n"
- "st1 { v17.b }[0], [x11], #0x1\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[0], [x10], #0x1\n"
+ "st1 { v9.b }[0], [x9], #0x1\n"
+ "st1 { v22.b }[0], [x28], #0x1\n"
+ "st1 { v23.b }[0], [x27], #0x1\n"
"87:" // Oddments: Bit 2: End
-
"88:" // End
-
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
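
Note on the hunk above: besides renumbering the scratch registers (e.g. the channel-count tests move from x4 to x8), the patch reorders the requantisation tail. The old code offset and clamped each 32-bit accumulator register separately (add/smin/smax on .4s values) before pairing bytes with uzp1; the new code first narrows the high and low halves with sqxtn/sqxtn2 and then applies c_offset, minval and maxval as .8h vectors, once per 16-bit register. A scalar sketch of the per-lane arithmetic in the new ordering, assuming per-tensor requant parameters; the helper below is hypothetical, and the and/sshr/sqadd sign fixup that precedes srshl in the kernel is omitted for brevity:

    #include <algorithm>
    #include <cstdint>

    static int8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                  int16_t c_offset, int16_t minval, int16_t maxval)
    {
        // sqdmulh: high half of 2*acc*mul, saturated to int32
        int64_t hi = (2 * static_cast<int64_t>(acc) * mul) >> 32;
        hi = std::min<int64_t>(std::max<int64_t>(hi, INT32_MIN), INT32_MAX);

        // srshl with a negative shift amount: rounding arithmetic shift right
        if (shift < 0)
        {
            const int r = -shift;
            hi = (hi + (int64_t(1) << (r - 1))) >> r;
        }

        // sqxtn/sqxtn2: saturating narrow to 16 bits
        int32_t v = static_cast<int32_t>(std::min<int64_t>(std::max<int64_t>(hi, INT16_MIN), INT16_MAX));

        // sqadd (.8h) then smax/smin (.8h): offset and clamp once per 16-bit lane
        v = std::min<int32_t>(std::max<int32_t>(v + c_offset, minval), maxval);
        return static_cast<int8_t>(v);  // uzp1: keep the low byte of each lane
    }
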
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index a998fa16d6..52031e18da 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,37 +36,24 @@ namespace depthwise {
void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
-struct a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+class a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_5x5_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_5x5_mla::get_packed_size;
+ a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
- kern_type kernel = a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
- a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
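
The interface change above recurs across the kernel headers in this patch: the plain structs that exposed typedefs, tile geometry constants and static packing hooks become classes deriving from a common DepthwiseDepthfirstStrategy, which takes the geometry through its constructor and exposes the kernel pointer through a virtual accessor. A hedged usage sketch, assuming the constructor argument order (output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) implied by the Parent(2, 2, 5, 5, 1, 1) call and the arm_conv::depthwise namespace; the function and include path are illustrative:

    #include "a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp"  // path assumed

    void describe_strategy(const CPUInfo *cpu)
    {
        // Kernel properties are now reached through the strategy interface
        // rather than the removed per-struct constants.
        arm_conv::depthwise::a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst strat(cpu);
        auto kernel = strat.get_kernel();   // was: the kern_type data member
        auto vl     = strat.get_vl_type();  // was: constexpr static vl_type
        (void) kernel;
        (void) vl;
    }
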
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index ab64f53f66..bd71b65908 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const int8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
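
Both Params hunks make the same change: the packed weights now travel as const void * rather than const int8_t *, so this entry point no longer encodes the packed element type; the assembly indexes the buffer directly and never needs the C++ type back. A minimal, self-contained illustration of the type erasure (all names here are invented):

    #include <cstdint>

    struct ParamsSketch
    {
        unsigned long n_channels;
        const void   *weights;  // was: const int8_t *
    };

    int main()
    {
        static const int8_t packed[8] = {};
        ParamsSketch p{8, packed};  // implicit int8_t* -> const void*, no cast needed
        (void) p;
        return 0;
    }
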
@@ -111,2096 +111,2070 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
- "mov x10, #0x0\n"
- "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x1, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "add x25, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "lsr x19, x4, #0x3\n"
- "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x13, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v7.16b }, [x13]\n"
- "add x8, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1r { v19.4s }, [x8]\n"
- "add x8, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v16.4s }, [x20]\n"
- "ld1r { v12.4s }, [x8]\n"
- "ldp x17, x16, [x21, #0x0]\n"
- "ldp x6, x8, [x21, #0x10]\n"
- "cbz x19, 3f\n"
- "subs x19, x19, #0x1\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x12, #0x0]\n"
- "mov v18.16b, v15.16b\n"
- "ldr q20, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
- "mov v11.16b, v15.16b\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "add x17, x10, %[offsetof_Requantize32_a_offset]\n"
+ "add x9, x10, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x4, x10, %[offsetof_Requantize32_c_offset]\n"
+ "add x14, x10, %[offsetof_Requantize32_minval]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x5, x10, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v9.16b }, [x17]\n"
+ "ld1r { v14.16b }, [x9]\n"
+ "lsr x3, x0, #0x3\n"
+ "ld1r { v18.8h }, [x4]\n"
+ "ld1r { v11.8h }, [x14]\n"
+ "mov x24, #0x0\n"
+ "mov x22, #0x0\n"
+ "ld1r { v13.8h }, [x5]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x20, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x1, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x16, x8, [x25, #0x0]\n"
+ "ldp x4, x7, [x25, #0x10]\n"
+ "cbz x3, 3f\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x19, #0x0]\n"
+ "subs x3, x3, #0x1\n"
+ "mov v17.16b, v15.16b\n"
+ "ldr q16, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d0, [x23, #0x0]\n"
+ "ldr d1, [x23, #0x8]\n"
+ "ldr d2, [x23, #0x10]\n"
+ "mov v8.16b, v16.16b\n"
"mov v10.16b, v15.16b\n"
- "ldr d0, [x3, #0x0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "mov v5.16b, v20.16b\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v8.16b, v20.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "mov v9.16b, v20.16b\n"
- "ldr d3, [x3, #0x18]\n"
- "ldr d4, [x3, #0x20]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ldp x28, x27, [x25, #0x0]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "ldp x26, x13, [x25, #0x10]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ldp x24, x23, [x25, #0x20]\n"
- "ldp x22, x21, [x25, #0x30]\n"
- "ldp x20, x0, [x25, #0x40]\n"
- "ldr d31, [x28, x10]\n"
- "ssubl v31.8h, v31.8b, v7.8b\n"
- "ldr d30, [x27, x10]\n"
- "ldr d29, [x26, x10]\n"
- "ssubl v30.8h, v30.8b, v7.8b\n"
- "ldr d28, [x13, x10]\n"
- "ldr d27, [x24, x10]\n"
- "ssubl v29.8h, v29.8b, v7.8b\n"
- "ldr d23, [x23, x10]\n"
- "ssubl v28.8h, v28.8b, v7.8b\n"
- "ldr d25, [x22, x10]\n"
- "ldr d24, [x21, x10]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "ldr d26, [x20, x10]\n"
- "ssubl v23.8h, v23.8b, v7.8b\n"
- "ldr d22, [x0, x10]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "ssubl v26.8h, v26.8b, v7.8b\n"
- "ssubl v22.8h, v22.8b, v7.8b\n"
+ "ldr d3, [x23, #0x18]\n"
+ "ldr d4, [x23, #0x20]\n"
+ "mov v7.16b, v16.16b\n"
+ "mov v6.16b, v15.16b\n"
+ "ldp x28, x6, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "mov v5.16b, v16.16b\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "ldp x5, x2, [x20, #0x20]\n"
+ "ldp x27, x21, [x20, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ldp x12, x19, [x20, #0x40]\n"
+ "ldr d31, [x28, x24]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr d30, [x6, x24]\n"
+ "ldr d29, [x26, x24]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "ssubl v30.8h, v30.8b, v9.8b\n"
+ "ldr d28, [x25, x24]\n"
+ "ldr d27, [x5, x24]\n"
+ "ssubl v29.8h, v29.8b, v9.8b\n"
+ "ssubl v28.8h, v28.8b, v9.8b\n"
+ "ldr d23, [x2, x24]\n"
+ "ldr d25, [x27, x24]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "ldr d24, [x21, x24]\n"
+ "ldr d26, [x12, x24]\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "ldr d22, [x19, x24]\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
"beq 2f\n"
"1:" // Loop
"smlal v15.4s, v31.4h, v0.4h\n"
- "ldr x20, [x25, #0x50]\n"
- "subs x19, x19, #0x1\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ldr x28, [x25, #0x58]\n"
- "smlal v18.4s, v30.4h, v0.4h\n"
- "ldr x0, [x25, #0x60]\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x10]\n"
- "ssubl v31.8h, v31.8b, v7.8b\n"
- "smlal v11.4s, v29.4h, v0.4h\n"
- "ldr x7, [x25, #0x68]\n"
- "smlal2 v8.4s, v29.8h, v0.8h\n"
- "ldr x26, [x25, #0x70]\n"
- "smlal v10.4s, v28.4h, v0.4h\n"
- "ldr x23, [x25, #0x78]\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "ldr x19, [x20, #0x50]\n"
+ "ldr d31, [x19, x24]\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal v10.4s, v29.4h, v0.4h\n"
+ "ldr x15, [x20, #0x58]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v6.4s, v28.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x20, #0x60]\n"
+ "ldr x27, [x20, #0x68]\n"
+ "smlal2 v7.4s, v29.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "ldr x20, [x25, #0x80]\n"
- "smlal2 v20.4s, v30.8h, v1.8h\n"
- "ldr d30, [x28, x10]\n"
- "ssubl v30.8h, v30.8b, v7.8b\n"
- "smlal v18.4s, v27.4h, v1.4h\n"
- "ldr x22, [x25, #0x88]\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "ldr x13, [x25, #0x90]\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "ldr x21, [x25, #0x98]\n"
- "smlal2 v8.4s, v28.8h, v1.8h\n"
- "ldr x14, [x25, #0xa0]\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
- "ldr x11, [x25, #0xa8]\n"
- "smlal2 v9.4s, v23.8h, v1.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x5, [x20, #0x70]\n"
+ "ldr x11, [x20, #0x78]\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x15, x24]\n"
+ "ssubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "ldr d0, [x23, #0x28]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v27.8h, v1.8h\n"
+ "ldr x12, [x20, #0x80]\n"
+ "ldr x26, [x20, #0x88]\n"
+ "smlal2 v7.4s, v28.8h, v1.8h\n"
"smlal v15.4s, v27.4h, v2.4h\n"
- "ldr x24, [x25, #0xb0]\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "ldr d27, [x0, x10]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal v18.4s, v25.4h, v2.4h\n"
- "ldr x0, [x25, #0xb8]\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr x15, [x25, #0xc0]\n"
- "smlal v11.4s, v23.4h, v2.4h\n"
- "ldr x9, [x25, #0xc8]\n"
- "smlal2 v8.4s, v23.8h, v2.8h\n"
- "ldr x27, [x25, #0xd0]\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
- "ldr x28, [x25, #0xd8]\n"
- "smlal2 v9.4s, v31.8h, v2.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x14, [x20, #0x90]\n"
+ "ldr x15, [x20, #0x98]\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x19, x24]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal v10.4s, v23.4h, v2.4h\n"
+ "ldr d1, [x23, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "ldr x21, [x20, #0xa0]\n"
+ "ldr x2, [x20, #0xa8]\n"
+ "smlal2 v7.4s, v23.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "ldr q6, [x2, #0x0]\n"
- "smlal2 v20.4s, v25.8h, v3.8h\n"
- "ldr d25, [x7, x10]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "ldr x12, [x25, #0xe0]\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "ldr q21, [x5, #0x0]\n"
- "smlal v11.4s, v31.4h, v3.4h\n"
- "ldr q17, [x2, #0x10]\n"
- "add x2, x2, #0x20\n"
- "smlal2 v8.4s, v31.8h, v3.8h\n"
- "ldr q14, [x5, #0x10]\n"
- "add x5, x5, #0x20\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
- "smlal2 v9.4s, v30.8h, v3.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ldr x13, [x20, #0xb0]\n"
+ "ldr x9, [x20, #0xb8]\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x27, x24]\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v10.4s, v31.4h, v3.4h\n"
+ "ldr d2, [x23, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "ldr x19, [x20, #0xc0]\n"
+ "ldr x28, [x20, #0xc8]\n"
+ "smlal2 v7.4s, v31.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v20.4s, v24.8h, v4.8h\n"
- "ldr d24, [x26, x10]\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "ldr x7, [x25, #0xe8]\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "ldr d27, [x23, x10]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal v11.4s, v30.4h, v4.4h\n"
- "ldr x26, [x25, #0xf0]\n"
- "smlal2 v8.4s, v30.8h, v4.8h\n"
- "smlal v10.4s, v26.4h, v4.4h\n"
- "smlal2 v9.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ldr x6, [x20, #0xd0]\n"
+ "ldr x27, [x20, #0xd8]\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x5, x24]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal v10.4s, v30.4h, v4.4h\n"
+ "ldr d3, [x23, #0x40]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x11, x24]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v7.4s, v30.8h, v4.8h\n"
"smlal v15.4s, v29.4h, v0.4h\n"
- "smlal2 v20.4s, v29.8h, v0.8h\n"
- "smlal v18.4s, v28.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "smlal v11.4s, v22.4h, v0.4h\n"
- "smlal2 v8.4s, v22.8h, v0.8h\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
- "smlal2 v9.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ldr x11, [x20, #0xe0]\n"
+ "ldr x17, [x20, #0xe8]\n"
+ "smlal2 v16.4s, v29.8h, v0.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x23, #0x48]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal v10.4s, v22.4h, v0.4h\n"
+ "ldr x5, [x20, #0xf0]\n"
+ "ldr q12, [x10, #0x0]\n"
+ "smlal v6.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v28.8h, v0.8h\n"
+ "ldr q19, [x1, #0x0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
"smlal v15.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x10]\n"
- "ssubl v28.8h, v28.8b, v7.8b\n"
- "smlal v18.4s, v23.4h, v1.4h\n"
- "ldr x23, [x25, #0xf8]\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v1.4h\n"
- "smlal2 v8.4s, v25.8h, v1.8h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
- "smlal2 v9.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr q29, [x1, #0x10]\n"
+ "subs x3, x3, #0x1\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "ldr d28, [x26, x24]\n"
+ "ldr d0, [x23, #0x50]\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "smlal v10.4s, v25.4h, v1.4h\n"
+ "ssubl v28.8h, v28.8b, v9.8b\n"
+ "ldr x25, [x20, #0xf8]\n"
+ "smlal v6.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "add x10, x10, #0x20\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
"smlal v15.4s, v23.4h, v2.4h\n"
- "smlal2 v20.4s, v23.8h, v2.8h\n"
- "ldr d23, [x20, x10]\n"
- "ssubl v23.8h, v23.8b, v7.8b\n"
- "smlal v18.4s, v31.4h, v2.4h\n"
- "ldr x22, [x25, #0x100]\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v8.4s, v24.8h, v2.8h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v9.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
+ "add x1, x1, #0x20\n"
+ "smlal2 v16.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x12, x24]\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "smlal v10.4s, v24.4h, v2.4h\n"
+ "ldr d1, [x23, #0x58]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "ldr x26, [x20, #0x100]\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
"smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v20.4s, v31.8h, v3.8h\n"
- "ldr d31, [x13, x10]\n"
- "ssubl v31.8h, v31.8b, v7.8b\n"
- "smlal v18.4s, v30.4h, v3.4h\n"
- "ldr x20, [x25, #0x108]\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v8.4s, v27.8h, v3.8h\n"
- "smlal v10.4s, v23.4h, v3.4h\n"
- "smlal2 v9.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v16.4s, v31.8h, v3.8h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "ldr d31, [x14, x24]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "ldr d2, [x23, #0x60]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "ldr x12, [x20, #0x108]\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
"smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v20.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x10]\n"
- "ssubl v30.8h, v30.8b, v7.8b\n"
- "smlal v18.4s, v26.4h, v4.4h\n"
- "ldr x13, [x25, #0x110]\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x10]\n"
- "ssubl v26.8h, v26.8b, v7.8b\n"
- "smlal v11.4s, v23.4h, v4.4h\n"
- "ldr x21, [x25, #0x118]\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v16.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x15, x24]\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "ssubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal v10.4s, v23.4h, v4.4h\n"
+ "ldr d3, [x23, #0x68]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x21, x24]\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "smlal2 v7.4s, v23.8h, v4.8h\n"
"smlal v15.4s, v22.4h, v0.4h\n"
- "smlal2 v20.4s, v22.8h, v0.8h\n"
- "ldr d22, [x0, x10]\n"
- "ssubl v22.8h, v22.8b, v7.8b\n"
- "smlal v18.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v8.4s, v31.8h, v0.8h\n"
- "smlal v10.4s, v30.4h, v0.4h\n"
- "smlal2 v9.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ldr x14, [x20, #0x110]\n"
+ "ldr x21, [x20, #0x118]\n"
+ "smlal2 v16.4s, v22.8h, v0.8h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x23, #0x70]\n"
+ "ldr d22, [x9, x24]\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "smlal v10.4s, v31.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
"smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v20.4s, v25.8h, v1.8h\n"
- "ldr d25, [x11, x10]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal v18.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "smlal2 v8.4s, v30.8h, v1.8h\n"
- "smlal v10.4s, v26.4h, v1.4h\n"
- "smlal2 v9.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v16.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x2, x24]\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal v10.4s, v30.4h, v1.4h\n"
+ "ldr d0, [x23, #0x78]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v20.4s, v24.8h, v2.8h\n"
- "ldr d24, [x24, x10]\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v18.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v8.4s, v26.8h, v2.8h\n"
- "smlal v10.4s, v25.4h, v2.4h\n"
- "smlal2 v9.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v16.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x13, x24]\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal v10.4s, v26.4h, v2.4h\n"
+ "ldr d1, [x23, #0x80]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
"smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "ldr d27, [x15, x10]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal v18.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "smlal v10.4s, v24.4h, v3.4h\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr d27, [x19, x24]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "ldr d2, [x23, #0x88]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v8.4s, v23.8h, v3.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
"smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v20.4s, v23.8h, v4.8h\n"
- "ldr d23, [x9, x10]\n"
- "ssubl v23.8h, v23.8b, v7.8b\n"
- "smlal v18.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d28, [x12, x10]\n"
- "ssubl v28.8h, v28.8b, v7.8b\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "smlal v10.4s, v22.4h, v4.4h\n"
- "smlal2 v9.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v16.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x28, x24]\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "ldr d3, [x23, #0x90]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v22.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x11, x24]\n"
+ "ssubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
"smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ldr d31, [x27, x10]\n"
- "ssubl v31.8h, v31.8b, v7.8b\n"
- "smlal v18.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v8.4s, v27.8h, v0.8h\n"
- "smlal v10.4s, v23.4h, v0.4h\n"
- "smlal2 v9.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x6, x24]\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal v10.4s, v27.4h, v0.4h\n"
+ "ldr d4, [x23, #0x98]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v20.4s, v30.8h, v1.8h\n"
- "ldr d30, [x28, x10]\n"
- "ssubl v30.8h, v30.8b, v7.8b\n"
- "smlal v18.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "smlal v10.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x27, x24]\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "ssubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "ldr d0, [x23, #0xa0]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v31.4h, v1.4h\n"
+ "smlal2 v8.4s, v26.8h, v1.8h\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
"smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v20.4s, v26.8h, v2.8h\n"
- "ldr d26, [x7, x10]\n"
- "ssubl v26.8h, v26.8b, v7.8b\n"
- "smlal v18.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "smlal v10.4s, v30.4h, v2.4h\n"
- "smlal2 v9.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v16.4s, v26.8h, v2.8h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "ldr d26, [x17, x24]\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "ldr d1, [x23, #0xa8]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v20.4s, v25.8h, v3.8h\n"
- "ldr d25, [x26, x10]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "smlal v10.4s, v28.4h, v3.4h\n"
- "smlal2 v9.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d25, [x5, x24]\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "ldr d2, [x23, #0xb0]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v28.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v20.4s, v24.8h, v4.8h\n"
- "ldr d24, [x23, x10]\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v18.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "smlal v10.4s, v26.4h, v4.4h\n"
- "smlal2 v9.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "add x3, x3, #0xc8\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x25, x24]\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "ldr d3, [x23, #0xb8]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v7.4s, v28.8h, v4.8h\n"
"smlal v15.4s, v27.4h, v0.4h\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "smlal2 v20.4s, v27.8h, v0.8h\n"
- "ldr d27, [x22, x10]\n"
- "smlal v18.4s, v23.4h, v0.4h\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
- "ldr d25, [x20, x10]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v9.4s, v24.8h, v0.8h\n"
+ "smlal2 v16.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x26, x24]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v8.4s, v22.8h, v4.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x23, #0xc0]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "add x23, x23, #0xc8\n"
+ "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal2 v7.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x12, x24]\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "smlal2 v8.4s, v23.8h, v0.8h\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
"smlal v15.4s, v23.4h, v1.4h\n"
- "smlal2 v20.4s, v23.8h, v1.8h\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "ldr d24, [x13, x10]\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v27.4h, v1.4h\n"
- "smlal2 v9.4s, v27.8h, v1.8h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x14, x24]\n"
+ "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal2 v8.4s, v31.8h, v1.8h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
"smlal v15.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "smlal v18.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "smlal v11.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x10]\n"
- "add x10, x10, #0x8\n"
- "smlal v10.4s, v25.4h, v2.4h\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x24]\n"
+ "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v8.4s, v30.8h, v2.8h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "add x24, x24, #0x8\n"
"smlal v15.4s, v30.4h, v3.4h\n"
- "smlal2 v20.4s, v30.8h, v3.8h\n"
- "smlal v18.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "smlal v10.4s, v24.4h, v3.4h\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v16.4s, v30.8h, v3.8h\n"
+ "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
"smlal v15.4s, v28.4h, v4.4h\n"
- "smlal2 v20.4s, v28.8h, v4.8h\n"
- "smlal v18.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "smlal v10.4s, v27.4h, v4.4h\n"
- "smlal2 v9.4s, v27.8h, v4.8h\n"
- "sqrdmulh v15.4s, v15.4s, v6.4s\n"
- "sqrdmulh v20.4s, v20.4s, v17.4s\n"
- "sqrdmulh v18.4s, v18.4s, v6.4s\n"
- "sqrdmulh v5.4s, v5.4s, v17.4s\n"
- "and v1.16b, v15.16b, v21.16b\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "and v29.16b, v20.16b, v14.16b\n"
- "and v3.16b, v18.16b, v21.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v2.16b, v5.16b, v14.16b\n"
- "sqrdmulh v11.4s, v11.4s, v6.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "sqdmulh v15.4s, v15.4s, v12.4s\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
+ "sqdmulh v17.4s, v17.4s, v12.4s\n"
+ "smlal2 v16.4s, v28.8h, v4.8h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "sqdmulh v10.4s, v10.4s, v12.4s\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "sqdmulh v6.4s, v6.4s, v12.4s\n"
+ "and v23.16b, v15.16b, v19.16b\n"
+ "sqdmulh v16.4s, v16.4s, v20.4s\n"
+ "and v22.16b, v17.16b, v19.16b\n"
+ "sqdmulh v8.4s, v8.4s, v20.4s\n"
+ "and v21.16b, v10.16b, v19.16b\n"
+ "sqdmulh v7.4s, v7.4s, v20.4s\n"
+ "and v26.16b, v6.16b, v19.16b\n"
+ "sqdmulh v5.4s, v5.4s, v20.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v2.16b, v8.16b, v29.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v7.16b, v29.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v25.16b, v5.16b, v29.16b\n"
+ "sqadd v15.4s, v15.4s, v23.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
"sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v1.4s\n"
- "sqrdmulh v10.4s, v10.4s, v6.4s\n"
- "and v0.16b, v11.16b, v21.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sqadd v18.4s, v18.4s, v3.4s\n"
- "sqadd v5.4s, v5.4s, v2.4s\n"
- "and v27.16b, v8.16b, v14.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "add v15.4s, v15.4s, v19.4s\n"
- "srshl v20.4s, v20.4s, v14.4s\n"
- "srshl v18.4s, v18.4s, v21.4s\n"
- "srshl v5.4s, v5.4s, v14.4s\n"
- "smin v15.4s, v15.4s, v12.4s\n"
- "add v20.4s, v20.4s, v19.4s\n"
- "add v18.4s, v18.4s, v19.4s\n"
- "smax v15.4s, v15.4s, v16.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smin v18.4s, v18.4s, v12.4s\n"
- "add v5.4s, v5.4s, v19.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "uzp1 v15.16b, v15.16b, v20.16b\n"
- "sqadd v11.4s, v11.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v17.4s, v17.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v2.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v7.4s, v7.4s, v3.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "sqadd v5.4s, v5.4s, v25.4s\n"
+ "srshl v16.4s, v16.4s, v29.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v7.4s, v7.4s, v29.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v15.8h, v16.4s\n"
+ "sqxtn2 v17.8h, v8.4s\n"
+ "sqxtn2 v10.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v15.8h, v15.8h, v18.8h\n"
+ "sqadd v17.8h, v17.8h, v18.8h\n"
+ "sqadd v10.8h, v10.8h, v18.8h\n"
+ "sqadd v6.8h, v6.8h, v18.8h\n"
+ "smax v15.8h, v15.8h, v11.8h\n"
+ "smax v17.8h, v17.8h, v11.8h\n"
+ "smax v10.8h, v10.8h, v11.8h\n"
+ "smax v6.8h, v6.8h, v11.8h\n"
+ "smin v15.8h, v15.8h, v13.8h\n"
+ "smin v17.8h, v17.8h, v13.8h\n"
+ "smin v10.8h, v10.8h, v13.8h\n"
+ "smin v6.8h, v6.8h, v13.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x17, x1]\n"
- "smax v5.4s, v5.4s, v16.4s\n"
- "sqadd v8.4s, v8.4s, v27.4s\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "and v30.16b, v10.16b, v21.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "uzp1 v18.16b, v18.16b, v5.16b\n"
- "add v11.4s, v11.4s, v19.4s\n"
- "srshl v8.4s, v8.4s, v14.4s\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
- "str d18, [x16, x1]\n"
- "smin v11.4s, v11.4s, v12.4s\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "add v8.4s, v8.4s, v19.4s\n"
- "sqadd v10.4s, v10.4s, v30.4s\n"
- "smax v11.4s, v11.4s, v16.4s\n"
- "smin v8.4s, v8.4s, v12.4s\n"
- "and v6.16b, v9.16b, v14.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "smax v8.4s, v8.4s, v16.4s\n"
- "srshl v10.4s, v10.4s, v21.4s\n"
- "uzp1 v11.16b, v11.16b, v8.16b\n"
- "add v10.4s, v10.4s, v19.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "str d11, [x6, x1]\n"
- "smin v10.4s, v10.4s, v12.4s\n"
- "sqadd v9.4s, v9.4s, v6.4s\n"
- "smax v10.4s, v10.4s, v16.4s\n"
- "srshl v9.4s, v9.4s, v14.4s\n"
- "add v9.4s, v9.4s, v19.4s\n"
- "smin v9.4s, v9.4s, v12.4s\n"
- "smax v9.4s, v9.4s, v16.4s\n"
- "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d15, [x16, x22]\n"
"uzp1 v10.16b, v10.16b, v10.16b\n"
- "str d10, [x8, x1]\n"
- "add x1, x1, #0x8\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x12, #0x0]\n"
- "mov v18.16b, v15.16b\n"
- "ldr q20, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
- "mov v11.16b, v15.16b\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d17, [x8, x22]\n"
+ "str d10, [x4, x22]\n"
+ "str d6, [x7, x22]\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x19, #0x0]\n"
+ "add x22, x22, #0x8\n"
+ "ldr q16, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d0, [x23, #0x0]\n"
+ "ldr d1, [x23, #0x8]\n"
+ "ldr d2, [x23, #0x10]\n"
+ "mov v17.16b, v15.16b\n"
+ "mov v8.16b, v16.16b\n"
+ "ldr d3, [x23, #0x18]\n"
+ "ldr d4, [x23, #0x20]\n"
"mov v10.16b, v15.16b\n"
- "ldr d0, [x3, #0x0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "mov v5.16b, v20.16b\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v8.16b, v20.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "mov v9.16b, v20.16b\n"
- "ldr d3, [x3, #0x18]\n"
- "ldr d4, [x3, #0x20]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ldp x28, x27, [x25, #0x0]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "ldp x26, x13, [x25, #0x10]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ldp x24, x23, [x25, #0x20]\n"
- "ldp x22, x21, [x25, #0x30]\n"
- "ldp x20, x0, [x25, #0x40]\n"
- "ldr d31, [x28, x10]\n"
- "ssubl v31.8h, v31.8b, v7.8b\n"
- "ldr d30, [x27, x10]\n"
- "ldr d29, [x26, x10]\n"
- "ssubl v30.8h, v30.8b, v7.8b\n"
- "ldr d28, [x13, x10]\n"
- "ldr d27, [x24, x10]\n"
- "ssubl v29.8h, v29.8b, v7.8b\n"
- "ldr d23, [x23, x10]\n"
- "ssubl v28.8h, v28.8b, v7.8b\n"
- "ldr d25, [x22, x10]\n"
- "ldr d24, [x21, x10]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "ldr d26, [x20, x10]\n"
- "ssubl v23.8h, v23.8b, v7.8b\n"
- "ldr d22, [x0, x10]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "ssubl v26.8h, v26.8b, v7.8b\n"
- "ssubl v22.8h, v22.8b, v7.8b\n"
+ "mov v7.16b, v16.16b\n"
+ "ldp x28, x6, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "mov v6.16b, v15.16b\n"
+ "mov v5.16b, v16.16b\n"
+ "ldp x5, x2, [x20, #0x20]\n"
+ "ldp x27, x21, [x20, #0x30]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldp x12, x19, [x20, #0x40]\n"
+ "ldr d31, [x28, x24]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "ldr d30, [x6, x24]\n"
+ "ldr d29, [x26, x24]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "ldr d28, [x25, x24]\n"
+ "ldr d27, [x5, x24]\n"
+ "ssubl v30.8h, v30.8b, v9.8b\n"
+ "ssubl v29.8h, v29.8b, v9.8b\n"
+ "ldr d23, [x2, x24]\n"
+ "ldr d25, [x27, x24]\n"
+ "ssubl v28.8h, v28.8b, v9.8b\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "ldr d24, [x21, x24]\n"
+ "ldr d26, [x12, x24]\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "ldr d22, [x19, x24]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
"bgt 1b\n"
"2:" // Tail
"smlal v15.4s, v31.4h, v0.4h\n"
- "ldr x20, [x25, #0x50]\n"
- "tst x4, #0x7\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ldr x28, [x25, #0x58]\n"
- "smlal v18.4s, v30.4h, v0.4h\n"
- "ldr x0, [x25, #0x60]\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x10]\n"
- "ssubl v31.8h, v31.8b, v7.8b\n"
- "smlal v11.4s, v29.4h, v0.4h\n"
- "ldr x7, [x25, #0x68]\n"
- "smlal2 v8.4s, v29.8h, v0.8h\n"
- "ldr x26, [x25, #0x70]\n"
- "smlal v10.4s, v28.4h, v0.4h\n"
- "ldr x23, [x25, #0x78]\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "ldr x19, [x20, #0x50]\n"
+ "ldr d31, [x19, x24]\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal v10.4s, v29.4h, v0.4h\n"
+ "ldr x15, [x20, #0x58]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v6.4s, v28.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x20, #0x60]\n"
+ "ldr x27, [x20, #0x68]\n"
+ "smlal2 v7.4s, v29.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "ldr x20, [x25, #0x80]\n"
- "smlal2 v20.4s, v30.8h, v1.8h\n"
- "ldr d30, [x28, x10]\n"
- "ssubl v30.8h, v30.8b, v7.8b\n"
- "smlal v18.4s, v27.4h, v1.4h\n"
- "ldr x22, [x25, #0x88]\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "ldr x13, [x25, #0x90]\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "ldr x21, [x25, #0x98]\n"
- "smlal2 v8.4s, v28.8h, v1.8h\n"
- "ldr x14, [x25, #0xa0]\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
- "ldr x11, [x25, #0xa8]\n"
- "smlal2 v9.4s, v23.8h, v1.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x5, [x20, #0x70]\n"
+ "ldr x11, [x20, #0x78]\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x15, x24]\n"
+ "ssubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "ldr d0, [x23, #0x28]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v27.8h, v1.8h\n"
+ "ldr x12, [x20, #0x80]\n"
+ "ldr x26, [x20, #0x88]\n"
+ "smlal2 v7.4s, v28.8h, v1.8h\n"
"smlal v15.4s, v27.4h, v2.4h\n"
- "ldr x24, [x25, #0xb0]\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "ldr d27, [x0, x10]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal v18.4s, v25.4h, v2.4h\n"
- "ldr x0, [x25, #0xb8]\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr x15, [x25, #0xc0]\n"
- "smlal v11.4s, v23.4h, v2.4h\n"
- "ldr x9, [x25, #0xc8]\n"
- "smlal2 v8.4s, v23.8h, v2.8h\n"
- "ldr x27, [x25, #0xd0]\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
- "ldr x28, [x25, #0xd8]\n"
- "smlal2 v9.4s, v31.8h, v2.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x14, [x20, #0x90]\n"
+ "ldr x15, [x20, #0x98]\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x19, x24]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal v10.4s, v23.4h, v2.4h\n"
+ "ldr d1, [x23, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "ldr x21, [x20, #0xa0]\n"
+ "ldr x2, [x20, #0xa8]\n"
+ "smlal2 v7.4s, v23.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "ldr x12, [x25, #0xe0]\n"
- "smlal2 v20.4s, v25.8h, v3.8h\n"
- "ldr d25, [x7, x10]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "ldr x7, [x25, #0xe8]\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "ldr q6, [x2, #0x0]\n"
- "smlal v11.4s, v31.4h, v3.4h\n"
- "ldr q21, [x5, #0x0]\n"
- "smlal2 v8.4s, v31.8h, v3.8h\n"
- "ldr q17, [x2, #0x10]\n"
- "add x2, x2, #0x20\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
- "ldr q14, [x5, #0x10]\n"
- "add x5, x5, #0x20\n"
- "smlal2 v9.4s, v30.8h, v3.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ldr x13, [x20, #0xb0]\n"
+ "ldr x9, [x20, #0xb8]\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x27, x24]\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v10.4s, v31.4h, v3.4h\n"
+ "ldr d2, [x23, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "ldr x19, [x20, #0xc0]\n"
+ "ldr x28, [x20, #0xc8]\n"
+ "smlal2 v7.4s, v31.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v20.4s, v24.8h, v4.8h\n"
- "ldr d24, [x26, x10]\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "ldr x26, [x25, #0xf0]\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "ldr d27, [x23, x10]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal v11.4s, v30.4h, v4.4h\n"
- "ldr x23, [x25, #0xf8]\n"
- "smlal2 v8.4s, v30.8h, v4.8h\n"
- "smlal v10.4s, v26.4h, v4.4h\n"
- "smlal2 v9.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ldr x6, [x20, #0xd0]\n"
+ "ldr x27, [x20, #0xd8]\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x5, x24]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal v10.4s, v30.4h, v4.4h\n"
+ "ldr d3, [x23, #0x40]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x11, x24]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v7.4s, v30.8h, v4.8h\n"
"smlal v15.4s, v29.4h, v0.4h\n"
- "smlal2 v20.4s, v29.8h, v0.8h\n"
- "smlal v18.4s, v28.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "smlal v11.4s, v22.4h, v0.4h\n"
- "smlal2 v8.4s, v22.8h, v0.8h\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
- "smlal2 v9.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ldr x11, [x20, #0xe0]\n"
+ "ldr x17, [x20, #0xe8]\n"
+ "smlal2 v16.4s, v29.8h, v0.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x23, #0x48]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal v10.4s, v22.4h, v0.4h\n"
+ "ldr x5, [x20, #0xf0]\n"
+ "ldr x25, [x20, #0xf8]\n"
+ "smlal v6.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v28.8h, v0.8h\n"
+ "ldr q12, [x10, #0x0]\n"
+ "ldr q19, [x1, #0x0]\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
"smlal v15.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x10]\n"
- "ssubl v28.8h, v28.8b, v7.8b\n"
- "smlal v18.4s, v23.4h, v1.4h\n"
- "ldr x22, [x25, #0x100]\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v1.4h\n"
- "smlal2 v8.4s, v25.8h, v1.8h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
- "smlal2 v9.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr q20, [x10, #0x10]\n"
+ "ldr q29, [x1, #0x10]\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "ldr d28, [x26, x24]\n"
+ "ldr d0, [x23, #0x50]\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "smlal v10.4s, v25.4h, v1.4h\n"
+ "ssubl v28.8h, v28.8b, v9.8b\n"
+ "ldr x26, [x20, #0x100]\n"
+ "smlal v6.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "tst x0, #0x7\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
"smlal v15.4s, v23.4h, v2.4h\n"
- "smlal2 v20.4s, v23.8h, v2.8h\n"
- "ldr d23, [x20, x10]\n"
- "ssubl v23.8h, v23.8b, v7.8b\n"
- "smlal v18.4s, v31.4h, v2.4h\n"
- "ldr x20, [x25, #0x108]\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v8.4s, v24.8h, v2.8h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v9.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
+ "add x10, x10, #0x20\n"
+ "add x1, x1, #0x20\n"
+ "smlal2 v16.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x12, x24]\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "smlal v10.4s, v24.4h, v2.4h\n"
+ "ldr d1, [x23, #0x58]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "ldr x12, [x20, #0x108]\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
"smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v20.4s, v31.8h, v3.8h\n"
- "ldr d31, [x13, x10]\n"
- "ssubl v31.8h, v31.8b, v7.8b\n"
- "smlal v18.4s, v30.4h, v3.4h\n"
- "ldr x13, [x25, #0x110]\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v8.4s, v27.8h, v3.8h\n"
- "smlal v10.4s, v23.4h, v3.4h\n"
- "smlal2 v9.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v16.4s, v31.8h, v3.8h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "ldr d31, [x14, x24]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "ldr d2, [x23, #0x60]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "ldr x14, [x20, #0x110]\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
"smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v20.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x10]\n"
- "ssubl v30.8h, v30.8b, v7.8b\n"
- "smlal v18.4s, v26.4h, v4.4h\n"
- "ldr x21, [x25, #0x118]\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x10]\n"
- "ssubl v26.8h, v26.8b, v7.8b\n"
- "smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v16.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x15, x24]\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "ssubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal v10.4s, v23.4h, v4.4h\n"
+ "ldr d3, [x23, #0x68]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x21, x24]\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "smlal2 v7.4s, v23.8h, v4.8h\n"
"smlal v15.4s, v22.4h, v0.4h\n"
- "smlal2 v20.4s, v22.8h, v0.8h\n"
- "ldr d22, [x0, x10]\n"
- "ssubl v22.8h, v22.8b, v7.8b\n"
- "smlal v18.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v8.4s, v31.8h, v0.8h\n"
- "smlal v10.4s, v30.4h, v0.4h\n"
- "smlal2 v9.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ldr x21, [x20, #0x118]\n"
+ "smlal2 v16.4s, v22.8h, v0.8h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x23, #0x70]\n"
+ "ldr d22, [x9, x24]\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "smlal v10.4s, v31.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
"smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v20.4s, v25.8h, v1.8h\n"
- "ldr d25, [x11, x10]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal v18.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "smlal2 v8.4s, v30.8h, v1.8h\n"
- "smlal v10.4s, v26.4h, v1.4h\n"
- "smlal2 v9.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v16.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x2, x24]\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal v10.4s, v30.4h, v1.4h\n"
+ "ldr d0, [x23, #0x78]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v20.4s, v24.8h, v2.8h\n"
- "ldr d24, [x24, x10]\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v18.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v8.4s, v26.8h, v2.8h\n"
- "smlal v10.4s, v25.4h, v2.4h\n"
- "smlal2 v9.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v16.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x13, x24]\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal v10.4s, v26.4h, v2.4h\n"
+ "ldr d1, [x23, #0x80]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
"smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "ldr d27, [x15, x10]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal v18.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "smlal v10.4s, v24.4h, v3.4h\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr d27, [x19, x24]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "ldr d2, [x23, #0x88]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v8.4s, v23.8h, v3.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
"smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v20.4s, v23.8h, v4.8h\n"
- "ldr d23, [x9, x10]\n"
- "ssubl v23.8h, v23.8b, v7.8b\n"
- "smlal v18.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d28, [x12, x10]\n"
- "ssubl v28.8h, v28.8b, v7.8b\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "smlal v10.4s, v22.4h, v4.4h\n"
- "smlal2 v9.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v16.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x28, x24]\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "ldr d3, [x23, #0x90]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v22.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x11, x24]\n"
+ "ssubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
"smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ldr d31, [x27, x10]\n"
- "ssubl v31.8h, v31.8b, v7.8b\n"
- "smlal v18.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v8.4s, v27.8h, v0.8h\n"
- "smlal v10.4s, v23.4h, v0.4h\n"
- "smlal2 v9.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x6, x24]\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal v10.4s, v27.4h, v0.4h\n"
+ "ldr d4, [x23, #0x98]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v20.4s, v30.8h, v1.8h\n"
- "ldr d30, [x28, x10]\n"
- "ssubl v30.8h, v30.8b, v7.8b\n"
- "smlal v18.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "smlal v10.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x27, x24]\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "ssubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "ldr d0, [x23, #0xa0]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v31.4h, v1.4h\n"
+ "smlal2 v8.4s, v26.8h, v1.8h\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
"smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v20.4s, v26.8h, v2.8h\n"
- "ldr d26, [x7, x10]\n"
- "ssubl v26.8h, v26.8b, v7.8b\n"
- "smlal v18.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "smlal v10.4s, v30.4h, v2.4h\n"
- "smlal2 v9.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v16.4s, v26.8h, v2.8h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "ldr d26, [x17, x24]\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "ldr d1, [x23, #0xa8]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v20.4s, v25.8h, v3.8h\n"
- "ldr d25, [x26, x10]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "smlal v10.4s, v28.4h, v3.4h\n"
- "smlal2 v9.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d25, [x5, x24]\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "ldr d2, [x23, #0xb0]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v28.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v20.4s, v24.8h, v4.8h\n"
- "ldr d24, [x23, x10]\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v18.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "smlal v10.4s, v26.4h, v4.4h\n"
- "smlal2 v9.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x25, x24]\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "ldr d3, [x23, #0xb8]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v7.4s, v28.8h, v4.8h\n"
"smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v20.4s, v27.8h, v0.8h\n"
- "ldr d27, [x22, x10]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal v18.4s, v23.4h, v0.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
- "ldr d25, [x20, x10]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v9.4s, v24.8h, v0.8h\n"
+ "smlal2 v16.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x26, x24]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v8.4s, v22.8h, v4.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x23, #0xc0]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal2 v7.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x12, x24]\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "smlal2 v8.4s, v23.8h, v0.8h\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
"smlal v15.4s, v23.4h, v1.4h\n"
- "smlal2 v20.4s, v23.8h, v1.8h\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "ldr d24, [x13, x10]\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v27.4h, v1.4h\n"
- "smlal2 v9.4s, v27.8h, v1.8h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x14, x24]\n"
+ "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal2 v8.4s, v31.8h, v1.8h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
"smlal v15.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "smlal v18.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "smlal v11.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x10]\n"
- "add x10, x10, #0x8\n"
- "smlal v10.4s, v25.4h, v2.4h\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x24]\n"
+ "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v8.4s, v30.8h, v2.8h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "add x24, x24, #0x8\n"
"smlal v15.4s, v30.4h, v3.4h\n"
- "smlal2 v20.4s, v30.8h, v3.8h\n"
- "smlal v18.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "smlal v10.4s, v24.4h, v3.4h\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v16.4s, v30.8h, v3.8h\n"
+ "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
"smlal v15.4s, v28.4h, v4.4h\n"
- "smlal2 v20.4s, v28.8h, v4.8h\n"
- "smlal v18.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "smlal v10.4s, v27.4h, v4.4h\n"
- "smlal2 v9.4s, v27.8h, v4.8h\n"
- "sqrdmulh v15.4s, v15.4s, v6.4s\n"
- "sqrdmulh v20.4s, v20.4s, v17.4s\n"
- "sqrdmulh v18.4s, v18.4s, v6.4s\n"
- "sqrdmulh v5.4s, v5.4s, v17.4s\n"
- "and v1.16b, v15.16b, v21.16b\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "and v29.16b, v20.16b, v14.16b\n"
- "and v3.16b, v18.16b, v21.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v2.16b, v5.16b, v14.16b\n"
- "sqrdmulh v11.4s, v11.4s, v6.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "sqdmulh v15.4s, v15.4s, v12.4s\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
+ "sqdmulh v17.4s, v17.4s, v12.4s\n"
+ "smlal2 v16.4s, v28.8h, v4.8h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "sqdmulh v10.4s, v10.4s, v12.4s\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "sqdmulh v6.4s, v6.4s, v12.4s\n"
+ "and v23.16b, v15.16b, v19.16b\n"
+ "sqdmulh v16.4s, v16.4s, v20.4s\n"
+ "and v22.16b, v17.16b, v19.16b\n"
+ "sqdmulh v8.4s, v8.4s, v20.4s\n"
+ "and v21.16b, v10.16b, v19.16b\n"
+ "sqdmulh v7.4s, v7.4s, v20.4s\n"
+ "and v26.16b, v6.16b, v19.16b\n"
+ "sqdmulh v5.4s, v5.4s, v20.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v2.16b, v8.16b, v29.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v7.16b, v29.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v25.16b, v5.16b, v29.16b\n"
+ "sqadd v15.4s, v15.4s, v23.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
"sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v1.4s\n"
- "sqrdmulh v10.4s, v10.4s, v6.4s\n"
- "and v0.16b, v11.16b, v21.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sqadd v18.4s, v18.4s, v3.4s\n"
- "sqadd v5.4s, v5.4s, v2.4s\n"
- "and v27.16b, v8.16b, v14.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "add v15.4s, v15.4s, v19.4s\n"
- "srshl v20.4s, v20.4s, v14.4s\n"
- "srshl v18.4s, v18.4s, v21.4s\n"
- "srshl v5.4s, v5.4s, v14.4s\n"
- "smin v15.4s, v15.4s, v12.4s\n"
- "add v20.4s, v20.4s, v19.4s\n"
- "add v18.4s, v18.4s, v19.4s\n"
- "smax v15.4s, v15.4s, v16.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smin v18.4s, v18.4s, v12.4s\n"
- "add v5.4s, v5.4s, v19.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "uzp1 v15.16b, v15.16b, v20.16b\n"
- "sqadd v11.4s, v11.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v17.4s, v17.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v2.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v7.4s, v7.4s, v3.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "sqadd v5.4s, v5.4s, v25.4s\n"
+ "srshl v16.4s, v16.4s, v29.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v7.4s, v7.4s, v29.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v15.8h, v16.4s\n"
+ "sqxtn2 v17.8h, v8.4s\n"
+ "sqxtn2 v10.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v15.8h, v15.8h, v18.8h\n"
+ "sqadd v17.8h, v17.8h, v18.8h\n"
+ "sqadd v10.8h, v10.8h, v18.8h\n"
+ "sqadd v6.8h, v6.8h, v18.8h\n"
+ "smax v15.8h, v15.8h, v11.8h\n"
+ "smax v17.8h, v17.8h, v11.8h\n"
+ "smax v10.8h, v10.8h, v11.8h\n"
+ "smax v6.8h, v6.8h, v11.8h\n"
+ "smin v15.8h, v15.8h, v13.8h\n"
+ "smin v17.8h, v17.8h, v13.8h\n"
+ "smin v10.8h, v10.8h, v13.8h\n"
+ "smin v6.8h, v6.8h, v13.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x17, x1]\n"
- "smax v5.4s, v5.4s, v16.4s\n"
- "sqadd v8.4s, v8.4s, v27.4s\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "and v30.16b, v10.16b, v21.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "uzp1 v18.16b, v18.16b, v5.16b\n"
- "add v11.4s, v11.4s, v19.4s\n"
- "srshl v8.4s, v8.4s, v14.4s\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
- "str d18, [x16, x1]\n"
- "smin v11.4s, v11.4s, v12.4s\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "add v8.4s, v8.4s, v19.4s\n"
- "sqadd v10.4s, v10.4s, v30.4s\n"
- "smax v11.4s, v11.4s, v16.4s\n"
- "smin v8.4s, v8.4s, v12.4s\n"
- "and v6.16b, v9.16b, v14.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "smax v8.4s, v8.4s, v16.4s\n"
- "srshl v10.4s, v10.4s, v21.4s\n"
- "uzp1 v11.16b, v11.16b, v8.16b\n"
- "add v10.4s, v10.4s, v19.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "str d11, [x6, x1]\n"
- "smin v10.4s, v10.4s, v12.4s\n"
- "sqadd v9.4s, v9.4s, v6.4s\n"
- "smax v10.4s, v10.4s, v16.4s\n"
- "srshl v9.4s, v9.4s, v14.4s\n"
- "add v9.4s, v9.4s, v19.4s\n"
- "smin v9.4s, v9.4s, v12.4s\n"
- "smax v9.4s, v9.4s, v16.4s\n"
- "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d15, [x16, x22]\n"
"uzp1 v10.16b, v10.16b, v10.16b\n"
- "str d10, [x8, x1]\n"
- "add x1, x1, #0x8\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d17, [x8, x22]\n"
+ "str d10, [x4, x22]\n"
+ "str d6, [x7, x22]\n"
+ "add x22, x22, #0x8\n"
"beq 124f\n"
- "add x3, x3, #0xc8\n"
+ "add x23, x23, #0xc8\n"
"3:" // Oddments
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x4, #2, 5f\n"
- "ld1 { v15.4s }, [x12], #0x10\n"
- "tbz x4, #1, 4f\n"
- "ld1 { v20.d }[0], [x12], #0x8\n"
- "tbz x4, #0, 7f\n"
- "ld1 { v20.s }[2], [x12]\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x0, #2, 5f\n"
+ "ld1 { v15.4s }, [x19], #0x10\n"
+ "tbz x0, #1, 4f\n"
+ "ld1 { v16.d }[0], [x19], #0x8\n"
+ "tbz x0, #0, 7f\n"
+ "ld1 { v16.s }[2], [x19]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x4, #0, 7f\n"
- "ld1 { v20.s }[0], [x12]\n"
+ "tbz x0, #0, 7f\n"
+ "ld1 { v16.s }[0], [x19]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x4, #1, 6f\n"
- "ld1 { v15.d }[0], [x12], #0x8\n"
- "tbz x4, #0, 7f\n"
- "ld1 { v15.s }[2], [x12]\n"
+ "tbz x0, #1, 6f\n"
+ "ld1 { v15.d }[0], [x19], #0x8\n"
+ "tbz x0, #0, 7f\n"
+ "ld1 { v15.s }[2], [x19]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 7f\n"
- "ld1 { v15.s }[0], [x12]\n"
+ "tbz x0, #0, 7f\n"
+ "ld1 { v15.s }[0], [x19]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "mov v18.16b, v15.16b\n"
- "ldr d0, [x3, #0x0]\n"
- "mov v5.16b, v20.16b\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v11.16b, v15.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "mov v8.16b, v20.16b\n"
- "ldr d3, [x3, #0x18]\n"
+ "ldr d0, [x23, #0x0]\n"
+ "ldr d1, [x23, #0x8]\n"
+ "mov v17.16b, v15.16b\n"
+ "mov v8.16b, v16.16b\n"
+ "ldr d2, [x23, #0x10]\n"
+ "ldr d3, [x23, #0x18]\n"
"mov v10.16b, v15.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "mov v9.16b, v20.16b\n"
- "ldp x28, x27, [x25, #0x0]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "ldp x26, x13, [x25, #0x10]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "ldp x24, x23, [x25, #0x20]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "ldp x22, x21, [x25, #0x30]\n"
- "ldp x20, x0, [x25, #0x40]\n"
- "add x28, x28, x10\n"
- "add x27, x27, x10\n"
- "add x26, x26, x10\n"
- "add x13, x13, x10\n"
- "add x24, x24, x10\n"
- "add x23, x23, x10\n"
- "add x22, x22, x10\n"
- "add x21, x21, x10\n"
- "add x20, x20, x10\n"
- "add x0, x0, x10\n"
- "tbz x4, #2, 9f\n"
+ "mov v7.16b, v16.16b\n"
+ "ldr d4, [x23, #0x20]\n"
+ "ldp x28, x6, [x20, #0x0]\n"
+ "mov v6.16b, v15.16b\n"
+ "mov v5.16b, v16.16b\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x5, x2, [x20, #0x20]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldp x27, x21, [x20, #0x30]\n"
+ "ldp x12, x19, [x20, #0x40]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "add x28, x28, x24\n"
+ "add x6, x6, x24\n"
+ "add x26, x26, x24\n"
+ "add x25, x25, x24\n"
+ "add x5, x5, x24\n"
+ "add x2, x2, x24\n"
+ "add x27, x27, x24\n"
+ "add x21, x21, x24\n"
+ "add x12, x12, x24\n"
+ "add x19, x19, x24\n"
+ "tbz x0, #2, 9f\n"
"ld1 { v31.s }[0], [x28], #0x4\n"
- "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x6], #0x4\n"
"ld1 { v29.s }[0], [x26], #0x4\n"
- "ld1 { v28.s }[0], [x13], #0x4\n"
- "ld1 { v27.s }[0], [x24], #0x4\n"
- "ld1 { v23.s }[0], [x23], #0x4\n"
- "ld1 { v25.s }[0], [x22], #0x4\n"
+ "ld1 { v28.s }[0], [x25], #0x4\n"
+ "ld1 { v27.s }[0], [x5], #0x4\n"
+ "ld1 { v23.s }[0], [x2], #0x4\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
"ld1 { v24.s }[0], [x21], #0x4\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
- "ld1 { v22.s }[0], [x0], #0x4\n"
- "tbz x4, #1, 8f\n"
+ "ld1 { v26.s }[0], [x12], #0x4\n"
+ "ld1 { v22.s }[0], [x19], #0x4\n"
+ "tbz x0, #1, 8f\n"
"ld1 { v31.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x6], #0x2\n"
"ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x13], #0x2\n"
- "ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
- "ld1 { v25.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v27.h }[2], [x5], #0x2\n"
+ "ld1 { v23.h }[2], [x2], #0x2\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
"ld1 { v24.h }[2], [x21], #0x2\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
- "ld1 { v22.h }[2], [x0], #0x2\n"
- "tbz x4, #0, 11f\n"
+ "ld1 { v26.h }[2], [x12], #0x2\n"
+ "ld1 { v22.h }[2], [x19], #0x2\n"
+ "tbz x0, #0, 11f\n"
"ld1 { v31.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x6]\n"
"ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x13]\n"
- "ld1 { v27.b }[6], [x24]\n"
- "ld1 { v23.b }[6], [x23]\n"
- "ld1 { v25.b }[6], [x22]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v27.b }[6], [x5]\n"
+ "ld1 { v23.b }[6], [x2]\n"
+ "ld1 { v25.b }[6], [x27]\n"
"ld1 { v24.b }[6], [x21]\n"
- "ld1 { v26.b }[6], [x20]\n"
- "ld1 { v22.b }[6], [x0]\n"
+ "ld1 { v26.b }[6], [x12]\n"
+ "ld1 { v22.b }[6], [x19]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x4, #0, 11f\n"
+ "tbz x0, #0, 11f\n"
"ld1 { v31.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x6]\n"
"ld1 { v29.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x13]\n"
- "ld1 { v27.b }[4], [x24]\n"
- "ld1 { v23.b }[4], [x23]\n"
- "ld1 { v25.b }[4], [x22]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v27.b }[4], [x5]\n"
+ "ld1 { v23.b }[4], [x2]\n"
+ "ld1 { v25.b }[4], [x27]\n"
"ld1 { v24.b }[4], [x21]\n"
- "ld1 { v26.b }[4], [x20]\n"
- "ld1 { v22.b }[4], [x0]\n"
+ "ld1 { v26.b }[4], [x12]\n"
+ "ld1 { v22.b }[4], [x19]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x4, #1, 10f\n"
+ "tbz x0, #1, 10f\n"
"ld1 { v31.h }[0], [x28], #0x2\n"
- "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x6], #0x2\n"
"ld1 { v29.h }[0], [x26], #0x2\n"
- "ld1 { v28.h }[0], [x13], #0x2\n"
- "ld1 { v27.h }[0], [x24], #0x2\n"
- "ld1 { v23.h }[0], [x23], #0x2\n"
- "ld1 { v25.h }[0], [x22], #0x2\n"
+ "ld1 { v28.h }[0], [x25], #0x2\n"
+ "ld1 { v27.h }[0], [x5], #0x2\n"
+ "ld1 { v23.h }[0], [x2], #0x2\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
"ld1 { v24.h }[0], [x21], #0x2\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
- "ld1 { v22.h }[0], [x0], #0x2\n"
- "tbz x4, #0, 11f\n"
+ "ld1 { v26.h }[0], [x12], #0x2\n"
+ "ld1 { v22.h }[0], [x19], #0x2\n"
+ "tbz x0, #0, 11f\n"
"ld1 { v31.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x6]\n"
"ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x13]\n"
- "ld1 { v27.b }[2], [x24]\n"
- "ld1 { v23.b }[2], [x23]\n"
- "ld1 { v25.b }[2], [x22]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v27.b }[2], [x5]\n"
+ "ld1 { v23.b }[2], [x2]\n"
+ "ld1 { v25.b }[2], [x27]\n"
"ld1 { v24.b }[2], [x21]\n"
- "ld1 { v26.b }[2], [x20]\n"
- "ld1 { v22.b }[2], [x0]\n"
+ "ld1 { v26.b }[2], [x12]\n"
+ "ld1 { v22.b }[2], [x19]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 11f\n"
+ "tbz x0, #0, 11f\n"
"ld1 { v31.b }[0], [x28]\n"
- "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x6]\n"
"ld1 { v29.b }[0], [x26]\n"
- "ld1 { v28.b }[0], [x13]\n"
- "ld1 { v27.b }[0], [x24]\n"
- "ld1 { v23.b }[0], [x23]\n"
- "ld1 { v25.b }[0], [x22]\n"
+ "ld1 { v28.b }[0], [x25]\n"
+ "ld1 { v27.b }[0], [x5]\n"
+ "ld1 { v23.b }[0], [x2]\n"
+ "ld1 { v25.b }[0], [x27]\n"
"ld1 { v24.b }[0], [x21]\n"
- "ld1 { v26.b }[0], [x20]\n"
- "ld1 { v22.b }[0], [x0]\n"
+ "ld1 { v26.b }[0], [x12]\n"
+ "ld1 { v22.b }[0], [x19]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ldr x20, [x25, #0x50]\n"
- "ssubl v31.8h, v31.8b, v7.8b\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "ssubl v30.8h, v30.8b, v9.8b\n"
"smlal v15.4s, v31.4h, v0.4h\n"
- "ssubl v30.8h, v30.8b, v7.8b\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ssubl v29.8h, v29.8b, v7.8b\n"
- "ssubl v28.8h, v28.8b, v7.8b\n"
- "smlal v18.4s, v30.4h, v0.4h\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "ssubl v23.8h, v23.8b, v7.8b\n"
- "smlal v11.4s, v29.4h, v0.4h\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal2 v8.4s, v29.8h, v0.8h\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v28.4h, v0.4h\n"
- "ssubl v26.8h, v26.8b, v7.8b\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "ssubl v22.8h, v22.8b, v7.8b\n"
+ "ldr x19, [x20, #0x50]\n"
+ "ssubl v29.8h, v29.8b, v9.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "smlal v10.4s, v29.4h, v0.4h\n"
+ "ssubl v28.8h, v28.8b, v9.8b\n"
+ "add x19, x19, x24\n"
+ "smlal2 v7.4s, v29.8h, v0.8h\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v6.4s, v28.4h, v0.4h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v20.4s, v30.8h, v1.8h\n"
- "add x20, x20, x10\n"
- "smlal v18.4s, v27.4h, v1.4h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "smlal2 v8.4s, v28.8h, v1.8h\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
- "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "smlal2 v8.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "smlal2 v7.4s, v28.8h, v1.8h\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v6.4s, v23.4h, v1.4h\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
"smlal v15.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "smlal v18.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "smlal v11.4s, v23.4h, v2.4h\n"
- "smlal2 v8.4s, v23.8h, v2.8h\n"
- "tbz x4, #2, 13f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 12f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "smlal v10.4s, v23.4h, v2.4h\n"
+ "smlal2 v7.4s, v23.8h, v2.8h\n"
+ "tbz x0, #2, 13f\n"
+ "ld1 { v31.s }[0], [x19], #0x4\n"
+ "tbz x0, #1, 12f\n"
+ "ld1 { v31.h }[2], [x19], #0x2\n"
+ "tbz x0, #0, 15f\n"
+ "ld1 { v31.b }[6], [x19]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "tbz x0, #0, 15f\n"
+ "ld1 { v31.b }[4], [x19]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x4, #1, 14f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "tbz x0, #1, 14f\n"
+ "ld1 { v31.h }[0], [x19], #0x2\n"
+ "tbz x0, #0, 15f\n"
+ "ld1 { v31.b }[2], [x19]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "tbz x0, #0, 15f\n"
+ "ld1 { v31.b }[0], [x19]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "ldr x15, [x20, #0x58]\n"
+ "smlal v6.4s, v31.4h, v2.4h\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "ldr x28, [x25, #0x58]\n"
- "ssubl v31.8h, v31.8b, v7.8b\n"
- "smlal2 v20.4s, v25.8h, v3.8h\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "add x28, x28, x10\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
- "smlal2 v9.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v31.4h, v3.4h\n"
- "smlal2 v8.4s, v31.8h, v3.8h\n"
- "tbz x4, #2, 17f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "tbz x4, #1, 16f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "add x15, x15, x24\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "smlal v10.4s, v31.4h, v3.4h\n"
+ "smlal2 v7.4s, v31.8h, v3.8h\n"
+ "tbz x0, #2, 17f\n"
+ "ld1 { v30.s }[0], [x15], #0x4\n"
+ "tbz x0, #1, 16f\n"
+ "ld1 { v30.h }[2], [x15], #0x2\n"
+ "tbz x0, #0, 19f\n"
+ "ld1 { v30.b }[6], [x15]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "tbz x0, #0, 19f\n"
+ "ld1 { v30.b }[4], [x15]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x4, #1, 18f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "tbz x0, #1, 18f\n"
+ "ld1 { v30.h }[0], [x15], #0x2\n"
+ "tbz x0, #0, 19f\n"
+ "ld1 { v30.b }[2], [x15]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "tbz x0, #0, 19f\n"
+ "ld1 { v30.b }[0], [x15]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
+ "ssubl v30.8h, v30.8b, v9.8b\n"
+ "ldr x19, [x20, #0x60]\n"
+ "smlal v6.4s, v30.4h, v3.4h\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "ldr x0, [x25, #0x60]\n"
- "ssubl v30.8h, v30.8b, v7.8b\n"
- "smlal2 v20.4s, v24.8h, v4.8h\n"
- "add x0, x0, x10\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
- "smlal2 v9.4s, v30.8h, v3.8h\n"
- "tbz x4, #2, 21f\n"
- "ld1 { v27.s }[0], [x0], #0x4\n"
- "tbz x4, #1, 20f\n"
- "ld1 { v27.h }[2], [x0], #0x2\n"
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[6], [x0]\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "add x19, x19, x24\n"
+ "tbz x0, #2, 21f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x0, #1, 20f\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x0, #0, 23f\n"
+ "ld1 { v27.b }[6], [x19]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[4], [x0]\n"
+ "tbz x0, #0, 23f\n"
+ "ld1 { v27.b }[4], [x19]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
- "tbz x4, #1, 22f\n"
- "ld1 { v27.h }[0], [x0], #0x2\n"
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[2], [x0]\n"
+ "tbz x0, #1, 22f\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x0, #0, 23f\n"
+ "ld1 { v27.b }[2], [x19]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[0], [x0]\n"
+ "tbz x0, #0, 23f\n"
+ "ld1 { v27.b }[0], [x19]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "smlal v11.4s, v30.4h, v4.4h\n"
- "ldr d0, [x3, #0x28]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal2 v8.4s, v30.8h, v4.8h\n"
- "ldr x7, [x25, #0x68]\n"
- "smlal v10.4s, v26.4h, v4.4h\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "smlal2 v9.4s, v26.8h, v4.8h\n"
- "add x7, x7, x10\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "ldr d0, [x23, #0x28]\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v27.8h, v4.8h\n"
+ "smlal v10.4s, v30.4h, v4.4h\n"
+ "smlal2 v7.4s, v30.8h, v4.8h\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "ldr x27, [x20, #0x68]\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "add x27, x27, x24\n"
"smlal v15.4s, v29.4h, v0.4h\n"
- "smlal2 v20.4s, v29.8h, v0.8h\n"
- "smlal v18.4s, v28.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "smlal v11.4s, v22.4h, v0.4h\n"
- "smlal2 v8.4s, v22.8h, v0.8h\n"
- "tbz x4, #2, 25f\n"
- "ld1 { v25.s }[0], [x7], #0x4\n"
- "tbz x4, #1, 24f\n"
- "ld1 { v25.h }[2], [x7], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[6], [x7]\n"
+ "smlal2 v16.4s, v29.8h, v0.8h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal2 v8.4s, v28.8h, v0.8h\n"
+ "smlal v10.4s, v22.4h, v0.4h\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "tbz x0, #2, 25f\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "tbz x0, #1, 24f\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "tbz x0, #0, 27f\n"
+ "ld1 { v25.b }[6], [x27]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[4], [x7]\n"
+ "tbz x0, #0, 27f\n"
+ "ld1 { v25.b }[4], [x27]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x4, #1, 26f\n"
- "ld1 { v25.h }[0], [x7], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[2], [x7]\n"
+ "tbz x0, #1, 26f\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "tbz x0, #0, 27f\n"
+ "ld1 { v25.b }[2], [x27]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[0], [x7]\n"
+ "tbz x0, #0, 27f\n"
+ "ld1 { v25.b }[0], [x27]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x3, #0x30]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
- "ldr x26, [x25, #0x70]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "smlal2 v9.4s, v25.8h, v0.8h\n"
- "add x26, x26, x10\n"
+ "ldr d1, [x23, #0x30]\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldr x5, [x20, #0x70]\n"
+ "smlal v6.4s, v25.4h, v0.4h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "add x5, x5, x24\n"
"smlal v15.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v18.4s, v23.4h, v1.4h\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v1.4h\n"
- "smlal2 v8.4s, v25.8h, v1.8h\n"
- "tbz x4, #2, 29f\n"
- "ld1 { v24.s }[0], [x26], #0x4\n"
- "tbz x4, #1, 28f\n"
- "ld1 { v24.h }[2], [x26], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[6], [x26]\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "smlal v10.4s, v25.4h, v1.4h\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "tbz x0, #2, 29f\n"
+ "ld1 { v24.s }[0], [x5], #0x4\n"
+ "tbz x0, #1, 28f\n"
+ "ld1 { v24.h }[2], [x5], #0x2\n"
+ "tbz x0, #0, 31f\n"
+ "ld1 { v24.b }[6], [x5]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[4], [x26]\n"
+ "tbz x0, #0, 31f\n"
+ "ld1 { v24.b }[4], [x5]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x4, #1, 30f\n"
- "ld1 { v24.h }[0], [x26], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[2], [x26]\n"
+ "tbz x0, #1, 30f\n"
+ "ld1 { v24.h }[0], [x5], #0x2\n"
+ "tbz x0, #0, 31f\n"
+ "ld1 { v24.b }[2], [x5]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[0], [x26]\n"
+ "tbz x0, #0, 31f\n"
+ "ld1 { v24.b }[0], [x5]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x3, #0x38]\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
- "ldr x23, [x25, #0x78]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "smlal2 v9.4s, v24.8h, v1.8h\n"
- "add x23, x23, x10\n"
+ "ldr d2, [x23, #0x38]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ldr x11, [x20, #0x78]\n"
+ "smlal v6.4s, v24.4h, v1.4h\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "add x11, x11, x24\n"
"smlal v15.4s, v23.4h, v2.4h\n"
- "smlal2 v20.4s, v23.8h, v2.8h\n"
- "smlal v18.4s, v31.4h, v2.4h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v8.4s, v24.8h, v2.8h\n"
- "tbz x4, #2, 33f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 32f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "smlal2 v16.4s, v23.8h, v2.8h\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "smlal v10.4s, v24.4h, v2.4h\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "tbz x0, #2, 33f\n"
+ "ld1 { v27.s }[0], [x11], #0x4\n"
+ "tbz x0, #1, 32f\n"
+ "ld1 { v27.h }[2], [x11], #0x2\n"
+ "tbz x0, #0, 35f\n"
+ "ld1 { v27.b }[6], [x11]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "tbz x0, #0, 35f\n"
+ "ld1 { v27.b }[4], [x11]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x4, #1, 34f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "tbz x0, #1, 34f\n"
+ "ld1 { v27.h }[0], [x11], #0x2\n"
+ "tbz x0, #0, 35f\n"
+ "ld1 { v27.b }[2], [x11]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "tbz x0, #0, 35f\n"
+ "ld1 { v27.b }[0], [x11]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x3, #0x40]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "ldr x20, [x25, #0x80]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v9.4s, v27.8h, v2.8h\n"
- "add x20, x20, x10\n"
+ "ldr d3, [x23, #0x40]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "ldr x12, [x20, #0x80]\n"
+ "smlal v6.4s, v27.4h, v2.4h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "add x12, x12, x24\n"
"smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v20.4s, v31.8h, v3.8h\n"
- "smlal v18.4s, v30.4h, v3.4h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v8.4s, v27.8h, v3.8h\n"
- "tbz x4, #2, 37f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 36f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "smlal2 v16.4s, v31.8h, v3.8h\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "tbz x0, #2, 37f\n"
+ "ld1 { v23.s }[0], [x12], #0x4\n"
+ "tbz x0, #1, 36f\n"
+ "ld1 { v23.h }[2], [x12], #0x2\n"
+ "tbz x0, #0, 39f\n"
+ "ld1 { v23.b }[6], [x12]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x0, #0, 39f\n"
+ "ld1 { v23.b }[4], [x12]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x4, #1, 38f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x0, #1, 38f\n"
+ "ld1 { v23.h }[0], [x12], #0x2\n"
+ "tbz x0, #0, 39f\n"
+ "ld1 { v23.b }[2], [x12]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x0, #0, 39f\n"
+ "ld1 { v23.b }[0], [x12]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x3, #0x48]\n"
- "ssubl v23.8h, v23.8b, v7.8b\n"
- "smlal v10.4s, v23.4h, v3.4h\n"
- "ldr x22, [x25, #0x88]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "smlal2 v9.4s, v23.8h, v3.8h\n"
- "add x22, x22, x10\n"
+ "ldr d4, [x23, #0x48]\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x20, #0x88]\n"
+ "smlal v6.4s, v23.4h, v3.4h\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "add x26, x26, x24\n"
"smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v20.4s, v30.8h, v4.8h\n"
- "smlal v18.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
- "tbz x4, #2, 41f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 40f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "smlal2 v16.4s, v30.8h, v4.8h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "smlal v10.4s, v23.4h, v4.4h\n"
+ "smlal2 v7.4s, v23.8h, v4.8h\n"
+ "tbz x0, #2, 41f\n"
+ "ld1 { v28.s }[0], [x26], #0x4\n"
+ "tbz x0, #1, 40f\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "tbz x0, #0, 43f\n"
+ "ld1 { v28.b }[6], [x26]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "tbz x0, #0, 43f\n"
+ "ld1 { v28.b }[4], [x26]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
- "tbz x4, #1, 42f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "tbz x0, #1, 42f\n"
+ "ld1 { v28.h }[0], [x26], #0x2\n"
+ "tbz x0, #0, 43f\n"
+ "ld1 { v28.b }[2], [x26]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "tbz x0, #0, 43f\n"
+ "ld1 { v28.b }[0], [x26]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x3, #0x50]\n"
- "ssubl v28.8h, v28.8b, v7.8b\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
- "ldr x13, [x25, #0x90]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "add x13, x13, x10\n"
+ "ldr d0, [x23, #0x50]\n"
+ "ssubl v28.8h, v28.8b, v9.8b\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "ldr x14, [x20, #0x90]\n"
+ "smlal v6.4s, v28.4h, v4.4h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "add x14, x14, x24\n"
"smlal v15.4s, v22.4h, v0.4h\n"
- "smlal2 v20.4s, v22.8h, v0.8h\n"
- "smlal v18.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "tbz x4, #2, 45f\n"
- "ld1 { v31.s }[0], [x13], #0x4\n"
- "tbz x4, #1, 44f\n"
- "ld1 { v31.h }[2], [x13], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[6], [x13]\n"
+ "smlal2 v16.4s, v22.8h, v0.8h\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "tbz x0, #2, 45f\n"
+ "ld1 { v31.s }[0], [x14], #0x4\n"
+ "tbz x0, #1, 44f\n"
+ "ld1 { v31.h }[2], [x14], #0x2\n"
+ "tbz x0, #0, 47f\n"
+ "ld1 { v31.b }[6], [x14]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[4], [x13]\n"
+ "tbz x0, #0, 47f\n"
+ "ld1 { v31.b }[4], [x14]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x4, #1, 46f\n"
- "ld1 { v31.h }[0], [x13], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[2], [x13]\n"
+ "tbz x0, #1, 46f\n"
+ "ld1 { v31.h }[0], [x14], #0x2\n"
+ "tbz x0, #0, 47f\n"
+ "ld1 { v31.b }[2], [x14]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[0], [x13]\n"
+ "tbz x0, #0, 47f\n"
+ "ld1 { v31.b }[0], [x14]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "ldr x21, [x25, #0x98]\n"
- "ssubl v31.8h, v31.8b, v7.8b\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v8.4s, v31.8h, v0.8h\n"
- "add x21, x21, x10\n"
- "tbz x4, #2, 49f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 48f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "ldr x15, [x20, #0x98]\n"
+ "smlal v10.4s, v31.4h, v0.4h\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
+ "add x15, x15, x24\n"
+ "tbz x0, #2, 49f\n"
+ "ld1 { v30.s }[0], [x15], #0x4\n"
+ "tbz x0, #1, 48f\n"
+ "ld1 { v30.h }[2], [x15], #0x2\n"
+ "tbz x0, #0, 51f\n"
+ "ld1 { v30.b }[6], [x15]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "tbz x0, #0, 51f\n"
+ "ld1 { v30.b }[4], [x15]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x4, #1, 50f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "tbz x0, #1, 50f\n"
+ "ld1 { v30.h }[0], [x15], #0x2\n"
+ "tbz x0, #0, 51f\n"
+ "ld1 { v30.b }[2], [x15]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "tbz x0, #0, 51f\n"
+ "ld1 { v30.b }[0], [x15]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x3, #0x58]\n"
- "ssubl v30.8h, v30.8b, v7.8b\n"
- "smlal v10.4s, v30.4h, v0.4h\n"
- "ldr x14, [x25, #0xa0]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "smlal2 v9.4s, v30.8h, v0.8h\n"
- "add x14, x14, x10\n"
+ "ldr d1, [x23, #0x58]\n"
+ "ssubl v30.8h, v30.8b, v9.8b\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldr x21, [x20, #0xa0]\n"
+ "smlal v6.4s, v30.4h, v0.4h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "add x21, x21, x24\n"
"smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v20.4s, v25.8h, v1.8h\n"
- "smlal v18.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "smlal2 v8.4s, v30.8h, v1.8h\n"
- "tbz x4, #2, 53f\n"
- "ld1 { v26.s }[0], [x14], #0x4\n"
- "tbz x4, #1, 52f\n"
- "ld1 { v26.h }[2], [x14], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[6], [x14]\n"
+ "smlal2 v16.4s, v25.8h, v1.8h\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "smlal v10.4s, v30.4h, v1.4h\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "tbz x0, #2, 53f\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "tbz x0, #1, 52f\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "tbz x0, #0, 55f\n"
+ "ld1 { v26.b }[6], [x21]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[4], [x14]\n"
+ "tbz x0, #0, 55f\n"
+ "ld1 { v26.b }[4], [x21]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x4, #1, 54f\n"
- "ld1 { v26.h }[0], [x14], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[2], [x14]\n"
+ "tbz x0, #1, 54f\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "tbz x0, #0, 55f\n"
+ "ld1 { v26.b }[2], [x21]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[0], [x14]\n"
+ "tbz x0, #0, 55f\n"
+ "ld1 { v26.b }[0], [x21]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x3, #0x60]\n"
- "ssubl v26.8h, v26.8b, v7.8b\n"
- "smlal v10.4s, v26.4h, v1.4h\n"
- "ldr x11, [x25, #0xa8]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "smlal2 v9.4s, v26.8h, v1.8h\n"
- "add x11, x11, x10\n"
+ "ldr d2, [x23, #0x60]\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ldr x2, [x20, #0xa8]\n"
+ "smlal v6.4s, v26.4h, v1.4h\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "add x2, x2, x24\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v20.4s, v24.8h, v2.8h\n"
- "smlal v18.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v8.4s, v26.8h, v2.8h\n"
- "tbz x4, #2, 57f\n"
- "ld1 { v25.s }[0], [x11], #0x4\n"
- "tbz x4, #1, 56f\n"
- "ld1 { v25.h }[2], [x11], #0x2\n"
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[6], [x11]\n"
+ "smlal2 v16.4s, v24.8h, v2.8h\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "smlal v10.4s, v26.4h, v2.4h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "tbz x0, #2, 57f\n"
+ "ld1 { v25.s }[0], [x2], #0x4\n"
+ "tbz x0, #1, 56f\n"
+ "ld1 { v25.h }[2], [x2], #0x2\n"
+ "tbz x0, #0, 59f\n"
+ "ld1 { v25.b }[6], [x2]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[4], [x11]\n"
+ "tbz x0, #0, 59f\n"
+ "ld1 { v25.b }[4], [x2]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x4, #1, 58f\n"
- "ld1 { v25.h }[0], [x11], #0x2\n"
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[2], [x11]\n"
+ "tbz x0, #1, 58f\n"
+ "ld1 { v25.h }[0], [x2], #0x2\n"
+ "tbz x0, #0, 59f\n"
+ "ld1 { v25.b }[2], [x2]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[0], [x11]\n"
+ "tbz x0, #0, 59f\n"
+ "ld1 { v25.b }[0], [x2]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x3, #0x68]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal v10.4s, v25.4h, v2.4h\n"
- "ldr x24, [x25, #0xb0]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v9.4s, v25.8h, v2.8h\n"
- "add x24, x24, x10\n"
+ "ldr d3, [x23, #0x68]\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "ldr x13, [x20, #0xb0]\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "add x13, x13, x24\n"
"smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "smlal v18.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "tbz x4, #2, 61f\n"
- "ld1 { v24.s }[0], [x24], #0x4\n"
- "tbz x4, #1, 60f\n"
- "ld1 { v24.h }[2], [x24], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[6], [x24]\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal2 v8.4s, v23.8h, v3.8h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "tbz x0, #2, 61f\n"
+ "ld1 { v24.s }[0], [x13], #0x4\n"
+ "tbz x0, #1, 60f\n"
+ "ld1 { v24.h }[2], [x13], #0x2\n"
+ "tbz x0, #0, 63f\n"
+ "ld1 { v24.b }[6], [x13]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[4], [x24]\n"
+ "tbz x0, #0, 63f\n"
+ "ld1 { v24.b }[4], [x13]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x4, #1, 62f\n"
- "ld1 { v24.h }[0], [x24], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[2], [x24]\n"
+ "tbz x0, #1, 62f\n"
+ "ld1 { v24.h }[0], [x13], #0x2\n"
+ "tbz x0, #0, 63f\n"
+ "ld1 { v24.b }[2], [x13]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[0], [x24]\n"
+ "tbz x0, #0, 63f\n"
+ "ld1 { v24.b }[0], [x13]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x3, #0x70]\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v24.4h, v3.4h\n"
- "ldr x0, [x25, #0xb8]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
- "add x0, x0, x10\n"
+ "ldr d4, [x23, #0x70]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x9, [x20, #0xb8]\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "add x9, x9, x24\n"
"smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v20.4s, v23.8h, v4.8h\n"
- "smlal v18.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "tbz x4, #2, 65f\n"
- "ld1 { v22.s }[0], [x0], #0x4\n"
- "tbz x4, #1, 64f\n"
- "ld1 { v22.h }[2], [x0], #0x2\n"
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[6], [x0]\n"
+ "smlal2 v16.4s, v23.8h, v4.8h\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "tbz x0, #2, 65f\n"
+ "ld1 { v22.s }[0], [x9], #0x4\n"
+ "tbz x0, #1, 64f\n"
+ "ld1 { v22.h }[2], [x9], #0x2\n"
+ "tbz x0, #0, 67f\n"
+ "ld1 { v22.b }[6], [x9]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[4], [x0]\n"
+ "tbz x0, #0, 67f\n"
+ "ld1 { v22.b }[4], [x9]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
- "tbz x4, #1, 66f\n"
- "ld1 { v22.h }[0], [x0], #0x2\n"
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[2], [x0]\n"
+ "tbz x0, #1, 66f\n"
+ "ld1 { v22.h }[0], [x9], #0x2\n"
+ "tbz x0, #0, 67f\n"
+ "ld1 { v22.b }[2], [x9]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[0], [x0]\n"
+ "tbz x0, #0, 67f\n"
+ "ld1 { v22.b }[0], [x9]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x3, #0x78]\n"
- "ssubl v22.8h, v22.8b, v7.8b\n"
- "smlal v10.4s, v22.4h, v4.4h\n"
- "ldr x15, [x25, #0xc0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "smlal2 v9.4s, v22.8h, v4.8h\n"
- "add x15, x15, x10\n"
+ "ldr d0, [x23, #0x78]\n"
+ "ssubl v22.8h, v22.8b, v9.8b\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "ldr x19, [x20, #0xc0]\n"
+ "smlal v6.4s, v22.4h, v4.4h\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "add x19, x19, x24\n"
"smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "smlal v18.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "tbz x4, #2, 69f\n"
- "ld1 { v27.s }[0], [x15], #0x4\n"
- "tbz x4, #1, 68f\n"
- "ld1 { v27.h }[2], [x15], #0x2\n"
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[6], [x15]\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "tbz x0, #2, 69f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x0, #1, 68f\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x0, #0, 71f\n"
+ "ld1 { v27.b }[6], [x19]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[4], [x15]\n"
+ "tbz x0, #0, 71f\n"
+ "ld1 { v27.b }[4], [x19]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x4, #1, 70f\n"
- "ld1 { v27.h }[0], [x15], #0x2\n"
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[2], [x15]\n"
+ "tbz x0, #1, 70f\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x0, #0, 71f\n"
+ "ld1 { v27.b }[2], [x19]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[0], [x15]\n"
+ "tbz x0, #0, 71f\n"
+ "ld1 { v27.b }[0], [x19]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "ldr x9, [x25, #0xc8]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v8.4s, v27.8h, v0.8h\n"
- "add x9, x9, x10\n"
- "tbz x4, #2, 73f\n"
- "ld1 { v23.s }[0], [x9], #0x4\n"
- "tbz x4, #1, 72f\n"
- "ld1 { v23.h }[2], [x9], #0x2\n"
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[6], [x9]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "ldr x28, [x20, #0xc8]\n"
+ "smlal v10.4s, v27.4h, v0.4h\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
+ "add x28, x28, x24\n"
+ "tbz x0, #2, 73f\n"
+ "ld1 { v23.s }[0], [x28], #0x4\n"
+ "tbz x0, #1, 72f\n"
+ "ld1 { v23.h }[2], [x28], #0x2\n"
+ "tbz x0, #0, 75f\n"
+ "ld1 { v23.b }[6], [x28]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[4], [x9]\n"
+ "tbz x0, #0, 75f\n"
+ "ld1 { v23.b }[4], [x28]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x4, #1, 74f\n"
- "ld1 { v23.h }[0], [x9], #0x2\n"
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[2], [x9]\n"
+ "tbz x0, #1, 74f\n"
+ "ld1 { v23.h }[0], [x28], #0x2\n"
+ "tbz x0, #0, 75f\n"
+ "ld1 { v23.b }[2], [x28]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[0], [x9]\n"
+ "tbz x0, #0, 75f\n"
+ "ld1 { v23.b }[0], [x28]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x3, #0x80]\n"
- "ssubl v23.8h, v23.8b, v7.8b\n"
- "smlal v10.4s, v23.4h, v0.4h\n"
- "ldr x27, [x25, #0xd0]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "smlal2 v9.4s, v23.8h, v0.8h\n"
- "add x27, x27, x10\n"
+ "ldr d1, [x23, #0x80]\n"
+ "ssubl v23.8h, v23.8b, v9.8b\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldr x6, [x20, #0xd0]\n"
+ "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "add x6, x6, x24\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v20.4s, v30.8h, v1.8h\n"
- "smlal v18.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "tbz x4, #2, 77f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "tbz x4, #1, 76f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[6], [x27]\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal2 v8.4s, v26.8h, v1.8h\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "tbz x0, #2, 77f\n"
+ "ld1 { v31.s }[0], [x6], #0x4\n"
+ "tbz x0, #1, 76f\n"
+ "ld1 { v31.h }[2], [x6], #0x2\n"
+ "tbz x0, #0, 79f\n"
+ "ld1 { v31.b }[6], [x6]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[4], [x27]\n"
+ "tbz x0, #0, 79f\n"
+ "ld1 { v31.b }[4], [x6]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x4, #1, 78f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[2], [x27]\n"
+ "tbz x0, #1, 78f\n"
+ "ld1 { v31.h }[0], [x6], #0x2\n"
+ "tbz x0, #0, 79f\n"
+ "ld1 { v31.b }[2], [x6]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[0], [x27]\n"
+ "tbz x0, #0, 79f\n"
+ "ld1 { v31.b }[0], [x6]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x3, #0x88]\n"
- "ssubl v31.8h, v31.8b, v7.8b\n"
- "smlal v10.4s, v31.4h, v1.4h\n"
- "ldr x28, [x25, #0xd8]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "add x28, x28, x10\n"
+ "ldr d2, [x23, #0x88]\n"
+ "ssubl v31.8h, v31.8b, v9.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ldr x27, [x20, #0xd8]\n"
+ "smlal v6.4s, v31.4h, v1.4h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "add x27, x27, x24\n"
"smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v20.4s, v26.8h, v2.8h\n"
- "smlal v18.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "tbz x4, #2, 81f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "tbz x4, #1, 80f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "smlal2 v16.4s, v26.8h, v2.8h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "tbz x0, #2, 81f\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "tbz x0, #1, 80f\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "tbz x0, #0, 83f\n"
+ "ld1 { v30.b }[6], [x27]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "tbz x0, #0, 83f\n"
+ "ld1 { v30.b }[4], [x27]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x4, #1, 82f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "tbz x0, #1, 82f\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "tbz x0, #0, 83f\n"
+ "ld1 { v30.b }[2], [x27]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "tbz x0, #0, 83f\n"
+ "ld1 { v30.b }[0], [x27]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x3, #0x90]\n"
- "ssubl v30.8h, v30.8b, v7.8b\n"
- "smlal v10.4s, v30.4h, v2.4h\n"
- "ldr x12, [x25, #0xe0]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v9.4s, v30.8h, v2.8h\n"
- "add x12, x12, x10\n"
+ "ldr d3, [x23, #0x90]\n"
+ "ssubl v30.8h, v30.8b, v9.8b\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "ldr x11, [x20, #0xe0]\n"
+ "smlal v6.4s, v30.4h, v2.4h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "add x11, x11, x24\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v20.4s, v25.8h, v3.8h\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "tbz x4, #2, 85f\n"
- "ld1 { v28.s }[0], [x12], #0x4\n"
- "tbz x4, #1, 84f\n"
- "ld1 { v28.h }[2], [x12], #0x2\n"
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[6], [x12]\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "tbz x0, #2, 85f\n"
+ "ld1 { v28.s }[0], [x11], #0x4\n"
+ "tbz x0, #1, 84f\n"
+ "ld1 { v28.h }[2], [x11], #0x2\n"
+ "tbz x0, #0, 87f\n"
+ "ld1 { v28.b }[6], [x11]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[4], [x12]\n"
+ "tbz x0, #0, 87f\n"
+ "ld1 { v28.b }[4], [x11]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x4, #1, 86f\n"
- "ld1 { v28.h }[0], [x12], #0x2\n"
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[2], [x12]\n"
+ "tbz x0, #1, 86f\n"
+ "ld1 { v28.h }[0], [x11], #0x2\n"
+ "tbz x0, #0, 87f\n"
+ "ld1 { v28.b }[2], [x11]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[0], [x12]\n"
+ "tbz x0, #0, 87f\n"
+ "ld1 { v28.b }[0], [x11]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x3, #0x98]\n"
- "ssubl v28.8h, v28.8b, v7.8b\n"
- "smlal v10.4s, v28.4h, v3.4h\n"
- "ldr x7, [x25, #0xe8]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "smlal2 v9.4s, v28.8h, v3.8h\n"
- "add x7, x7, x10\n"
+ "ldr d4, [x23, #0x98]\n"
+ "ssubl v28.8h, v28.8b, v9.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x17, [x20, #0xe8]\n"
+ "smlal v6.4s, v28.4h, v3.4h\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "add x17, x17, x24\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v20.4s, v24.8h, v4.8h\n"
- "smlal v18.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "tbz x4, #2, 89f\n"
- "ld1 { v26.s }[0], [x7], #0x4\n"
- "tbz x4, #1, 88f\n"
- "ld1 { v26.h }[2], [x7], #0x2\n"
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[6], [x7]\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal2 v8.4s, v22.8h, v4.8h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "smlal2 v7.4s, v28.8h, v4.8h\n"
+ "tbz x0, #2, 89f\n"
+ "ld1 { v26.s }[0], [x17], #0x4\n"
+ "tbz x0, #1, 88f\n"
+ "ld1 { v26.h }[2], [x17], #0x2\n"
+ "tbz x0, #0, 91f\n"
+ "ld1 { v26.b }[6], [x17]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[4], [x7]\n"
+ "tbz x0, #0, 91f\n"
+ "ld1 { v26.b }[4], [x17]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
- "tbz x4, #1, 90f\n"
- "ld1 { v26.h }[0], [x7], #0x2\n"
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[2], [x7]\n"
+ "tbz x0, #1, 90f\n"
+ "ld1 { v26.h }[0], [x17], #0x2\n"
+ "tbz x0, #0, 91f\n"
+ "ld1 { v26.b }[2], [x17]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[0], [x7]\n"
+ "tbz x0, #0, 91f\n"
+ "ld1 { v26.b }[0], [x17]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x3, #0xa0]\n"
- "ssubl v26.8h, v26.8b, v7.8b\n"
- "smlal v10.4s, v26.4h, v4.4h\n"
- "ldr x26, [x25, #0xf0]\n"
- "ssubl v0.8h, v0.8b, v13.8b\n"
- "smlal2 v9.4s, v26.8h, v4.8h\n"
- "add x26, x26, x10\n"
+ "ldr d0, [x23, #0xa0]\n"
+ "ssubl v26.8h, v26.8b, v9.8b\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "ldr x5, [x20, #0xf0]\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "add x5, x5, x24\n"
"smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v20.4s, v27.8h, v0.8h\n"
- "smlal v18.4s, v23.4h, v0.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "tbz x4, #2, 93f\n"
- "ld1 { v25.s }[0], [x26], #0x4\n"
- "tbz x4, #1, 92f\n"
- "ld1 { v25.h }[2], [x26], #0x2\n"
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[6], [x26]\n"
+ "smlal2 v16.4s, v27.8h, v0.8h\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "smlal2 v8.4s, v23.8h, v0.8h\n"
+ "tbz x0, #2, 93f\n"
+ "ld1 { v25.s }[0], [x5], #0x4\n"
+ "tbz x0, #1, 92f\n"
+ "ld1 { v25.h }[2], [x5], #0x2\n"
+ "tbz x0, #0, 95f\n"
+ "ld1 { v25.b }[6], [x5]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[4], [x26]\n"
+ "tbz x0, #0, 95f\n"
+ "ld1 { v25.b }[4], [x5]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
- "tbz x4, #1, 94f\n"
- "ld1 { v25.h }[0], [x26], #0x2\n"
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[2], [x26]\n"
+ "tbz x0, #1, 94f\n"
+ "ld1 { v25.h }[0], [x5], #0x2\n"
+ "tbz x0, #0, 95f\n"
+ "ld1 { v25.b }[2], [x5]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[0], [x26]\n"
+ "tbz x0, #0, 95f\n"
+ "ld1 { v25.b }[0], [x5]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "ldr x23, [x25, #0xf8]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
- "add x23, x23, x10\n"
- "tbz x4, #2, 97f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 96f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "ldr x25, [x20, #0xf8]\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "smlal2 v7.4s, v25.8h, v0.8h\n"
+ "add x25, x25, x24\n"
+ "tbz x0, #2, 97f\n"
+ "ld1 { v24.s }[0], [x25], #0x4\n"
+ "tbz x0, #1, 96f\n"
+ "ld1 { v24.h }[2], [x25], #0x2\n"
+ "tbz x0, #0, 99f\n"
+ "ld1 { v24.b }[6], [x25]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "tbz x0, #0, 99f\n"
+ "ld1 { v24.b }[4], [x25]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
- "tbz x4, #1, 98f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "tbz x0, #1, 98f\n"
+ "ld1 { v24.h }[0], [x25], #0x2\n"
+ "tbz x0, #0, 99f\n"
+ "ld1 { v24.b }[2], [x25]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "tbz x0, #0, 99f\n"
+ "ld1 { v24.b }[0], [x25]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x3, #0xa8]\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "ldr x22, [x25, #0x100]\n"
- "ssubl v1.8h, v1.8b, v13.8b\n"
- "smlal2 v9.4s, v24.8h, v0.8h\n"
- "add x22, x22, x10\n"
+ "ldr d1, [x23, #0xa8]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldr x26, [x20, #0x100]\n"
+ "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
+ "add x26, x26, x24\n"
"smlal v15.4s, v23.4h, v1.4h\n"
- "smlal2 v20.4s, v23.8h, v1.8h\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "tbz x4, #2, 101f\n"
- "ld1 { v27.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 100f\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[6], [x22]\n"
+ "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v8.4s, v31.8h, v1.8h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "tbz x0, #2, 101f\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "tbz x0, #1, 100f\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "tbz x0, #0, 103f\n"
+ "ld1 { v27.b }[6], [x26]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[4], [x22]\n"
+ "tbz x0, #0, 103f\n"
+ "ld1 { v27.b }[4], [x26]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
- "tbz x4, #1, 102f\n"
- "ld1 { v27.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[2], [x22]\n"
+ "tbz x0, #1, 102f\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "tbz x0, #0, 103f\n"
+ "ld1 { v27.b }[2], [x26]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[0], [x22]\n"
+ "tbz x0, #0, 103f\n"
+ "ld1 { v27.b }[0], [x26]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x3, #0xb0]\n"
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal v10.4s, v27.4h, v1.4h\n"
- "ldr x20, [x25, #0x108]\n"
- "ssubl v2.8h, v2.8b, v13.8b\n"
- "smlal2 v9.4s, v27.8h, v1.8h\n"
- "add x20, x20, x10\n"
+ "ldr d2, [x23, #0xb0]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "ldr x12, [x20, #0x108]\n"
+ "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "add x12, x12, x24\n"
"smlal v15.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "smlal v18.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "smlal v11.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "tbz x4, #2, 105f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 104f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal2 v8.4s, v30.8h, v2.8h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "tbz x0, #2, 105f\n"
+ "ld1 { v25.s }[0], [x12], #0x4\n"
+ "tbz x0, #1, 104f\n"
+ "ld1 { v25.h }[2], [x12], #0x2\n"
+ "tbz x0, #0, 107f\n"
+ "ld1 { v25.b }[6], [x12]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "tbz x0, #0, 107f\n"
+ "ld1 { v25.b }[4], [x12]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
- "tbz x4, #1, 106f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "tbz x0, #1, 106f\n"
+ "ld1 { v25.h }[0], [x12], #0x2\n"
+ "tbz x0, #0, 107f\n"
+ "ld1 { v25.b }[2], [x12]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "tbz x0, #0, 107f\n"
+ "ld1 { v25.b }[0], [x12]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x3, #0xb8]\n"
- "ssubl v25.8h, v25.8b, v7.8b\n"
- "smlal v10.4s, v25.4h, v2.4h\n"
- "ldr x13, [x25, #0x110]\n"
- "ssubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v9.4s, v25.8h, v2.8h\n"
- "add x13, x13, x10\n"
+ "ldr d3, [x23, #0xb8]\n"
+ "ssubl v25.8h, v25.8b, v9.8b\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "ldr x14, [x20, #0x110]\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "add x14, x14, x24\n"
"smlal v15.4s, v30.4h, v3.4h\n"
- "smlal2 v20.4s, v30.8h, v3.8h\n"
- "smlal v18.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "tbz x4, #2, 109f\n"
- "ld1 { v24.s }[0], [x13], #0x4\n"
- "tbz x4, #1, 108f\n"
- "ld1 { v24.h }[2], [x13], #0x2\n"
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[6], [x13]\n"
+ "smlal2 v16.4s, v30.8h, v3.8h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "tbz x0, #2, 109f\n"
+ "ld1 { v24.s }[0], [x14], #0x4\n"
+ "tbz x0, #1, 108f\n"
+ "ld1 { v24.h }[2], [x14], #0x2\n"
+ "tbz x0, #0, 111f\n"
+ "ld1 { v24.b }[6], [x14]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[4], [x13]\n"
+ "tbz x0, #0, 111f\n"
+ "ld1 { v24.b }[4], [x14]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
- "tbz x4, #1, 110f\n"
- "ld1 { v24.h }[0], [x13], #0x2\n"
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[2], [x13]\n"
+ "tbz x0, #1, 110f\n"
+ "ld1 { v24.h }[0], [x14], #0x2\n"
+ "tbz x0, #0, 111f\n"
+ "ld1 { v24.b }[2], [x14]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[0], [x13]\n"
+ "tbz x0, #0, 111f\n"
+ "ld1 { v24.b }[0], [x14]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x3, #0xc0]\n"
- "ssubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v24.4h, v3.4h\n"
- "ldr x21, [x25, #0x118]\n"
- "ssubl v4.8h, v4.8b, v13.8b\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
- "add x21, x21, x10\n"
+ "ldr d4, [x23, #0xc0]\n"
+ "ssubl v24.8h, v24.8b, v9.8b\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x21, [x20, #0x118]\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "add x21, x21, x24\n"
"smlal v15.4s, v28.4h, v4.4h\n"
- "smlal2 v20.4s, v28.8h, v4.8h\n"
- "smlal v18.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "tbz x4, #2, 113f\n"
+ "smlal2 v16.4s, v28.8h, v4.8h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "tbz x0, #2, 113f\n"
"ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 112f\n"
+ "tbz x0, #1, 112f\n"
"ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[6], [x21]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[4], [x21]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
- "tbz x4, #1, 114f\n"
+ "tbz x0, #1, 114f\n"
"ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[2], [x21]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[0], [x21]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "ssubl v27.8h, v27.8b, v7.8b\n"
- "smlal v10.4s, v27.4h, v4.4h\n"
- "smlal2 v9.4s, v27.8h, v4.8h\n"
- "tbz x4, #2, 117f\n"
- "ld1 { v6.4s }, [x2], #0x10\n"
- "ld1 { v21.4s }, [x5], #0x10\n"
- "tbz x4, #1, 116f\n"
- "ld1 { v17.d }[0], [x2], #0x8\n"
- "ld1 { v14.d }[0], [x5], #0x8\n"
- "tbz x4, #0, 119f\n"
- "ld1 { v17.s }[2], [x2]\n"
- "ld1 { v14.s }[2], [x5]\n"
+ "ssubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "tbz x0, #2, 117f\n"
+ "ld1 { v12.4s }, [x10], #0x10\n"
+ "ld1 { v19.4s }, [x1], #0x10\n"
+ "tbz x0, #1, 116f\n"
+ "ld1 { v20.d }[0], [x10], #0x8\n"
+ "ld1 { v29.d }[0], [x1], #0x8\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v20.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x1]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x4, #0, 119f\n"
- "ld1 { v17.s }[0], [x2]\n"
- "ld1 { v14.s }[0], [x5]\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v20.s }[0], [x10]\n"
+ "ld1 { v29.s }[0], [x1]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x4, #1, 118f\n"
- "ld1 { v6.d }[0], [x2], #0x8\n"
- "ld1 { v21.d }[0], [x5], #0x8\n"
- "tbz x4, #0, 119f\n"
- "ld1 { v6.s }[2], [x2]\n"
- "ld1 { v21.s }[2], [x5]\n"
+ "tbz x0, #1, 118f\n"
+ "ld1 { v12.d }[0], [x10], #0x8\n"
+ "ld1 { v19.d }[0], [x1], #0x8\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v12.s }[2], [x10]\n"
+ "ld1 { v19.s }[2], [x1]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 119f\n"
- "ld1 { v6.s }[0], [x2]\n"
- "ld1 { v21.s }[0], [x5]\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v12.s }[0], [x10]\n"
+ "ld1 { v19.s }[0], [x1]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v6.4s\n"
- "add x17, x17, x1\n"
- "sqrdmulh v20.4s, v20.4s, v17.4s\n"
- "add x16, x16, x1\n"
- "sqrdmulh v18.4s, v18.4s, v6.4s\n"
- "add x6, x6, x1\n"
- "sqrdmulh v5.4s, v5.4s, v17.4s\n"
- "add x8, x8, x1\n"
- "sqrdmulh v11.4s, v11.4s, v6.4s\n"
- "and v1.16b, v15.16b, v21.16b\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "and v29.16b, v20.16b, v14.16b\n"
- "and v3.16b, v18.16b, v21.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v2.16b, v5.16b, v14.16b\n"
- "and v0.16b, v11.16b, v21.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sqdmulh v15.4s, v15.4s, v12.4s\n"
+ "sqdmulh v17.4s, v17.4s, v12.4s\n"
+ "add x16, x16, x22\n"
+ "add x8, x8, x22\n"
+ "sqdmulh v10.4s, v10.4s, v12.4s\n"
+ "sqdmulh v6.4s, v6.4s, v12.4s\n"
+ "add x4, x4, x22\n"
+ "add x7, x7, x22\n"
+ "and v23.16b, v15.16b, v19.16b\n"
+ "sqdmulh v16.4s, v16.4s, v20.4s\n"
+ "and v22.16b, v17.16b, v19.16b\n"
+ "sqdmulh v8.4s, v8.4s, v20.4s\n"
+ "and v21.16b, v10.16b, v19.16b\n"
+ "sqdmulh v7.4s, v7.4s, v20.4s\n"
+ "and v26.16b, v6.16b, v19.16b\n"
+ "sqdmulh v5.4s, v5.4s, v20.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v2.16b, v8.16b, v29.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v7.16b, v29.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v25.16b, v5.16b, v29.16b\n"
+ "sqadd v15.4s, v15.4s, v23.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
"sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v1.4s\n"
- "sqrdmulh v10.4s, v10.4s, v6.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sqadd v18.4s, v18.4s, v3.4s\n"
- "srshl v15.4s, v15.4s, v21.4s\n"
- "sqadd v5.4s, v5.4s, v2.4s\n"
- "srshl v20.4s, v20.4s, v14.4s\n"
- "srshl v18.4s, v18.4s, v21.4s\n"
- "add v15.4s, v15.4s, v19.4s\n"
- "srshl v5.4s, v5.4s, v14.4s\n"
- "add v20.4s, v20.4s, v19.4s\n"
- "smin v15.4s, v15.4s, v12.4s\n"
- "add v18.4s, v18.4s, v19.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smax v15.4s, v15.4s, v16.4s\n"
- "smin v18.4s, v18.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "add v5.4s, v5.4s, v19.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "uzp1 v15.16b, v15.16b, v20.16b\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v17.4s, v17.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v2.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v7.4s, v7.4s, v3.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "sqadd v5.4s, v5.4s, v25.4s\n"
+ "srshl v16.4s, v16.4s, v29.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v7.4s, v7.4s, v29.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v15.8h, v16.4s\n"
+ "sqxtn2 v17.8h, v8.4s\n"
+ "sqxtn2 v10.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v15.8h, v15.8h, v18.8h\n"
+ "sqadd v17.8h, v17.8h, v18.8h\n"
+ "sqadd v10.8h, v10.8h, v18.8h\n"
+ "sqadd v6.8h, v6.8h, v18.8h\n"
+ "smax v15.8h, v15.8h, v11.8h\n"
+ "smax v17.8h, v17.8h, v11.8h\n"
+ "smax v10.8h, v10.8h, v11.8h\n"
+ "smax v6.8h, v6.8h, v11.8h\n"
+ "smin v15.8h, v15.8h, v13.8h\n"
+ "smin v17.8h, v17.8h, v13.8h\n"
+ "smin v10.8h, v10.8h, v13.8h\n"
+ "smin v6.8h, v6.8h, v13.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "sqadd v11.4s, v11.4s, v0.4s\n"
- "smax v5.4s, v5.4s, v16.4s\n"
- "and v27.16b, v8.16b, v14.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "uzp1 v18.16b, v18.16b, v5.16b\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "and v30.16b, v10.16b, v21.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
- "add v11.4s, v11.4s, v19.4s\n"
- "sqadd v8.4s, v8.4s, v27.4s\n"
- "and v6.16b, v9.16b, v14.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "smin v11.4s, v11.4s, v12.4s\n"
- "srshl v8.4s, v8.4s, v14.4s\n"
- "sqadd v10.4s, v10.4s, v30.4s\n"
- "smax v11.4s, v11.4s, v16.4s\n"
- "add v8.4s, v8.4s, v19.4s\n"
- "srshl v10.4s, v10.4s, v21.4s\n"
- "sqadd v9.4s, v9.4s, v6.4s\n"
- "smin v8.4s, v8.4s, v12.4s\n"
- "add v10.4s, v10.4s, v19.4s\n"
- "srshl v9.4s, v9.4s, v14.4s\n"
- "smax v8.4s, v8.4s, v16.4s\n"
- "smin v10.4s, v10.4s, v12.4s\n"
- "uzp1 v11.16b, v11.16b, v8.16b\n"
- "add v9.4s, v9.4s, v19.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "smax v10.4s, v10.4s, v16.4s\n"
- "smin v9.4s, v9.4s, v12.4s\n"
- "smax v9.4s, v9.4s, v16.4s\n"
- "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v10.16b, v10.16b, v10.16b\n"
- "tbz x4, #2, 121f\n"
- "st1 { v15.s }[0], [x17], #0x4\n"
- "st1 { v18.s }[0], [x16], #0x4\n"
- "st1 { v11.s }[0], [x6], #0x4\n"
- "st1 { v10.s }[0], [x8], #0x4\n"
- "tbz x4, #1, 120f\n"
- "st1 { v15.h }[2], [x17], #0x2\n"
- "st1 { v18.h }[2], [x16], #0x2\n"
- "st1 { v11.h }[2], [x6], #0x2\n"
- "st1 { v10.h }[2], [x8], #0x2\n"
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[6], [x17], #0x1\n"
- "st1 { v18.b }[6], [x16], #0x1\n"
- "st1 { v11.b }[6], [x6], #0x1\n"
- "st1 { v10.b }[6], [x8], #0x1\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "tbz x0, #2, 121f\n"
+ "st1 { v15.s }[0], [x16], #0x4\n"
+ "st1 { v17.s }[0], [x8], #0x4\n"
+ "st1 { v10.s }[0], [x4], #0x4\n"
+ "st1 { v6.s }[0], [x7], #0x4\n"
+ "tbz x0, #1, 120f\n"
+ "st1 { v15.h }[2], [x16], #0x2\n"
+ "st1 { v17.h }[2], [x8], #0x2\n"
+ "st1 { v10.h }[2], [x4], #0x2\n"
+ "st1 { v6.h }[2], [x7], #0x2\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[6], [x16], #0x1\n"
+ "st1 { v17.b }[6], [x8], #0x1\n"
+ "st1 { v10.b }[6], [x4], #0x1\n"
+ "st1 { v6.b }[6], [x7], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[4], [x17], #0x1\n"
- "st1 { v18.b }[4], [x16], #0x1\n"
- "st1 { v11.b }[4], [x6], #0x1\n"
- "st1 { v10.b }[4], [x8], #0x1\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[4], [x16], #0x1\n"
+ "st1 { v17.b }[4], [x8], #0x1\n"
+ "st1 { v10.b }[4], [x4], #0x1\n"
+ "st1 { v6.b }[4], [x7], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
- "tbz x4, #1, 122f\n"
- "st1 { v15.h }[0], [x17], #0x2\n"
- "st1 { v18.h }[0], [x16], #0x2\n"
- "st1 { v11.h }[0], [x6], #0x2\n"
- "st1 { v10.h }[0], [x8], #0x2\n"
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[2], [x17], #0x1\n"
- "st1 { v18.b }[2], [x16], #0x1\n"
- "st1 { v11.b }[2], [x6], #0x1\n"
- "st1 { v10.b }[2], [x8], #0x1\n"
+ "tbz x0, #1, 122f\n"
+ "st1 { v15.h }[0], [x16], #0x2\n"
+ "st1 { v17.h }[0], [x8], #0x2\n"
+ "st1 { v10.h }[0], [x4], #0x2\n"
+ "st1 { v6.h }[0], [x7], #0x2\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[2], [x16], #0x1\n"
+ "st1 { v17.b }[2], [x8], #0x1\n"
+ "st1 { v10.b }[2], [x4], #0x1\n"
+ "st1 { v6.b }[2], [x7], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[0], [x17], #0x1\n"
- "st1 { v18.b }[0], [x16], #0x1\n"
- "st1 { v11.b }[0], [x6], #0x1\n"
- "st1 { v10.b }[0], [x8], #0x1\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[0], [x16], #0x1\n"
+ "st1 { v17.b }[0], [x8], #0x1\n"
+ "st1 { v10.b }[0], [x4], #0x1\n"
+ "st1 { v6.b }[0], [x7], #0x1\n"
"123:" // Oddments: Bit 2: End
-
"124:" // End
-
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
index 4e845cceaf..9b1f7c239f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,28 +28,23 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
-struct a64_s8q_nhwc_generic_output9_mla_depthfirst
+class a64_s8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- constexpr static unsigned int n_output_points = 9;
+ KernelType kernel = a64_s8q_nhwc_generic_output9_mla_depthfirst_impl;
- kern_type kernel = a64_s8q_nhwc_generic_output9_mla_depthfirst_impl;
+ public:
+ a64_s8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<int8_t, int8_t, int8_t, int32_t>(9, arm_gemm::VLType::None) {}
- a64_s8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+ KernelType get_kernel() const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+#endif // defined(__aarch64__)
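
The header change above is the template this patch applies across all of these kernel headers: the old plain struct of typedefs and constexpr members becomes a class deriving from a shared strategy interface, so the driver can query geometry and fetch the entry point through virtual calls instead of compile-time members. A hypothetical sketch of the shape such a base class must have, inferred only from the call sites in this diff (a constructor taking the output-point count and vector-length type, plus a virtual get_kernel); the library's real definition may differ:

#include <cstdint>

namespace arm_gemm { enum class VLType { None, SVE }; struct Requantize32; } // minimal stand-ins for the real types

template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
class GenericDepthfirstKernelStrategySketch
{
  unsigned int m_n_output_points;
  arm_gemm::VLType m_vl_type;

  public:
  // Signature of the quantized generic kernels, as declared above.
  using KernelType = void (*)(const TInput *const *const, TOutput *const *const,
                              const void *, const arm_gemm::Requantize32 &,
                              const unsigned int, const unsigned int);

  GenericDepthfirstKernelStrategySketch(unsigned int n_output_points, arm_gemm::VLType vl_type)
  : m_n_output_points(n_output_points), m_vl_type(vl_type)
  {
  }
  virtual ~GenericDepthfirstKernelStrategySketch() = default;

  unsigned int get_n_output_points() const { return m_n_output_points; }
  arm_gemm::VLType get_vl_type() const { return m_vl_type; }

  // Each concrete kernel overrides this to expose its assembly entry point.
  virtual KernelType get_kernel() const = 0;
};
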
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
index b9fef4f9ab..5ca3ccd4bf 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,39 +28,34 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-struct a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+struct a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 4;
-
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 9;
- constexpr static unsigned int input_col_quads = 1;
+ a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
- kern_type kernel = a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
- a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
index 9a3eed47fb..0641229aa7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,6 +26,8 @@
#include <cstdint>
+#if defined(__aarch64__)
+
#pragma once
namespace arm_conv {
@@ -33,34 +35,26 @@ namespace depthwise {
void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-struct a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+struct a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 4;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 8;
- constexpr static unsigned int input_cols = 6;
- constexpr static unsigned int input_col_quads = 1;
+ a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
- kern_type kernel = a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
- a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+#endif // defined(__aarch64__)
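
Note what the multiplier-strategy constructors replace: the deleted output_rows/output_cols and the derived input_rows/input_cols constants become constructor state, from which the input geometry follows as (out - 1) * stride + kernel. For the 5x5/stride-1/output-4x2 kernel above that gives (4-1)*1+5 = 8 rows and (2-1)*1+5 = 6 columns, and for the 3x3/stride-2/output-2x4 kernel two hunks earlier (2-1)*2+3 = 5 and (4-1)*2+3 = 9, matching the removed constants exactly. A hypothetical sketch of that bookkeeping, with invented names:

// Sketch only: geometry formerly hard-coded as constexpr members, now held
// as constructor state on the strategy and derived on demand.
class DepthfirstMultiplierGeometrySketch
{
  unsigned int m_output_rows, m_output_cols;
  unsigned int m_kernel_rows, m_kernel_cols;
  unsigned int m_stride_rows, m_stride_cols;

  public:
  DepthfirstMultiplierGeometrySketch(unsigned int output_rows, unsigned int output_cols,
                                     unsigned int kernel_rows, unsigned int kernel_cols,
                                     unsigned int stride_rows, unsigned int stride_cols)
  : m_output_rows(output_rows), m_output_cols(output_cols),
    m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
    m_stride_rows(stride_rows), m_stride_cols(stride_cols)
  {
  }

  // Receptive field of one output tile: (out - 1) * stride + kernel.
  unsigned int get_input_rows() const { return (m_output_rows - 1) * m_stride_rows + m_kernel_rows; }
  unsigned int get_input_cols() const { return (m_output_cols - 1) * m_stride_cols + m_kernel_cols; }
};
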
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index d0ae00d260..3dad8d5604 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,31 +28,25 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
-struct a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+struct a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- constexpr static unsigned int output_rows(void) { return 2; };
- constexpr static unsigned int output_cols(void) { return 8; };
-
- constexpr static unsigned int output_col_regs(void) { return 2; };
-
- kern_type kernel = a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
-
- a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<int8_t, int8_t, int8_t, int32_t>;
+ a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 0fde00ba37..22b6b657a5 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,39 +34,40 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
-struct a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst
+class a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_dot::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_dot::get_packed_size;
-
- kern_type kernel = a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
-
- a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+ a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_a64_s8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_a64_s8q_3x3_dot::pack_parameters(
+ args.input_channels, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
};
} // namespace depthwise
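
With get_storage_size and pack_parameters now virtual members, a caller can size the packed-parameter buffer, fill it and fetch the kernel through the strategy interface alone, without knowing which interleave routine backs it. A hedged sketch of that flow against the interface shown above (assumes the library's own DepthwiseArgs and strategy headers; the buffer handling is illustrative, not the driver's actual logic):

#include <cstddef>
#include <cstdint>
#include <vector>

void pack_and_fetch_sketch(const a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst &strat,
                           const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp,
                           const int32_t *biases, const int8_t *weights,
                           size_t ld_weight_col, size_t ld_weight_row)
{
  // 1. Ask the strategy how much packed-parameter storage it needs.
  std::vector<uint8_t> buffer(strat.get_storage_size(args));

  // 2. Let it interleave biases, weights and requantize data into place.
  strat.pack_parameters(args, buffer.data(), biases, qp, weights,
                        ld_weight_col, ld_weight_row);

  // 3. The driver then calls the returned entry point once per output tile.
  auto kernel = strat.get_kernel();
  (void)kernel; // invocation elided in this sketch
}
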
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index bdbda178b3..761c7ec86e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,15 @@
namespace arm_conv {
namespace depthwise {
-void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *params,
+ const int32_t *, // Bias, should be wrapped into the parameters
+ const arm_gemm::Requantize32& qp,
+ const int32_t *, const int32_t *, // Requant parameters, also wrapped
+ int8_t *const *const outptrs
+)
{
__asm__ __volatile__(
"ldp x15, x14, [%x[inptrs], #0x0]\n"
@@ -1173,7 +1181,7 @@ void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *con
"34:" // End
"add SP, SP, #0x80\n"
: [params] "+&r" (params)
- : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
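
[Note on the operand-list change above: `n_channels` is now an `unsigned int`, but it is bound to a 64-bit register via an "r" constraint and used in 64-bit address arithmetic, so the patch casts it to `long unsigned int` to guarantee the upper 32 bits are zeroed. An illustrative (not library) fragment of the same idea, assuming an AArch64 target:

    #include <cstdint>

    // Illustrative: binding a 32-bit value to an AArch64 x-register leaves
    // bits 32..63 unspecified unless the operand expression is 64 bits wide.
    uint64_t index_scaled(unsigned int n)
    {
        uint64_t out;
        __asm__(
            "lsl %x[out], %x[n], #3\n"      // out = n * 8, uses the full x-register
            : [out] "=r" (out)
            : [n] "r" ((unsigned long) n)); // widen first, as the diff does
        return out;
    }

]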
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 05eddd1853..00c8a3cde2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,39 +34,40 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *);
-struct a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+class a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef uint32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- typedef void (*kern_type)(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_dot::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_dot::get_packed_size;
-
- kern_type kernel = a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
-
- a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+ a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_a64_u8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_a64_u8q_3x3_dot::pack_parameters(
+ args.input_channels, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const uint8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
};
} // namespace depthwise
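
[Note: with get_storage_size() and pack_parameters() now virtual, a caller can size and fill the packed-parameter buffer without knowing the concrete kernel. A hedged sketch of the call sequence — DepthwiseArgs, Requantize32 and the strategy type come from the diff; the driver code itself is illustrative, assuming the library headers are in scope:

    #include <cstdint>
    #include <vector>

    // Illustrative driver-side usage; not the library's actual driver code.
    void prepare_packed_parameters(
        const DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t> &strat,
        const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp,
        const int32_t *biases, const uint8_t *weights,
        size_t ld_weight_col, size_t ld_weight_row)
    {
      std::vector<uint8_t> buffer(strat.get_storage_size(args));  // sized via virtual call
      strat.pack_parameters(args, buffer.data(), biases, qp,
                            weights, ld_weight_col, ld_weight_row);
      // buffer.data() is then what the kernel receives as its `params` argument.
    }

]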
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 22c584f8e7..64b305c21d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,15 @@
namespace arm_conv {
namespace depthwise {
-void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *const inptrs, uint8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *params,
+ const int32_t *, // Bias, should be wrapped into the parameters
+ const arm_gemm::Requantize32& qp,
+ const int32_t *, const int32_t *, // Requant parameters, also wrapped
+ uint8_t *const *const outptrs
+)
{
__asm__ __volatile__(
"ldp x13, x12, [%x[inptrs], #0x0]\n"
@@ -1307,7 +1315,7 @@ void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *con
"34:" // End
"add SP, SP, #0x80\n"
: [params] "+&r" (params)
- : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned) n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
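
[Note: the comments in the new signatures ("Bias, should be wrapped into the parameters", "Requant parameters, also wrapped") indicate that for the dot-product kernels the bias and per-channel requantisation values are already interleaved into the packed parameter blob; the corresponding pointer arguments exist only so that every kernel shares one function-pointer type. An illustrative caller (not library code; Requantize32 forward-declared for self-containment):

    #include <cstdint>

    namespace arm_gemm { struct Requantize32; }  // provided by the library

    // Illustrative: one function-pointer type shared by kernels whether or
    // not they consume the bias/requant pointers directly.
    using KernelSketch = void (*)(
        unsigned int n_channels,
        const uint8_t *const *inptrs,
        const uint8_t *packed_params,
        const int32_t *bias,            // ignored here: interleaved into packed_params
        const arm_gemm::Requantize32 &qp,
        const int32_t *requant_muls,    // likewise wrapped into packed_params
        const int32_t *requant_shifts,
        uint8_t *const *outptrs);

    void invoke(KernelSketch k, unsigned int n, const uint8_t *const *in,
                const uint8_t *params, const arm_gemm::Requantize32 &qp,
                uint8_t *const *out)
    {
      k(n, in, params, nullptr, qp, nullptr, nullptr, out);  // wrapped slots pass null
    }

]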
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 09ba75f685..b55055fc93 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,37 +36,24 @@ namespace depthwise {
void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
-struct a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+class a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_mla::get_packed_size;
+ a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
- kern_type kernel = a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
- a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
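
[Note: unlike the dot-product variants, this mla strategy overrides neither get_storage_size() nor pack_parameters(), so it inherits the base-class behaviour; it additionally overrides get_accumulator_depth_vl() to return 2. Judging by the name and the kernel's 8-channel step, this appears to tell the driver how many vector lengths of accumulators the kernel carries, usable when blocking channels — my reading of the interface, not documented in this diff:

    // Illustrative reading of get_accumulator_depth_vl() (assumed semantics):
    // scale a per-vector channel block by the kernel's accumulator depth.
    unsigned int channel_block(unsigned int lanes_per_vector,   // e.g. 4 s32 lanes in Neon
                               unsigned int accumulator_depth_vl)
    {
      // 4 s32 lanes * depth 2 = 8 channels, which would match this kernel's
      // 8-channel main-loop step (assumption).
      return lanes_per_vector * accumulator_depth_vl;
    }

]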
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index b62ebb1687..453f9cfa92 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const uint8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const uint8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
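
[Note on the two hunks above and the loop rewrite that follows: the Params struct gathers the eight kernel arguments into one block whose field offsets the assembly reads via offsetof; changing `weights` to `const void *` reflects that the buffer is an opaque interleaved blob rather than raw uint8 weights. The rewritten loop also hoists the requantisation constants (a_offset, b_offset, c_offset, minval, maxval) into registers once via ld1r before the loop, and moves the output offset/clamp from the 32-bit domain into 16-bit after sqxtn/sqxtn2, reducing register pressure inside the loop. A hedged scalar model of the per-lane requantise step the vector code implements — the asm uses sqdmulh/srshl with an explicit rounding fix-up, which this sketch only approximates:

    #include <cstdint>
    #include <algorithm>

    // Illustrative scalar model of the requantise step in the asm below:
    // fixed-point multiply, rounding right shift, add output offset, clamp.
    uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                            int32_t c_offset, int32_t minval, int32_t maxval)
    {
        int64_t v = ((int64_t) acc * mul) >> 31;        // high half of the doubled product
        if (shift < 0)                                  // srshl with a negative shift
            v = (v + (1LL << (-shift - 1))) >> -shift;  // = rounding right shift
        v += c_offset;                                  // add the output zero point
        v = std::max<int64_t>(minval, std::min<int64_t>(maxval, v));
        return (uint8_t) v;
    }

]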
@@ -91,513 +91,497 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
+ "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
"ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "mov x17, #0x0\n"
- "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x24, x19, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x19, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x19, %[offsetof_Requantize32_minval]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x19, x19, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v22.16b }, [x24]\n"
+ "ld1r { v12.16b }, [x23]\n"
+ "lsr x16, x8, #0x3\n"
+ "ld1r { v14.8h }, [x21]\n"
+ "ld1r { v17.8h }, [x20]\n"
"mov x15, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "add x14, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x14, #0x0\n"
+ "ld1r { v15.8h }, [x19]\n"
"ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "lsr x12, x8, #0x3\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
"ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v14.16b }, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v9.16b }, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1r { v15.4s }, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v24.4s }, [x20]\n"
- "ld1r { v12.4s }, [x19]\n"
- "ldp x10, x9, [x21, #0x0]\n"
- "ldp x28, x27, [x21, #0x10]\n"
- "cbz x12, 3f\n"
- "subs x12, x12, #0x1\n"
+ "ldp x10, x9, [x22, #0x0]\n"
+ "ldp x28, x27, [x22, #0x10]\n"
+ "cbz x16, 3f\n"
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
"ldr q13, [x19, #0x0]\n"
- "mov v17.16b, v13.16b\n"
- "ldr q19, [x19, #0x10]\n"
+ "subs x16, x16, #0x1\n"
+ "mov v19.16b, v13.16b\n"
+ "ldr q26, [x19, #0x10]\n"
"add x19, x19, #0x20\n"
- "mov v16.16b, v13.16b\n"
"str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v23.16b, v13.16b\n"
- "ldr d0, [x16, #0x0]\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "mov v25.16b, v19.16b\n"
- "ldr d1, [x16, #0x8]\n"
- "mov v21.16b, v19.16b\n"
- "ldr d2, [x16, #0x10]\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "mov v20.16b, v19.16b\n"
- "ldr d3, [x16, #0x18]\n"
- "ldr d4, [x16, #0x20]\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr d5, [x16, #0x28]\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr d6, [x16, #0x30]\n"
- "ldr d7, [x16, #0x38]\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr d8, [x16, #0x40]\n"
- "usubl v5.8h, v5.8b, v9.8b\n"
- "ldp x23, x22, [x14, #0x0]\n"
- "usubl v6.8h, v6.8b, v9.8b\n"
- "ldp x21, x20, [x14, #0x10]\n"
- "usubl v7.8h, v7.8b, v9.8b\n"
- "usubl v8.8h, v8.8b, v9.8b\n"
- "ldr x19, [x14, #0x20]\n"
- "ldr d31, [x23, x17]\n"
- "usubl v31.8h, v31.8b, v14.8b\n"
- "ldr d30, [x22, x17]\n"
- "ldr d29, [x21, x17]\n"
- "usubl v30.8h, v30.8b, v14.8b\n"
- "ldr d28, [x20, x17]\n"
- "ldr d27, [x19, x17]\n"
- "usubl v29.8h, v29.8b, v14.8b\n"
- "usubl v28.8h, v28.8b, v14.8b\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v11.16b, v26.16b\n"
+ "mov v18.16b, v13.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v24.16b, v26.16b\n"
+ "mov v9.16b, v13.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v23.16b, v26.16b\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "usubl v4.8h, v4.8b, v12.8b\n"
+ "ldr x19, [x12, #0x20]\n"
+ "ldr d31, [x23, x15]\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "usubl v6.8h, v6.8b, v12.8b\n"
+ "ldr d30, [x22, x15]\n"
+ "ldr d29, [x21, x15]\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "usubl v8.8h, v8.8b, v12.8b\n"
+ "ldr d28, [x20, x15]\n"
+ "ldr d27, [x19, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
"beq 2f\n"
"1:" // Loop
"smlal v13.4s, v31.4h, v4.4h\n"
- "ldr x21, [x14, #0x28]\n"
- "add x16, x16, #0x48\n"
- "smlal2 v19.4s, v31.8h, v4.8h\n"
- "ldr x20, [x14, #0x30]\n"
- "subs x12, x12, #0x1\n"
- "smlal v17.4s, v31.4h, v3.4h\n"
- "ldr x26, [x14, #0x38]\n"
- "smlal2 v25.4s, v31.8h, v3.8h\n"
- "ldr x25, [x14, #0x40]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "ldr x19, [x14, #0x48]\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr x24, [x14, #0x50]\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "ldr x23, [x14, #0x58]\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x17]\n"
- "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal2 v26.4s, v31.8h, v4.8h\n"
+ "ldr x21, [x12, #0x28]\n"
+ "ldr x26, [x12, #0x38]\n"
+ "smlal v19.4s, v31.4h, v3.4h\n"
+ "smlal2 v11.4s, v31.8h, v3.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr x25, [x12, #0x40]\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "ldr x22, [x14, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "ldr d30, [x19, x17]\n"
- "usubl v30.8h, v30.8b, v14.8b\n"
- "smlal v17.4s, v29.4h, v2.4h\n"
- "ldr x21, [x14, #0x68]\n"
- "smlal2 v25.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal2 v26.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x12, #0x48]\n"
+ "ldr d30, [x19, x15]\n"
+ "smlal v19.4s, v29.4h, v2.4h\n"
+ "smlal2 v11.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v24.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x12, #0x50]\n"
+ "ldr x23, [x12, #0x58]\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "smlal2 v23.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "ldr x20, [x14, #0x70]\n"
- "smlal2 v19.4s, v28.8h, v5.8h\n"
- "ldr x19, [x14, #0x78]\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "ldr q26, [x13, #0x0]\n"
- "smlal2 v25.4s, v28.8h, v4.8h\n"
- "ldr q10, [x11, #0x0]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "ldr q11, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "smlal2 v21.4s, v28.8h, v2.8h\n"
- "ldr q18, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x17]\n"
- "usubl v28.8h, v28.8b, v14.8b\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v21.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x17]\n"
- "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal2 v26.4s, v28.8h, v5.8h\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "ldr x22, [x12, #0x60]\n"
+ "smlal v19.4s, v28.4h, v4.4h\n"
+ "smlal2 v11.4s, v28.8h, v4.8h\n"
+ "ldr x21, [x12, #0x68]\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v18.4s, v28.4h, v2.4h\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "ldr x19, [x12, #0x78]\n"
+ "ldr q21, [x13, #0x0]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x15]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v19.4s, v27.8h, v7.8h\n"
- "smlal v17.4s, v27.4h, v6.4h\n"
- "smlal2 v25.4s, v27.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "smlal v23.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
+ "smlal2 v26.4s, v27.8h, v7.8h\n"
+ "ldr q25, [x11, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "smlal v19.4s, v27.4h, v6.4h\n"
+ "smlal2 v11.4s, v27.8h, v6.8h\n"
+ "ldr q16, [x11, #0x10]\n"
+ "add x17, x17, #0x48\n"
+ "smlal v18.4s, v31.4h, v6.4h\n"
+ "smlal2 v24.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
+ "subs x16, x16, #0x1\n"
+ "add x13, x13, #0x20\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x17]\n"
- "usubl v29.8h, v29.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v25.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x17]\n"
- "usubl v28.8h, v28.8b, v14.8b\n"
+ "smlal2 v26.4s, v28.8h, v1.8h\n"
+ "add x11, x11, #0x20\n"
+ "smlal v19.4s, v28.4h, v0.4h\n"
+ "smlal2 v11.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x15]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v24.4s, v27.8h, v4.8h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v25.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x17]\n"
- "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal2 v26.4s, v31.8h, v2.8h\n"
+ "smlal v19.4s, v31.4h, v1.4h\n"
+ "smlal2 v11.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v18.4s, v30.4h, v5.4h\n"
+ "smlal v9.4s, v30.4h, v4.4h\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v19.4s, v30.8h, v8.8h\n"
- "smlal v17.4s, v30.4h, v7.4h\n"
- "smlal2 v25.4s, v30.8h, v7.8h\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v21.4s, v30.8h, v5.8h\n"
- "smlal v23.4s, v30.4h, v4.4h\n"
- "smlal2 v20.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x17]\n"
- "usubl v30.8h, v30.8b, v14.8b\n"
+ "smlal2 v26.4s, v30.8h, v8.8h\n"
+ "smlal v19.4s, v30.4h, v7.4h\n"
+ "smlal2 v11.4s, v30.8h, v7.8h\n"
+ "smlal2 v24.4s, v30.8h, v5.8h\n"
+ "smlal2 v23.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x15]\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "smlal v18.4s, v29.4h, v0.4h\n"
+ "smlal v9.4s, v28.4h, v2.4h\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v19.4s, v29.8h, v3.8h\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v21.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v5.4h\n"
- "smlal2 v25.4s, v28.8h, v5.8h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal2 v20.4s, v28.8h, v2.8h\n"
- "ldr d28, [x19, x17]\n"
- "add x17, x17, #0x8\n"
+ "smlal2 v26.4s, v29.8h, v3.8h\n"
+ "smlal2 v24.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "smlal2 v23.4s, v28.8h, v2.8h\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v3.4h\n"
+ "smlal v9.4s, v30.4h, v5.4h\n"
+ "smlal v19.4s, v28.4h, v5.4h\n"
+ "smlal2 v11.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x19, x15]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v24.4s, v31.8h, v3.8h\n"
+ "smlal2 v23.4s, v30.8h, v5.8h\n"
+ "add x15, x15, #0x8\n"
+ "smlal v18.4s, v29.4h, v7.4h\n"
+ "smlal v9.4s, v29.4h, v6.4h\n"
+ "smlal2 v24.4s, v29.8h, v7.8h\n"
+ "smlal2 v23.4s, v29.8h, v6.8h\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "usubl v28.8h, v28.8b, v14.8b\n"
- "smlal2 v19.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v3.8h\n"
- "smlal v17.4s, v30.4h, v8.4h\n"
- "smlal2 v25.4s, v30.8h, v8.8h\n"
- "smlal v23.4s, v30.4h, v5.4h\n"
- "smlal2 v20.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v21.4s, v29.8h, v7.8h\n"
- "smlal v23.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v29.8h, v6.8h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v21.4s, v28.8h, v8.8h\n"
- "smlal v23.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v13.4s, v13.4s, v26.4s\n"
- "sqrdmulh v19.4s, v19.4s, v11.4s\n"
- "sqrdmulh v17.4s, v17.4s, v26.4s\n"
- "sqrdmulh v25.4s, v25.4s, v11.4s\n"
- "and v22.16b, v13.16b, v10.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v28.16b, v19.16b, v18.16b\n"
- "and v3.16b, v17.16b, v10.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v6.16b, v25.16b, v18.16b\n"
- "sqrdmulh v16.4s, v16.4s, v26.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqrdmulh v21.4s, v21.4s, v11.4s\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v26.4s\n"
- "and v0.16b, v16.16b, v10.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v28.4s\n"
- "sqadd v17.4s, v17.4s, v3.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "and v29.16b, v21.16b, v18.16b\n"
+ "smlal v19.4s, v30.4h, v8.4h\n"
+ "sqdmulh v13.4s, v13.4s, v21.4s\n"
+ "smlal v18.4s, v28.4h, v8.4h\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "sqdmulh v19.4s, v19.4s, v21.4s\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "smlal2 v11.4s, v30.8h, v8.8h\n"
+ "sqdmulh v18.4s, v18.4s, v21.4s\n"
+ "smlal2 v24.4s, v28.8h, v8.8h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
+ "sqdmulh v9.4s, v9.4s, v21.4s\n"
+ "and v7.16b, v13.16b, v25.16b\n"
+ "sqdmulh v26.4s, v26.4s, v10.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqdmulh v11.4s, v11.4s, v10.4s\n"
+ "and v21.16b, v18.16b, v25.16b\n"
+ "sqdmulh v24.4s, v24.4s, v10.4s\n"
+ "and v20.16b, v9.16b, v25.16b\n"
+ "sqdmulh v23.4s, v23.4s, v10.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v29.16b, v26.16b, v16.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v10.16b, v11.16b, v16.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v31.16b, v24.16b, v16.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v23.16b, v16.16b\n"
+ "sqadd v13.4s, v13.4s, v7.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "add v13.4s, v13.4s, v15.4s\n"
- "srshl v19.4s, v19.4s, v18.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v18.4s\n"
- "smin v13.4s, v13.4s, v12.4s\n"
- "add v19.4s, v19.4s, v15.4s\n"
- "add v17.4s, v17.4s, v15.4s\n"
- "smax v13.4s, v13.4s, v24.4s\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "smin v17.4s, v17.4s, v12.4s\n"
- "add v25.4s, v25.4s, v15.4s\n"
- "smax v19.4s, v19.4s, v24.4s\n"
- "smax v17.4s, v17.4s, v24.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "uzp1 v13.16b, v13.16b, v19.16b\n"
- "sqadd v16.4s, v16.4s, v0.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v25.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v25.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "srshl v9.4s, v9.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "srshl v26.4s, v26.4s, v16.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "srshl v24.4s, v24.4s, v16.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "srshl v23.4s, v23.4s, v16.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "sqxtn2 v13.8h, v26.4s\n"
+ "sqxtn2 v19.8h, v11.4s\n"
+ "sqxtn2 v18.8h, v24.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v19.8h, v19.8h, v14.8h\n"
+ "sqadd v18.8h, v18.8h, v14.8h\n"
+ "sqadd v9.8h, v9.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smax v18.8h, v18.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smin v13.8h, v13.8h, v15.8h\n"
+ "smin v19.8h, v19.8h, v15.8h\n"
+ "smin v18.8h, v18.8h, v15.8h\n"
+ "smin v9.8h, v9.8h, v15.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x10, x15]\n"
- "smax v25.4s, v25.4s, v24.4s\n"
- "sqadd v21.4s, v21.4s, v29.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "and v3.16b, v23.16b, v10.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "uzp1 v17.16b, v17.16b, v25.16b\n"
- "add v16.4s, v16.4s, v15.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d17, [x9, x15]\n"
- "smin v16.4s, v16.4s, v12.4s\n"
- "sqrdmulh v20.4s, v20.4s, v11.4s\n"
- "add v21.4s, v21.4s, v15.4s\n"
- "sqadd v23.4s, v23.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v24.4s\n"
- "smin v21.4s, v21.4s, v12.4s\n"
- "and v25.16b, v20.16b, v18.16b\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "smax v21.4s, v21.4s, v24.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "uzp1 v16.16b, v16.16b, v21.16b\n"
- "add v23.4s, v23.4s, v15.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d16, [x28, x15]\n"
- "smin v23.4s, v23.4s, v12.4s\n"
- "sqadd v20.4s, v20.4s, v25.4s\n"
- "smax v23.4s, v23.4s, v24.4s\n"
- "srshl v20.4s, v20.4s, v18.4s\n"
- "add v20.4s, v20.4s, v15.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v24.4s\n"
- "uzp1 v23.16b, v23.16b, v20.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d23, [x27, x15]\n"
- "add x15, x15, #0x8\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d13, [x10, x14]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d19, [x9, x14]\n"
+ "str d18, [x28, x14]\n"
+ "str d9, [x27, x14]\n"
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
"ldr q13, [x19, #0x0]\n"
- "mov v17.16b, v13.16b\n"
- "ldr q19, [x19, #0x10]\n"
+ "add x14, x14, #0x8\n"
+ "ldr q26, [x19, #0x10]\n"
"add x19, x19, #0x20\n"
- "mov v16.16b, v13.16b\n"
"str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v23.16b, v13.16b\n"
- "ldr d0, [x16, #0x0]\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "mov v25.16b, v19.16b\n"
- "ldr d1, [x16, #0x8]\n"
- "mov v21.16b, v19.16b\n"
- "ldr d2, [x16, #0x10]\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "mov v20.16b, v19.16b\n"
- "ldr d3, [x16, #0x18]\n"
- "ldr d4, [x16, #0x20]\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr d5, [x16, #0x28]\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr d6, [x16, #0x30]\n"
- "ldr d7, [x16, #0x38]\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldr d8, [x16, #0x40]\n"
- "usubl v5.8h, v5.8b, v9.8b\n"
- "ldp x23, x22, [x14, #0x0]\n"
- "usubl v6.8h, v6.8b, v9.8b\n"
- "ldp x21, x20, [x14, #0x10]\n"
- "usubl v7.8h, v7.8b, v9.8b\n"
- "usubl v8.8h, v8.8b, v9.8b\n"
- "ldr x19, [x14, #0x20]\n"
- "ldr d31, [x23, x17]\n"
- "usubl v31.8h, v31.8b, v14.8b\n"
- "ldr d30, [x22, x17]\n"
- "ldr d29, [x21, x17]\n"
- "usubl v30.8h, v30.8b, v14.8b\n"
- "ldr d28, [x20, x17]\n"
- "ldr d27, [x19, x17]\n"
- "usubl v29.8h, v29.8b, v14.8b\n"
- "usubl v28.8h, v28.8b, v14.8b\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v11.16b, v26.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v18.16b, v13.16b\n"
+ "mov v24.16b, v26.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v9.16b, v13.16b\n"
+ "mov v23.16b, v26.16b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldr x19, [x12, #0x20]\n"
+ "ldr d31, [x23, x15]\n"
+ "usubl v4.8h, v4.8b, v12.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "ldr d30, [x22, x15]\n"
+ "ldr d29, [x21, x15]\n"
+ "usubl v6.8h, v6.8b, v12.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d28, [x20, x15]\n"
+ "ldr d27, [x19, x15]\n"
+ "usubl v8.8h, v8.8b, v12.8b\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
"bgt 1b\n"
"2:" // Tail
"smlal v13.4s, v31.4h, v4.4h\n"
- "ldr x21, [x14, #0x28]\n"
- "tst x8, #0x7\n"
- "smlal2 v19.4s, v31.8h, v4.8h\n"
- "ldr x20, [x14, #0x30]\n"
- "smlal v17.4s, v31.4h, v3.4h\n"
- "ldr x26, [x14, #0x38]\n"
- "smlal2 v25.4s, v31.8h, v3.8h\n"
- "ldr x25, [x14, #0x40]\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "ldr x19, [x14, #0x48]\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "ldr x24, [x14, #0x50]\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "ldr x23, [x14, #0x58]\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x17]\n"
- "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal2 v26.4s, v31.8h, v4.8h\n"
+ "ldr x21, [x12, #0x28]\n"
+ "ldr x26, [x12, #0x38]\n"
+ "smlal v19.4s, v31.4h, v3.4h\n"
+ "smlal2 v11.4s, v31.8h, v3.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr x25, [x12, #0x40]\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "ldr x22, [x14, #0x60]\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "ldr d30, [x19, x17]\n"
- "usubl v30.8h, v30.8b, v14.8b\n"
- "smlal v17.4s, v29.4h, v2.4h\n"
- "ldr x21, [x14, #0x68]\n"
- "smlal2 v25.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v14.8b\n"
+ "smlal2 v26.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x12, #0x48]\n"
+ "ldr d30, [x19, x15]\n"
+ "smlal v19.4s, v29.4h, v2.4h\n"
+ "smlal2 v11.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v24.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x12, #0x50]\n"
+ "ldr x23, [x12, #0x58]\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "smlal2 v23.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "ldr x20, [x14, #0x70]\n"
- "smlal2 v19.4s, v28.8h, v5.8h\n"
- "ldr x19, [x14, #0x78]\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "ldr q26, [x13, #0x0]\n"
- "smlal2 v25.4s, v28.8h, v4.8h\n"
- "ldr q10, [x11, #0x0]\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "ldr q11, [x13, #0x10]\n"
+ "smlal2 v26.4s, v28.8h, v5.8h\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "ldr x22, [x12, #0x60]\n"
+ "smlal v19.4s, v28.4h, v4.4h\n"
+ "smlal2 v11.4s, v28.8h, v4.8h\n"
+ "ldr x21, [x12, #0x68]\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v18.4s, v28.4h, v2.4h\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "ldr x19, [x12, #0x78]\n"
+ "ldr q21, [x13, #0x0]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x15]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v13.4s, v27.4h, v7.4h\n"
+ "smlal2 v26.4s, v27.8h, v7.8h\n"
+ "ldr q25, [x11, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "smlal v19.4s, v27.4h, v6.4h\n"
+ "smlal2 v11.4s, v27.8h, v6.8h\n"
+ "ldr q16, [x11, #0x10]\n"
+ "tst x8, #0x7\n"
+ "smlal v18.4s, v31.4h, v6.4h\n"
+ "smlal2 v24.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
"add x13, x13, #0x20\n"
- "smlal2 v21.4s, v28.8h, v2.8h\n"
- "ldr q18, [x11, #0x10]\n"
"add x11, x11, #0x20\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x17]\n"
- "usubl v28.8h, v28.8b, v14.8b\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v21.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x17]\n"
- "usubl v31.8h, v31.8b, v14.8b\n"
- "smlal v13.4s, v27.4h, v7.4h\n"
- "smlal2 v19.4s, v27.8h, v7.8h\n"
- "smlal v17.4s, v27.4h, v6.4h\n"
- "smlal2 v25.4s, v27.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "smlal v23.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x17]\n"
- "usubl v29.8h, v29.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v25.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x17]\n"
- "usubl v28.8h, v28.8b, v14.8b\n"
+ "smlal2 v26.4s, v28.8h, v1.8h\n"
+ "smlal v19.4s, v28.4h, v0.4h\n"
+ "smlal2 v11.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x15]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v24.4s, v27.8h, v4.8h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v25.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x17]\n"
- "usubl v31.8h, v31.8b, v14.8b\n"
+ "smlal2 v26.4s, v31.8h, v2.8h\n"
+ "smlal v19.4s, v31.4h, v1.4h\n"
+ "smlal2 v11.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v18.4s, v30.4h, v5.4h\n"
+ "smlal v9.4s, v30.4h, v4.4h\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v19.4s, v30.8h, v8.8h\n"
- "smlal v17.4s, v30.4h, v7.4h\n"
- "smlal2 v25.4s, v30.8h, v7.8h\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v21.4s, v30.8h, v5.8h\n"
- "smlal v23.4s, v30.4h, v4.4h\n"
- "smlal2 v20.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x17]\n"
- "usubl v30.8h, v30.8b, v14.8b\n"
+ "smlal2 v26.4s, v30.8h, v8.8h\n"
+ "smlal v19.4s, v30.4h, v7.4h\n"
+ "smlal2 v11.4s, v30.8h, v7.8h\n"
+ "smlal2 v24.4s, v30.8h, v5.8h\n"
+ "smlal2 v23.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x15]\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "smlal v18.4s, v29.4h, v0.4h\n"
+ "smlal v9.4s, v28.4h, v2.4h\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v19.4s, v29.8h, v3.8h\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v21.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v5.4h\n"
- "smlal2 v25.4s, v28.8h, v5.8h\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal2 v20.4s, v28.8h, v2.8h\n"
- "ldr d28, [x19, x17]\n"
- "add x17, x17, #0x8\n"
+ "smlal2 v26.4s, v29.8h, v3.8h\n"
+ "smlal2 v24.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "smlal2 v23.4s, v28.8h, v2.8h\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v3.4h\n"
+ "smlal v9.4s, v30.4h, v5.4h\n"
+ "smlal v19.4s, v28.4h, v5.4h\n"
+ "smlal2 v11.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x19, x15]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v24.4s, v31.8h, v3.8h\n"
+ "smlal2 v23.4s, v30.8h, v5.8h\n"
+ "add x15, x15, #0x8\n"
+ "smlal v18.4s, v29.4h, v7.4h\n"
+ "smlal v9.4s, v29.4h, v6.4h\n"
+ "smlal2 v24.4s, v29.8h, v7.8h\n"
+ "smlal2 v23.4s, v29.8h, v6.8h\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "usubl v28.8h, v28.8b, v14.8b\n"
- "smlal2 v19.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v3.8h\n"
- "smlal v17.4s, v30.4h, v8.4h\n"
- "smlal2 v25.4s, v30.8h, v8.8h\n"
- "smlal v23.4s, v30.4h, v5.4h\n"
- "smlal2 v20.4s, v30.8h, v5.8h\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v21.4s, v29.8h, v7.8h\n"
- "smlal v23.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v29.8h, v6.8h\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v21.4s, v28.8h, v8.8h\n"
- "smlal v23.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "sqrdmulh v13.4s, v13.4s, v26.4s\n"
- "sqrdmulh v19.4s, v19.4s, v11.4s\n"
- "sqrdmulh v17.4s, v17.4s, v26.4s\n"
- "sqrdmulh v25.4s, v25.4s, v11.4s\n"
- "and v22.16b, v13.16b, v10.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v28.16b, v19.16b, v18.16b\n"
- "and v3.16b, v17.16b, v10.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v6.16b, v25.16b, v18.16b\n"
- "sqrdmulh v16.4s, v16.4s, v26.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqrdmulh v21.4s, v21.4s, v11.4s\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v26.4s\n"
- "and v0.16b, v16.16b, v10.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v13.4s, v13.4s, v10.4s\n"
- "sqadd v19.4s, v19.4s, v28.4s\n"
- "sqadd v17.4s, v17.4s, v3.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "and v29.16b, v21.16b, v18.16b\n"
+ "smlal v19.4s, v30.4h, v8.4h\n"
+ "sqdmulh v13.4s, v13.4s, v21.4s\n"
+ "smlal v18.4s, v28.4h, v8.4h\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "sqdmulh v19.4s, v19.4s, v21.4s\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "smlal2 v11.4s, v30.8h, v8.8h\n"
+ "sqdmulh v18.4s, v18.4s, v21.4s\n"
+ "smlal2 v24.4s, v28.8h, v8.8h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
+ "sqdmulh v9.4s, v9.4s, v21.4s\n"
+ "and v7.16b, v13.16b, v25.16b\n"
+ "sqdmulh v26.4s, v26.4s, v10.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqdmulh v11.4s, v11.4s, v10.4s\n"
+ "and v21.16b, v18.16b, v25.16b\n"
+ "sqdmulh v24.4s, v24.4s, v10.4s\n"
+ "and v20.16b, v9.16b, v25.16b\n"
+ "sqdmulh v23.4s, v23.4s, v10.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v29.16b, v26.16b, v16.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v10.16b, v11.16b, v16.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v31.16b, v24.16b, v16.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v23.16b, v16.16b\n"
+ "sqadd v13.4s, v13.4s, v7.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "add v13.4s, v13.4s, v15.4s\n"
- "srshl v19.4s, v19.4s, v18.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "srshl v25.4s, v25.4s, v18.4s\n"
- "smin v13.4s, v13.4s, v12.4s\n"
- "add v19.4s, v19.4s, v15.4s\n"
- "add v17.4s, v17.4s, v15.4s\n"
- "smax v13.4s, v13.4s, v24.4s\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "smin v17.4s, v17.4s, v12.4s\n"
- "add v25.4s, v25.4s, v15.4s\n"
- "smax v19.4s, v19.4s, v24.4s\n"
- "smax v17.4s, v17.4s, v24.4s\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "uzp1 v13.16b, v13.16b, v19.16b\n"
- "sqadd v16.4s, v16.4s, v0.4s\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v25.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v25.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "srshl v9.4s, v9.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "srshl v26.4s, v26.4s, v16.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "srshl v24.4s, v24.4s, v16.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "srshl v23.4s, v23.4s, v16.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "sqxtn2 v13.8h, v26.4s\n"
+ "sqxtn2 v19.8h, v11.4s\n"
+ "sqxtn2 v18.8h, v24.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v19.8h, v19.8h, v14.8h\n"
+ "sqadd v18.8h, v18.8h, v14.8h\n"
+ "sqadd v9.8h, v9.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smax v18.8h, v18.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smin v13.8h, v13.8h, v15.8h\n"
+ "smin v19.8h, v19.8h, v15.8h\n"
+ "smin v18.8h, v18.8h, v15.8h\n"
+ "smin v9.8h, v9.8h, v15.8h\n"
"uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x10, x15]\n"
- "smax v25.4s, v25.4s, v24.4s\n"
- "sqadd v21.4s, v21.4s, v29.4s\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "and v3.16b, v23.16b, v10.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "uzp1 v17.16b, v17.16b, v25.16b\n"
- "add v16.4s, v16.4s, v15.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d17, [x9, x15]\n"
- "smin v16.4s, v16.4s, v12.4s\n"
- "sqrdmulh v20.4s, v20.4s, v11.4s\n"
- "add v21.4s, v21.4s, v15.4s\n"
- "sqadd v23.4s, v23.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v24.4s\n"
- "smin v21.4s, v21.4s, v12.4s\n"
- "and v25.16b, v20.16b, v18.16b\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "smax v21.4s, v21.4s, v24.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "uzp1 v16.16b, v16.16b, v21.16b\n"
- "add v23.4s, v23.4s, v15.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d16, [x28, x15]\n"
- "smin v23.4s, v23.4s, v12.4s\n"
- "sqadd v20.4s, v20.4s, v25.4s\n"
- "smax v23.4s, v23.4s, v24.4s\n"
- "srshl v20.4s, v20.4s, v18.4s\n"
- "add v20.4s, v20.4s, v15.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v24.4s\n"
- "uzp1 v23.16b, v23.16b, v20.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d23, [x27, x15]\n"
- "add x15, x15, #0x8\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d13, [x10, x14]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d19, [x9, x14]\n"
+ "str d18, [x28, x14]\n"
+ "str d9, [x27, x14]\n"
+ "add x14, x14, #0x8\n"
"beq 64f\n"
- "add x16, x16, #0x48\n"
+ "add x17, x17, #0x48\n"
"3:" // Oddments
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x8, #2, 5f\n"
"ld1 { v13.4s }, [x19], #0x10\n"
"tbz x8, #1, 4f\n"
- "ld1 { v19.d }[0], [x19], #0x8\n"
+ "ld1 { v26.d }[0], [x19], #0x8\n"
"tbz x8, #0, 7f\n"
- "ld1 { v19.s }[2], [x19]\n"
+ "ld1 { v26.s }[2], [x19]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
"tbz x8, #0, 7f\n"
- "ld1 { v19.s }[0], [x19]\n"
+ "ld1 { v26.s }[0], [x19]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x8, #1, 6f\n"
@@ -609,38 +593,38 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 7f\n"
"ld1 { v13.s }[0], [x19]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "mov v17.16b, v13.16b\n"
- "ldr d0, [x16, #0x0]\n"
- "mov v25.16b, v19.16b\n"
- "ldr d1, [x16, #0x8]\n"
- "mov v16.16b, v13.16b\n"
- "ldr d2, [x16, #0x10]\n"
- "mov v21.16b, v19.16b\n"
- "ldr d3, [x16, #0x18]\n"
- "mov v23.16b, v13.16b\n"
- "ldr d4, [x16, #0x20]\n"
- "usubl v0.8h, v0.8b, v9.8b\n"
- "mov v20.16b, v19.16b\n"
- "ldr d5, [x16, #0x28]\n"
- "usubl v1.8h, v1.8b, v9.8b\n"
- "ldr d6, [x16, #0x30]\n"
- "usubl v2.8h, v2.8b, v9.8b\n"
- "ldr d7, [x16, #0x38]\n"
- "usubl v3.8h, v3.8b, v9.8b\n"
- "ldr d8, [x16, #0x40]\n"
- "usubl v4.8h, v4.8b, v9.8b\n"
- "ldp x23, x22, [x14, #0x0]\n"
- "usubl v5.8h, v5.8b, v9.8b\n"
- "ldp x21, x20, [x14, #0x10]\n"
- "usubl v6.8h, v6.8b, v9.8b\n"
- "usubl v7.8h, v7.8b, v9.8b\n"
- "ldr x19, [x14, #0x20]\n"
- "usubl v8.8h, v8.8b, v9.8b\n"
- "add x23, x23, x17\n"
- "add x22, x22, x17\n"
- "add x21, x21, x17\n"
- "add x20, x20, x17\n"
- "add x19, x19, x17\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v11.16b, v26.16b\n"
+ "ldr d2, [x17, #0x10]\n"
+ "ldr d3, [x17, #0x18]\n"
+ "mov v18.16b, v13.16b\n"
+ "mov v24.16b, v26.16b\n"
+ "ldr d4, [x17, #0x20]\n"
+ "ldr d5, [x17, #0x28]\n"
+ "mov v9.16b, v13.16b\n"
+ "mov v23.16b, v26.16b\n"
+ "ldr d6, [x17, #0x30]\n"
+ "ldr d7, [x17, #0x38]\n"
+ "usubl v0.8h, v0.8b, v12.8b\n"
+ "usubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "usubl v2.8h, v2.8b, v12.8b\n"
+ "usubl v3.8h, v3.8b, v12.8b\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "ldr x19, [x12, #0x20]\n"
+ "usubl v4.8h, v4.8b, v12.8b\n"
+ "usubl v5.8h, v5.8b, v12.8b\n"
+ "usubl v6.8h, v6.8b, v12.8b\n"
+ "usubl v7.8h, v7.8b, v12.8b\n"
+ "usubl v8.8h, v8.8b, v12.8b\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "add x19, x19, x15\n"
"tbz x8, #2, 9f\n"
"ld1 { v31.s }[0], [x23], #0x4\n"
"ld1 { v30.s }[0], [x22], #0x4\n"
@@ -690,33 +674,33 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"ld1 { v28.b }[0], [x20]\n"
"ld1 { v27.b }[0], [x19]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ldr x21, [x14, #0x28]\n"
- "usubl v31.8h, v31.8b, v14.8b\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
"smlal v13.4s, v31.4h, v4.4h\n"
- "usubl v30.8h, v30.8b, v14.8b\n"
- "smlal2 v19.4s, v31.8h, v4.8h\n"
- "usubl v29.8h, v29.8b, v14.8b\n"
- "smlal v17.4s, v31.4h, v3.4h\n"
- "usubl v28.8h, v28.8b, v14.8b\n"
- "smlal2 v25.4s, v31.8h, v3.8h\n"
- "usubl v27.8h, v27.8b, v14.8b\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "add x21, x21, x17\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "smlal v23.4s, v31.4h, v0.4h\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
+ "smlal2 v26.4s, v31.8h, v4.8h\n"
+ "ldr x21, [x12, #0x28]\n"
+ "smlal v19.4s, v31.4h, v3.4h\n"
+ "smlal2 v11.4s, v31.8h, v3.8h\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "add x21, x21, x15\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v24.4s, v31.8h, v1.8h\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "smlal2 v23.4s, v31.8h, v0.8h\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
"smlal v13.4s, v30.4h, v0.4h\n"
- "smlal2 v19.4s, v30.8h, v0.8h\n"
- "smlal v17.4s, v29.4h, v2.4h\n"
- "smlal2 v25.4s, v29.8h, v2.8h\n"
+ "smlal2 v26.4s, v30.8h, v0.8h\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "smlal v19.4s, v29.4h, v2.4h\n"
+ "smlal2 v11.4s, v29.8h, v2.8h\n"
"smlal v13.4s, v28.4h, v5.4h\n"
- "smlal2 v19.4s, v28.8h, v5.8h\n"
- "smlal v17.4s, v28.4h, v4.4h\n"
- "smlal2 v25.4s, v28.8h, v4.8h\n"
- "smlal v16.4s, v28.4h, v2.4h\n"
- "smlal2 v21.4s, v28.8h, v2.8h\n"
- "smlal v23.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
+ "smlal2 v26.4s, v28.8h, v5.8h\n"
+ "smlal v19.4s, v28.4h, v4.4h\n"
+ "smlal2 v11.4s, v28.8h, v4.8h\n"
+ "smlal v18.4s, v28.4h, v2.4h\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
"tbz x8, #2, 13f\n"
"ld1 { v31.s }[0], [x21], #0x4\n"
"tbz x8, #1, 12f\n"
@@ -738,19 +722,19 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 15f\n"
"ld1 { v31.b }[0], [x21]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v6.4h\n"
+ "smlal2 v24.4s, v31.8h, v6.8h\n"
+ "ldr x20, [x12, #0x30]\n"
"smlal v13.4s, v27.4h, v7.4h\n"
- "ldr x20, [x14, #0x30]\n"
- "usubl v31.8h, v31.8b, v14.8b\n"
- "smlal2 v19.4s, v27.8h, v7.8h\n"
- "smlal v17.4s, v27.4h, v6.4h\n"
- "add x20, x20, x17\n"
- "smlal2 v25.4s, v27.8h, v6.8h\n"
- "smlal v23.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "smlal v16.4s, v31.4h, v6.4h\n"
- "smlal2 v21.4s, v31.8h, v6.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
+ "smlal2 v26.4s, v27.8h, v7.8h\n"
+ "add x20, x20, x15\n"
+ "smlal v19.4s, v27.4h, v6.4h\n"
+ "smlal2 v11.4s, v27.8h, v6.8h\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal2 v24.4s, v27.8h, v4.8h\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
"tbz x8, #2, 17f\n"
"ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x8, #1, 16f\n"
@@ -772,11 +756,11 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 19f\n"
"ld1 { v29.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "ldr x26, [x14, #0x38]\n"
- "usubl v29.8h, v29.8b, v14.8b\n"
- "smlal v23.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "add x26, x26, x17\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr x26, [x12, #0x38]\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "add x26, x26, x15\n"
"tbz x8, #2, 21f\n"
"ld1 { v28.s }[0], [x26], #0x4\n"
"tbz x8, #1, 20f\n"
@@ -798,13 +782,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 23f\n"
"ld1 { v28.b }[0], [x26]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "ldr x25, [x14, #0x40]\n"
- "usubl v28.8h, v28.8b, v14.8b\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "ldr x25, [x12, #0x40]\n"
"smlal v13.4s, v28.4h, v1.4h\n"
- "smlal2 v19.4s, v28.8h, v1.8h\n"
- "add x25, x25, x17\n"
- "smlal v17.4s, v28.4h, v0.4h\n"
- "smlal2 v25.4s, v28.8h, v0.8h\n"
+ "smlal2 v26.4s, v28.8h, v1.8h\n"
+ "smlal v19.4s, v28.4h, v0.4h\n"
+ "smlal2 v11.4s, v28.8h, v0.8h\n"
+ "add x25, x25, x15\n"
"tbz x8, #2, 25f\n"
"ld1 { v31.s }[0], [x25], #0x4\n"
"tbz x8, #1, 24f\n"
@@ -826,13 +810,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 27f\n"
"ld1 { v31.b }[0], [x25]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "ldr x19, [x14, #0x48]\n"
- "usubl v31.8h, v31.8b, v14.8b\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "ldr x19, [x12, #0x48]\n"
"smlal v13.4s, v31.4h, v2.4h\n"
- "smlal2 v19.4s, v31.8h, v2.8h\n"
- "add x19, x19, x17\n"
- "smlal v17.4s, v31.4h, v1.4h\n"
- "smlal2 v25.4s, v31.8h, v1.8h\n"
+ "smlal2 v26.4s, v31.8h, v2.8h\n"
+ "smlal v19.4s, v31.4h, v1.4h\n"
+ "smlal2 v11.4s, v31.8h, v1.8h\n"
+ "add x19, x19, x15\n"
"tbz x8, #2, 29f\n"
"ld1 { v30.s }[0], [x19], #0x4\n"
"tbz x8, #1, 28f\n"
@@ -854,17 +838,17 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 31f\n"
"ld1 { v30.b }[0], [x19]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr x24, [x14, #0x50]\n"
- "usubl v30.8h, v30.8b, v14.8b\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "ldr x24, [x12, #0x50]\n"
"smlal v13.4s, v30.4h, v8.4h\n"
- "smlal2 v19.4s, v30.8h, v8.8h\n"
- "add x24, x24, x17\n"
- "smlal v17.4s, v30.4h, v7.4h\n"
- "smlal2 v25.4s, v30.8h, v7.8h\n"
- "smlal v16.4s, v30.4h, v5.4h\n"
- "smlal2 v21.4s, v30.8h, v5.8h\n"
- "smlal v23.4s, v30.4h, v4.4h\n"
- "smlal2 v20.4s, v30.8h, v4.8h\n"
+ "smlal2 v26.4s, v30.8h, v8.8h\n"
+ "smlal v19.4s, v30.4h, v7.4h\n"
+ "smlal2 v11.4s, v30.8h, v7.8h\n"
+ "add x24, x24, x15\n"
+ "smlal v18.4s, v30.4h, v5.4h\n"
+ "smlal2 v24.4s, v30.8h, v5.8h\n"
+ "smlal v9.4s, v30.4h, v4.4h\n"
+ "smlal2 v23.4s, v30.8h, v4.8h\n"
"tbz x8, #2, 33f\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
"tbz x8, #1, 32f\n"
@@ -886,13 +870,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 35f\n"
"ld1 { v29.b }[0], [x24]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "ldr x23, [x14, #0x58]\n"
- "usubl v29.8h, v29.8b, v14.8b\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr x23, [x12, #0x58]\n"
"smlal v13.4s, v29.4h, v3.4h\n"
- "smlal2 v19.4s, v29.8h, v3.8h\n"
- "add x23, x23, x17\n"
- "smlal v16.4s, v29.4h, v0.4h\n"
- "smlal2 v21.4s, v29.8h, v0.8h\n"
+ "smlal2 v26.4s, v29.8h, v3.8h\n"
+ "smlal v18.4s, v29.4h, v0.4h\n"
+ "smlal2 v24.4s, v29.8h, v0.8h\n"
+ "add x23, x23, x15\n"
"tbz x8, #2, 37f\n"
"ld1 { v28.s }[0], [x23], #0x4\n"
"tbz x8, #1, 36f\n"
@@ -914,13 +898,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 39f\n"
"ld1 { v28.b }[0], [x23]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "ldr x22, [x14, #0x60]\n"
- "usubl v28.8h, v28.8b, v14.8b\n"
- "smlal v17.4s, v28.4h, v5.4h\n"
- "smlal2 v25.4s, v28.8h, v5.8h\n"
- "add x22, x22, x17\n"
- "smlal v23.4s, v28.4h, v2.4h\n"
- "smlal2 v20.4s, v28.8h, v2.8h\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "ldr x22, [x12, #0x60]\n"
+ "smlal v19.4s, v28.4h, v5.4h\n"
+ "smlal2 v11.4s, v28.8h, v5.8h\n"
+ "smlal v9.4s, v28.4h, v2.4h\n"
+ "smlal2 v23.4s, v28.8h, v2.8h\n"
+ "add x22, x22, x15\n"
"tbz x8, #2, 41f\n"
"ld1 { v31.s }[0], [x22], #0x4\n"
"tbz x8, #1, 40f\n"
@@ -942,13 +926,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 43f\n"
"ld1 { v31.b }[0], [x22]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "ldr x21, [x14, #0x68]\n"
- "usubl v31.8h, v31.8b, v14.8b\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "ldr x21, [x12, #0x68]\n"
"smlal v13.4s, v31.4h, v6.4h\n"
- "smlal2 v19.4s, v31.8h, v6.8h\n"
- "add x21, x21, x17\n"
- "smlal v16.4s, v31.4h, v3.4h\n"
- "smlal2 v21.4s, v31.8h, v3.8h\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "smlal v18.4s, v31.4h, v3.4h\n"
+ "smlal2 v24.4s, v31.8h, v3.8h\n"
+ "add x21, x21, x15\n"
"tbz x8, #2, 45f\n"
"ld1 { v30.s }[0], [x21], #0x4\n"
"tbz x8, #1, 44f\n"
@@ -970,13 +954,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 47f\n"
"ld1 { v30.b }[0], [x21]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ldr x20, [x14, #0x70]\n"
- "usubl v30.8h, v30.8b, v14.8b\n"
- "smlal v17.4s, v30.4h, v8.4h\n"
- "smlal2 v25.4s, v30.8h, v8.8h\n"
- "add x20, x20, x17\n"
- "smlal v23.4s, v30.4h, v5.4h\n"
- "smlal2 v20.4s, v30.8h, v5.8h\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v19.4s, v30.4h, v8.4h\n"
+ "smlal2 v11.4s, v30.8h, v8.8h\n"
+ "smlal v9.4s, v30.4h, v5.4h\n"
+ "smlal2 v23.4s, v30.8h, v5.8h\n"
+ "add x20, x20, x15\n"
"tbz x8, #2, 49f\n"
"ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x8, #1, 48f\n"
@@ -998,13 +982,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 51f\n"
"ld1 { v29.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr x19, [x14, #0x78]\n"
- "usubl v29.8h, v29.8b, v14.8b\n"
- "smlal v16.4s, v29.4h, v7.4h\n"
- "smlal2 v21.4s, v29.8h, v7.8h\n"
- "add x19, x19, x17\n"
- "smlal v23.4s, v29.4h, v6.4h\n"
- "smlal2 v20.4s, v29.8h, v6.8h\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr x19, [x12, #0x78]\n"
+ "smlal v18.4s, v29.4h, v7.4h\n"
+ "smlal2 v24.4s, v29.8h, v7.8h\n"
+ "smlal v9.4s, v29.4h, v6.4h\n"
+ "smlal2 v23.4s, v29.8h, v6.8h\n"
+ "add x19, x19, x15\n"
"tbz x8, #2, 53f\n"
"ld1 { v28.s }[0], [x19], #0x4\n"
"tbz x8, #1, 52f\n"
@@ -1026,160 +1010,150 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 55f\n"
"ld1 { v28.b }[0], [x19]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v28.8h, v28.8b, v14.8b\n"
- "smlal v16.4s, v28.4h, v8.4h\n"
- "smlal2 v21.4s, v28.8h, v8.8h\n"
- "smlal v23.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v18.4s, v28.4h, v8.4h\n"
+ "smlal2 v24.4s, v28.8h, v8.8h\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
"tbz x8, #2, 57f\n"
- "ld1 { v26.4s }, [x13], #0x10\n"
- "ld1 { v10.4s }, [x11], #0x10\n"
+ "ld1 { v21.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x11], #0x10\n"
"tbz x8, #1, 56f\n"
- "ld1 { v11.d }[0], [x13], #0x8\n"
- "ld1 { v18.d }[0], [x11], #0x8\n"
+ "ld1 { v10.d }[0], [x13], #0x8\n"
+ "ld1 { v16.d }[0], [x11], #0x8\n"
"tbz x8, #0, 59f\n"
- "ld1 { v11.s }[2], [x13]\n"
- "ld1 { v18.s }[2], [x11]\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v16.s }[2], [x11]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x8, #0, 59f\n"
- "ld1 { v11.s }[0], [x13]\n"
- "ld1 { v18.s }[0], [x11]\n"
+ "ld1 { v10.s }[0], [x13]\n"
+ "ld1 { v16.s }[0], [x11]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
"tbz x8, #1, 58f\n"
- "ld1 { v26.d }[0], [x13], #0x8\n"
- "ld1 { v10.d }[0], [x11], #0x8\n"
+ "ld1 { v21.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x11], #0x8\n"
"tbz x8, #0, 59f\n"
- "ld1 { v26.s }[2], [x13]\n"
- "ld1 { v10.s }[2], [x11]\n"
+ "ld1 { v21.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x11]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x8, #0, 59f\n"
- "ld1 { v26.s }[0], [x13]\n"
- "ld1 { v10.s }[0], [x11]\n"
+ "ld1 { v21.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x11]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v26.4s\n"
- "add x10, x10, x15\n"
- "sqrdmulh v19.4s, v19.4s, v11.4s\n"
- "add x9, x9, x15\n"
- "sqrdmulh v17.4s, v17.4s, v26.4s\n"
- "add x28, x28, x15\n"
- "sqrdmulh v25.4s, v25.4s, v11.4s\n"
- "add x27, x27, x15\n"
- "sqrdmulh v16.4s, v16.4s, v26.4s\n"
- "and v22.16b, v13.16b, v10.16b\n"
- "sshr v22.4s, v22.4s, #0x1f\n"
- "and v28.16b, v19.16b, v18.16b\n"
- "and v3.16b, v17.16b, v10.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "and v6.16b, v25.16b, v18.16b\n"
- "and v0.16b, v16.16b, v10.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqrdmulh v21.4s, v21.4s, v11.4s\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v22.4s\n"
- "sqrdmulh v23.4s, v23.4s, v26.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v20.4s, v20.4s, v11.4s\n"
- "sqadd v19.4s, v19.4s, v28.4s\n"
- "sqadd v17.4s, v17.4s, v3.4s\n"
- "srshl v13.4s, v13.4s, v10.4s\n"
- "sqadd v25.4s, v25.4s, v6.4s\n"
- "srshl v19.4s, v19.4s, v18.4s\n"
- "srshl v17.4s, v17.4s, v10.4s\n"
- "add v13.4s, v13.4s, v15.4s\n"
- "srshl v25.4s, v25.4s, v18.4s\n"
- "add v19.4s, v19.4s, v15.4s\n"
- "smin v13.4s, v13.4s, v12.4s\n"
- "add v17.4s, v17.4s, v15.4s\n"
- "smin v19.4s, v19.4s, v12.4s\n"
- "smax v13.4s, v13.4s, v24.4s\n"
- "smin v17.4s, v17.4s, v12.4s\n"
- "smax v19.4s, v19.4s, v24.4s\n"
- "add v25.4s, v25.4s, v15.4s\n"
- "smax v17.4s, v17.4s, v24.4s\n"
- "uzp1 v13.16b, v13.16b, v19.16b\n"
- "smin v25.4s, v25.4s, v12.4s\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "sqadd v16.4s, v16.4s, v0.4s\n"
- "smax v25.4s, v25.4s, v24.4s\n"
- "and v29.16b, v21.16b, v18.16b\n"
+ "sqdmulh v13.4s, v13.4s, v21.4s\n"
+ "sqdmulh v19.4s, v19.4s, v21.4s\n"
+ "add x10, x10, x14\n"
+ "add x9, x9, x14\n"
+ "sqdmulh v18.4s, v18.4s, v21.4s\n"
+ "sqdmulh v9.4s, v9.4s, v21.4s\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "and v7.16b, v13.16b, v25.16b\n"
+ "sqdmulh v26.4s, v26.4s, v10.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqdmulh v11.4s, v11.4s, v10.4s\n"
+ "and v21.16b, v18.16b, v25.16b\n"
+ "sqdmulh v24.4s, v24.4s, v10.4s\n"
+ "and v20.16b, v9.16b, v25.16b\n"
+ "sqdmulh v23.4s, v23.4s, v10.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v29.16b, v26.16b, v16.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v10.16b, v11.16b, v16.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v31.16b, v24.16b, v16.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v23.16b, v16.16b\n"
+ "sqadd v13.4s, v13.4s, v7.4s\n"
"sshr v29.4s, v29.4s, #0x1f\n"
- "uzp1 v17.16b, v17.16b, v25.16b\n"
- "srshl v16.4s, v16.4s, v10.4s\n"
- "and v3.16b, v23.16b, v10.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "add v16.4s, v16.4s, v15.4s\n"
- "sqadd v21.4s, v21.4s, v29.4s\n"
- "and v25.16b, v20.16b, v18.16b\n"
- "sshr v25.4s, v25.4s, #0x1f\n"
- "smin v16.4s, v16.4s, v12.4s\n"
- "srshl v21.4s, v21.4s, v18.4s\n"
- "sqadd v23.4s, v23.4s, v3.4s\n"
- "smax v16.4s, v16.4s, v24.4s\n"
- "add v21.4s, v21.4s, v15.4s\n"
- "srshl v23.4s, v23.4s, v10.4s\n"
- "sqadd v20.4s, v20.4s, v25.4s\n"
- "smin v21.4s, v21.4s, v12.4s\n"
- "add v23.4s, v23.4s, v15.4s\n"
- "srshl v20.4s, v20.4s, v18.4s\n"
- "smax v21.4s, v21.4s, v24.4s\n"
- "smin v23.4s, v23.4s, v12.4s\n"
- "uzp1 v16.16b, v16.16b, v21.16b\n"
- "add v20.4s, v20.4s, v15.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "smax v23.4s, v23.4s, v24.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v24.4s\n"
- "uzp1 v23.16b, v23.16b, v20.16b\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v25.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v25.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "srshl v9.4s, v9.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "srshl v26.4s, v26.4s, v16.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "srshl v24.4s, v24.4s, v16.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "srshl v23.4s, v23.4s, v16.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "sqxtn2 v13.8h, v26.4s\n"
+ "sqxtn2 v19.8h, v11.4s\n"
+ "sqxtn2 v18.8h, v24.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v19.8h, v19.8h, v14.8h\n"
+ "sqadd v18.8h, v18.8h, v14.8h\n"
+ "sqadd v9.8h, v9.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smax v18.8h, v18.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smin v13.8h, v13.8h, v15.8h\n"
+ "smin v19.8h, v19.8h, v15.8h\n"
+ "smin v18.8h, v18.8h, v15.8h\n"
+ "smin v9.8h, v9.8h, v15.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
"tbz x8, #2, 61f\n"
"st1 { v13.s }[0], [x10], #0x4\n"
- "st1 { v17.s }[0], [x9], #0x4\n"
- "st1 { v16.s }[0], [x28], #0x4\n"
- "st1 { v23.s }[0], [x27], #0x4\n"
+ "st1 { v19.s }[0], [x9], #0x4\n"
+ "st1 { v18.s }[0], [x28], #0x4\n"
+ "st1 { v9.s }[0], [x27], #0x4\n"
"tbz x8, #1, 60f\n"
"st1 { v13.h }[2], [x10], #0x2\n"
- "st1 { v17.h }[2], [x9], #0x2\n"
- "st1 { v16.h }[2], [x28], #0x2\n"
- "st1 { v23.h }[2], [x27], #0x2\n"
+ "st1 { v19.h }[2], [x9], #0x2\n"
+ "st1 { v18.h }[2], [x28], #0x2\n"
+ "st1 { v9.h }[2], [x27], #0x2\n"
"tbz x8, #0, 63f\n"
"st1 { v13.b }[6], [x10], #0x1\n"
- "st1 { v17.b }[6], [x9], #0x1\n"
- "st1 { v16.b }[6], [x28], #0x1\n"
- "st1 { v23.b }[6], [x27], #0x1\n"
+ "st1 { v19.b }[6], [x9], #0x1\n"
+ "st1 { v18.b }[6], [x28], #0x1\n"
+ "st1 { v9.b }[6], [x27], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
"tbz x8, #0, 63f\n"
"st1 { v13.b }[4], [x10], #0x1\n"
- "st1 { v17.b }[4], [x9], #0x1\n"
- "st1 { v16.b }[4], [x28], #0x1\n"
- "st1 { v23.b }[4], [x27], #0x1\n"
+ "st1 { v19.b }[4], [x9], #0x1\n"
+ "st1 { v18.b }[4], [x28], #0x1\n"
+ "st1 { v9.b }[4], [x27], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
"tbz x8, #1, 62f\n"
"st1 { v13.h }[0], [x10], #0x2\n"
- "st1 { v17.h }[0], [x9], #0x2\n"
- "st1 { v16.h }[0], [x28], #0x2\n"
- "st1 { v23.h }[0], [x27], #0x2\n"
+ "st1 { v19.h }[0], [x9], #0x2\n"
+ "st1 { v18.h }[0], [x28], #0x2\n"
+ "st1 { v9.h }[0], [x27], #0x2\n"
"tbz x8, #0, 63f\n"
"st1 { v13.b }[2], [x10], #0x1\n"
- "st1 { v17.b }[2], [x9], #0x1\n"
- "st1 { v16.b }[2], [x28], #0x1\n"
- "st1 { v23.b }[2], [x27], #0x1\n"
+ "st1 { v19.b }[2], [x9], #0x1\n"
+ "st1 { v18.b }[2], [x28], #0x1\n"
+ "st1 { v9.b }[2], [x27], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x8, #0, 63f\n"
"st1 { v13.b }[0], [x10], #0x1\n"
- "st1 { v17.b }[0], [x9], #0x1\n"
- "st1 { v16.b }[0], [x28], #0x1\n"
- "st1 { v23.b }[0], [x27], #0x1\n"
+ "st1 { v19.b }[0], [x9], #0x1\n"
+ "st1 { v18.b }[0], [x28], #0x1\n"
+ "st1 { v9.b }[0], [x27], #0x1\n"
"63:" // Oddments: Bit 2: End
-
"64:" // End
-
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 44817dbccf..00d1c5e868 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,37 +36,24 @@ namespace depthwise {
void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
-struct a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+class a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_mla::get_packed_size;
+ a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
- kern_type kernel = a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
- a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
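
The interface rewrite in this header follows the pattern applied across the patch: the old plain trait struct, which carried typedefs for the bias/input/weight/return types, static constexpr geometry, and pack_parameters/get_packed_size function pointers, becomes a class deriving from DepthwiseDepthfirstStrategy. The output, kernel and stride geometry moves into the parent constructor call, and the remaining per-kernel details are exposed through virtual accessors (get_vl_type, get_kernel, get_accumulator_depth_vl). A minimal self-contained sketch of this refactor pattern, using illustrative names rather than the library's own declarations:

    // Illustrative sketch only; every name here is hypothetical and
    // stands in for the library's own declarations.
    using KernelFn = void (*)(unsigned int /* n_channels, ... */);

    class StrategyBase
    {
        unsigned int m_out_rows, m_out_cols;
        unsigned int m_kern_rows, m_kern_cols;
        unsigned int m_stride_rows, m_stride_cols;

    public:
        // Geometry is stored once here instead of being repeated as
        // static constexpr members in every kernel header.
        StrategyBase(unsigned int out_rows, unsigned int out_cols,
                     unsigned int kern_rows, unsigned int kern_cols,
                     unsigned int stride_rows, unsigned int stride_cols)
        : m_out_rows(out_rows), m_out_cols(out_cols),
          m_kern_rows(kern_rows), m_kern_cols(kern_cols),
          m_stride_rows(stride_rows), m_stride_cols(stride_cols) {}

        virtual ~StrategyBase() = default;

        unsigned int output_rows() const { return m_out_rows; }
        unsigned int output_cols() const { return m_out_cols; }

        // Per-kernel details become virtual accessors.
        virtual KernelFn get_kernel() const = 0;
        virtual unsigned int get_accumulator_depth_vl() const { return 1; }
    };

    // Stand-in for the hand-written assembly entry point.
    void example_kernel_impl(unsigned int) {}

    class ExampleStrategy : public StrategyBase
    {
    public:
        // 2x2 output, 3x3 kernel, stride 2 -- mirroring the header above.
        ExampleStrategy() : StrategyBase(2, 2, 3, 3, 2, 2) {}

        KernelFn get_kernel() const override { return example_kernel_impl; }
        unsigned int get_accumulator_depth_vl() const override { return 2; }
    };

Holding strategies behind a base-class pointer lets a single driver loop select and invoke any kernel at run time, rather than instantiating a separate driver per trait struct.
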
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 8d22836e5b..872f665b40 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const uint8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const uint8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -100,75 +100,75 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
- "mov x5, #0x0\n"
- "ldr x6, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x7, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "add x8, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "lsr x16, x4, #0x3\n"
- "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v12.16b }, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1r { v11.4s }, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v19.4s }, [x20]\n"
- "ld1r { v14.4s }, [x19]\n"
- "ldp x14, x13, [x21, #0x0]\n"
- "ldp x12, x11, [x21, #0x10]\n"
+ "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "add x24, x19, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x19, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x19, %[offsetof_Requantize32_minval]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x19, x19, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.16b }, [x24]\n"
+ "ld1r { v13.16b }, [x23]\n"
+ "lsr x16, x8, #0x3\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "mov x15, #0x0\n"
+ "mov x14, #0x0\n"
+ "ld1r { v14.8h }, [x19]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x10, x9, [x22, #0x0]\n"
+ "ldp x28, x27, [x22, #0x10]\n"
"cbz x16, 3f\n"
- "subs x16, x16, #0x1\n"
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
"ldr q15, [x19, #0x0]\n"
- "mov v20.16b, v15.16b\n"
+ "subs x16, x16, #0x1\n"
+ "mov v9.16b, v15.16b\n"
"ldr q10, [x19, #0x10]\n"
"add x19, x19, #0x20\n"
- "mov v16.16b, v15.16b\n"
"str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v17.16b, v15.16b\n"
- "ldr d0, [x6, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v16.16b, v10.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v21.16b, v10.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v18.16b, v10.16b\n"
"usubl v0.8h, v0.8b, v13.8b\n"
- "mov v23.16b, v10.16b\n"
- "ldr d1, [x6, #0x8]\n"
- "mov v22.16b, v10.16b\n"
- "ldr d2, [x6, #0x10]\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
"usubl v1.8h, v1.8b, v13.8b\n"
- "mov v18.16b, v10.16b\n"
- "ldr d3, [x6, #0x18]\n"
- "ldr d4, [x6, #0x20]\n"
"usubl v2.8h, v2.8b, v13.8b\n"
- "ldr d5, [x6, #0x28]\n"
+ "ldp x26, x25, [x12, #0x0]\n"
+ "ldp x24, x23, [x12, #0x10]\n"
"usubl v3.8h, v3.8b, v13.8b\n"
- "ldr d6, [x6, #0x30]\n"
- "ldr d7, [x6, #0x38]\n"
"usubl v4.8h, v4.8b, v13.8b\n"
- "ldr d8, [x6, #0x40]\n"
+ "ldp x22, x21, [x12, #0x20]\n"
+ "ldp x20, x19, [x12, #0x30]\n"
"usubl v5.8h, v5.8b, v13.8b\n"
- "ldp x26, x25, [x8, #0x0]\n"
"usubl v6.8h, v6.8b, v13.8b\n"
- "ldp x24, x23, [x8, #0x10]\n"
+ "ldr d31, [x26, x15]\n"
+ "ldr d30, [x25, x15]\n"
"usubl v7.8h, v7.8b, v13.8b\n"
"usubl v8.8h, v8.8b, v13.8b\n"
- "ldp x22, x21, [x8, #0x20]\n"
- "ldp x20, x19, [x8, #0x30]\n"
- "ldr d31, [x26, x5]\n"
+ "ldr d29, [x24, x15]\n"
+ "ldr d28, [x23, x15]\n"
"usubl v31.8h, v31.8b, v12.8b\n"
- "ldr d30, [x25, x5]\n"
- "ldr d29, [x24, x5]\n"
"usubl v30.8h, v30.8b, v12.8b\n"
- "ldr d28, [x23, x5]\n"
- "ldr d27, [x22, x5]\n"
+ "ldr d27, [x22, x15]\n"
+ "ldr d26, [x21, x15]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "ldr d26, [x21, x5]\n"
"usubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x20, x5]\n"
- "ldr d24, [x19, x5]\n"
+ "ldr d25, [x20, x15]\n"
+ "ldr d24, [x19, x15]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"usubl v26.8h, v26.8b, v12.8b\n"
"usubl v25.8h, v25.8b, v12.8b\n"
@@ -176,259 +176,251 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"beq 2f\n"
"1:" // Loop
"smlal v15.4s, v31.4h, v8.4h\n"
- "ldr x23, [x8, #0x40]\n"
- "add x6, x6, #0x48\n"
"smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x22, [x8, #0x48]\n"
- "subs x16, x16, #0x1\n"
- "smlal v20.4s, v31.4h, v6.4h\n"
- "ldr x21, [x8, #0x50]\n"
- "smlal2 v23.4s, v31.8h, v6.8h\n"
- "ldr x20, [x8, #0x58]\n"
- "smlal v16.4s, v31.4h, v2.4h\n"
- "ldr x19, [x8, #0x60]\n"
- "smlal2 v22.4s, v31.8h, v2.8h\n"
- "ldr x10, [x8, #0x68]\n"
- "smlal v17.4s, v31.4h, v0.4h\n"
- "ldr x9, [x8, #0x70]\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr x28, [x8, #0x78]\n"
+ "ldr x24, [x12, #0x40]\n"
+ "ldr x23, [x12, #0x48]\n"
+ "smlal v9.4s, v31.4h, v6.4h\n"
+ "smlal2 v16.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x12, #0x50]\n"
+ "ldr x19, [x12, #0x58]\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "ldr x27, [x8, #0x80]\n"
"smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x26, [x8, #0x88]\n"
- "smlal v20.4s, v28.4h, v1.4h\n"
- "ldr x25, [x8, #0x90]\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x5]\n"
+ "ldr x22, [x12, #0x78]\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x23, x15]\n"
"usubl v28.8h, v28.8b, v12.8b\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "ldr x24, [x8, #0x98]\n"
"smlal2 v10.4s, v29.8h, v1.8h\n"
- "ldr d29, [x23, x5]\n"
+ "ldr d29, [x24, x15]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "ldr x23, [x8, #0xa0]\n"
- "smlal2 v23.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x5]\n"
+ "smlal v9.4s, v27.4h, v2.4h\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x15]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "ldr x22, [x8, #0xa8]\n"
"smlal2 v10.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x5]\n"
+ "ldr d26, [x19, x15]\n"
"usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v9.4s, v24.4h, v0.4h\n"
+ "smlal2 v16.4s, v24.8h, v0.8h\n"
+ "ldr x21, [x12, #0x80]\n"
+ "ldr x19, [x12, #0x68]\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "ldr x21, [x8, #0xb0]\n"
"smlal2 v10.4s, v25.8h, v4.8h\n"
- "ldr d25, [x19, x5]\n"
+ "ldr d25, [x20, x15]\n"
"usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
+ "smlal2 v16.4s, v29.8h, v4.8h\n"
+ "ldr x20, [x12, #0x88]\n"
+ "ldr d29, [x19, x15]\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "ldr x20, [x8, #0xb8]\n"
"smlal2 v10.4s, v24.8h, v2.8h\n"
- "ldr x19, [x8, #0xc0]\n"
- "smlal v20.4s, v24.4h, v0.4h\n"
- "ldr q21, [x17, #0x0]\n"
- "smlal2 v23.4s, v24.8h, v0.8h\n"
- "ldr d24, [x9, x5]\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v20.4s, v29.4h, v4.4h\n"
- "ldr q30, [x15, #0x0]\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x10, x5]\n"
+ "ldr x19, [x12, #0x70]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v28.4h, v5.4h\n"
- "ldr q31, [x17, #0x10]\n"
- "smlal2 v23.4s, v28.8h, v5.8h\n"
- "ldr d28, [x27, x5]\n"
- "add x17, x17, #0x20\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v16.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x15]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v22.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ldr x24, [x12, #0x98]\n"
+ "ldr d24, [x19, x15]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "ldr q9, [x15, #0x10]\n"
- "add x15, x15, #0x20\n"
"smlal2 v10.4s, v27.8h, v5.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v20.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
- "ldr d27, [x28, x5]\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x23, [x12, #0x90]\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x22, x15]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v16.4s, v26.4h, v3.4h\n"
- "smlal2 v22.4s, v26.8h, v3.8h\n"
- "ldr d26, [x26, x5]\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal v22.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x12, #0xa8]\n"
+ "ldr x19, [x12, #0xa0]\n"
+ "smlal2 v21.4s, v26.8h, v3.8h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr d26, [x20, x15]\n"
"usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v23.4s, v27.4h, v4.4h\n"
+ "smlal v22.4s, v25.4h, v0.4h\n"
+ "ldr x21, [x12, #0xb0]\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x19, x15]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
"smlal v15.4s, v25.4h, v6.4h\n"
+ "ldr x19, [x12, #0xc0]\n"
+ "ldr q19, [x13, #0x0]\n"
"smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v16.4s, v25.4h, v0.4h\n"
- "smlal2 v22.4s, v25.8h, v0.8h\n"
- "ldr d25, [x25, x5]\n"
+ "smlal v22.4s, v29.4h, v4.4h\n"
+ "ldr d25, [x23, x15]\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v16.4s, v29.4h, v4.4h\n"
- "smlal2 v22.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x5]\n"
+ "smlal2 v21.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
"usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v23.4s, v26.4h, v5.4h\n"
"smlal v15.4s, v24.4h, v7.4h\n"
+ "ldr q0, [x11, #0x0]\n"
+ "ldr q4, [x13, #0x10]\n"
"smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v16.4s, v24.4h, v1.4h\n"
- "smlal2 v22.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x5]\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "ldr d27, [x23, x5]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v20.4s, v28.4h, v7.4h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "smlal v17.4s, v28.4h, v1.4h\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
- "smlal v16.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x5]\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v17.4s, v26.4h, v5.4h\n"
+ "smlal v22.4s, v24.4h, v1.4h\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "ldr q31, [x11, #0x10]\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x15]\n"
"smlal2 v18.4s, v26.8h, v5.8h\n"
- "ldr d26, [x21, x5]\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v20.4s, v29.4h, v8.4h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "smlal v17.4s, v29.4h, v2.4h\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "ldr d26, [x21, x15]\n"
"smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr d29, [x19, x5]\n"
- "add x5, x5, #0x8\n"
- "smlal v16.4s, v27.4h, v7.4h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v22.4s, v27.8h, v7.8h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v16.4s, v24.4h, v5.4h\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v22.4s, v25.4h, v6.4h\n"
+ "smlal v23.4s, v24.4h, v3.4h\n"
+ "and v30.16b, v15.16b, v0.16b\n"
+ "add x17, x17, #0x48\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "smlal2 v16.4s, v28.8h, v7.8h\n"
+ "sqdmulh v10.4s, v10.4s, v4.4s\n"
+ "subs x16, x16, #0x1\n"
+ "smlal2 v21.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x20, x15]\n"
"smlal2 v18.4s, v24.8h, v3.8h\n"
- "sqrdmulh v15.4s, v15.4s, v21.4s\n"
- "smlal2 v22.4s, v24.8h, v5.8h\n"
- "smlal v17.4s, v26.4h, v7.4h\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v22.4s, v27.4h, v7.4h\n"
+ "smlal v23.4s, v26.4h, v7.4h\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "add x13, x13, #0x20\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v16.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x19, x15]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal2 v21.4s, v27.8h, v7.8h\n"
"smlal2 v18.4s, v26.8h, v7.8h\n"
- "smlal v16.4s, v25.4h, v8.4h\n"
- "smlal2 v22.4s, v25.8h, v8.8h\n"
- "smlal v17.4s, v25.4h, v6.4h\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "add x15, x15, #0x8\n"
+ "smlal v22.4s, v24.4h, v5.4h\n"
+ "smlal v23.4s, v25.4h, v6.4h\n"
+ "and v28.16b, v9.16b, v0.16b\n"
+ "add x11, x11, #0x20\n"
+ "smlal2 v21.4s, v24.8h, v5.8h\n"
"smlal2 v18.4s, v25.8h, v6.8h\n"
- "and v26.16b, v15.16b, v30.16b\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "smlal v17.4s, v29.4h, v8.4h\n"
+ "sqdmulh v16.4s, v16.4s, v4.4s\n"
+ "smlal v22.4s, v25.4h, v8.4h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "sqdmulh v22.4s, v22.4s, v19.4s\n"
+ "smlal2 v21.4s, v25.8h, v8.8h\n"
"smlal2 v18.4s, v29.8h, v8.8h\n"
- "sqrdmulh v10.4s, v10.4s, v31.4s\n"
- "sqrdmulh v20.4s, v20.4s, v21.4s\n"
- "sqrdmulh v23.4s, v23.4s, v31.4s\n"
- "sqrdmulh v16.4s, v16.4s, v21.4s\n"
- "sqadd v15.4s, v15.4s, v26.4s\n"
- "and v8.16b, v10.16b, v9.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v30.4s\n"
- "and v4.16b, v20.16b, v30.16b\n"
+ "sqdmulh v23.4s, v23.4s, v19.4s\n"
+ "and v29.16b, v22.16b, v0.16b\n"
+ "sqdmulh v21.4s, v21.4s, v4.4s\n"
+ "and v20.16b, v23.16b, v0.16b\n"
+ "sqdmulh v18.4s, v18.4s, v4.4s\n"
+ "and v19.16b, v10.16b, v31.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v31.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v5.16b, v21.16b, v31.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v26.16b, v18.16b, v31.16b\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v28.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "and v2.16b, v23.16b, v9.16b\n"
- "and v1.16b, v16.16b, v30.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "add v15.4s, v15.4s, v11.4s\n"
- "sqadd v10.4s, v10.4s, v8.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "smin v15.4s, v15.4s, v14.4s\n"
- "srshl v10.4s, v10.4s, v9.4s\n"
- "sqadd v23.4s, v23.4s, v2.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
- "srshl v20.4s, v20.4s, v30.4s\n"
- "add v10.4s, v10.4s, v11.4s\n"
- "srshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v1.4s\n"
- "smin v10.4s, v10.4s, v14.4s\n"
- "add v20.4s, v20.4s, v11.4s\n"
- "add v23.4s, v23.4s, v11.4s\n"
- "smax v10.4s, v10.4s, v19.4s\n"
- "smin v20.4s, v20.4s, v14.4s\n"
- "smin v23.4s, v23.4s, v14.4s\n"
- "uzp1 v15.16b, v15.16b, v10.16b\n"
- "smax v20.4s, v20.4s, v19.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v20.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v0.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "sqadd v18.4s, v18.4s, v26.4s\n"
+ "srshl v10.4s, v10.4s, v31.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v22.4h, v22.4s\n"
+ "srshl v18.4s, v18.4s, v31.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v15.8h, v10.4s\n"
+ "sqxtn2 v9.8h, v16.4s\n"
+ "sqxtn2 v22.8h, v21.4s\n"
+ "sqxtn2 v23.8h, v18.4s\n"
+ "sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v9.8h, v9.8h, v11.8h\n"
+ "sqadd v22.8h, v22.8h, v11.8h\n"
+ "sqadd v23.8h, v23.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smax v22.8h, v22.8h, v17.8h\n"
+ "smax v23.8h, v23.8h, v17.8h\n"
+ "smin v15.8h, v15.8h, v14.8h\n"
+ "smin v9.8h, v9.8h, v14.8h\n"
+ "smin v22.8h, v22.8h, v14.8h\n"
+ "smin v23.8h, v23.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x14, x7]\n"
- "smax v23.4s, v23.4s, v19.4s\n"
- "srshl v16.4s, v16.4s, v30.4s\n"
- "and v24.16b, v22.16b, v9.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "uzp1 v20.16b, v20.16b, v23.16b\n"
- "add v16.4s, v16.4s, v11.4s\n"
- "sqrdmulh v17.4s, v17.4s, v21.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d20, [x13, x7]\n"
- "smin v16.4s, v16.4s, v14.4s\n"
- "sqrdmulh v18.4s, v18.4s, v31.4s\n"
- "sqadd v22.4s, v22.4s, v24.4s\n"
- "and v2.16b, v17.16b, v30.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "smax v16.4s, v16.4s, v19.4s\n"
- "srshl v22.4s, v22.4s, v9.4s\n"
- "and v31.16b, v18.16b, v9.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "add v22.4s, v22.4s, v11.4s\n"
- "sqadd v17.4s, v17.4s, v2.4s\n"
- "smin v22.4s, v22.4s, v14.4s\n"
- "srshl v17.4s, v17.4s, v30.4s\n"
- "sqadd v18.4s, v18.4s, v31.4s\n"
- "smax v22.4s, v22.4s, v19.4s\n"
- "uzp1 v16.16b, v16.16b, v22.16b\n"
- "add v17.4s, v17.4s, v11.4s\n"
- "srshl v18.4s, v18.4s, v9.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d16, [x12, x7]\n"
- "smin v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v11.4s\n"
- "smax v17.4s, v17.4s, v19.4s\n"
- "smin v18.4s, v18.4s, v14.4s\n"
- "smax v18.4s, v18.4s, v19.4s\n"
- "uzp1 v17.16b, v17.16b, v18.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d17, [x11, x7]\n"
- "add x7, x7, #0x8\n"
+ "str d15, [x10, x14]\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str d9, [x9, x14]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d22, [x28, x14]\n"
+ "str d23, [x27, x14]\n"
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
"ldr q15, [x19, #0x0]\n"
- "mov v20.16b, v15.16b\n"
+ "add x14, x14, #0x8\n"
"ldr q10, [x19, #0x10]\n"
"add x19, x19, #0x20\n"
- "mov v16.16b, v15.16b\n"
"str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v17.16b, v15.16b\n"
- "ldr d0, [x6, #0x0]\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v9.16b, v15.16b\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v22.16b, v15.16b\n"
+ "mov v21.16b, v10.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
"usubl v0.8h, v0.8b, v13.8b\n"
- "mov v23.16b, v10.16b\n"
- "ldr d1, [x6, #0x8]\n"
- "mov v22.16b, v10.16b\n"
- "ldr d2, [x6, #0x10]\n"
"usubl v1.8h, v1.8b, v13.8b\n"
- "mov v18.16b, v10.16b\n"
- "ldr d3, [x6, #0x18]\n"
- "ldr d4, [x6, #0x20]\n"
+ "ldp x26, x25, [x12, #0x0]\n"
+ "ldp x24, x23, [x12, #0x10]\n"
"usubl v2.8h, v2.8b, v13.8b\n"
- "ldr d5, [x6, #0x28]\n"
"usubl v3.8h, v3.8b, v13.8b\n"
- "ldr d6, [x6, #0x30]\n"
- "ldr d7, [x6, #0x38]\n"
+ "ldp x22, x21, [x12, #0x20]\n"
+ "ldp x20, x19, [x12, #0x30]\n"
"usubl v4.8h, v4.8b, v13.8b\n"
- "ldr d8, [x6, #0x40]\n"
"usubl v5.8h, v5.8b, v13.8b\n"
- "ldp x26, x25, [x8, #0x0]\n"
+ "ldr d31, [x26, x15]\n"
+ "ldr d30, [x25, x15]\n"
"usubl v6.8h, v6.8b, v13.8b\n"
- "ldp x24, x23, [x8, #0x10]\n"
"usubl v7.8h, v7.8b, v13.8b\n"
+ "ldr d29, [x24, x15]\n"
+ "ldr d28, [x23, x15]\n"
"usubl v8.8h, v8.8b, v13.8b\n"
- "ldp x22, x21, [x8, #0x20]\n"
- "ldp x20, x19, [x8, #0x30]\n"
- "ldr d31, [x26, x5]\n"
"usubl v31.8h, v31.8b, v12.8b\n"
- "ldr d30, [x25, x5]\n"
- "ldr d29, [x24, x5]\n"
+ "ldr d27, [x22, x15]\n"
+ "ldr d26, [x21, x15]\n"
"usubl v30.8h, v30.8b, v12.8b\n"
- "ldr d28, [x23, x5]\n"
- "ldr d27, [x22, x5]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "ldr d26, [x21, x5]\n"
+ "ldr d25, [x20, x15]\n"
+ "ldr d24, [x19, x15]\n"
"usubl v28.8h, v28.8b, v12.8b\n"
- "ldr d25, [x20, x5]\n"
- "ldr d24, [x19, x5]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"usubl v26.8h, v26.8b, v12.8b\n"
"usubl v25.8h, v25.8b, v12.8b\n"
@@ -436,275 +428,267 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"bgt 1b\n"
"2:" // Tail
"smlal v15.4s, v31.4h, v8.4h\n"
- "ldr x23, [x8, #0x40]\n"
- "tst x4, #0x7\n"
"smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x22, [x8, #0x48]\n"
- "smlal v20.4s, v31.4h, v6.4h\n"
- "ldr x21, [x8, #0x50]\n"
- "smlal2 v23.4s, v31.8h, v6.8h\n"
- "ldr x20, [x8, #0x58]\n"
- "smlal v16.4s, v31.4h, v2.4h\n"
- "ldr x19, [x8, #0x60]\n"
- "smlal2 v22.4s, v31.8h, v2.8h\n"
- "ldr x10, [x8, #0x68]\n"
- "smlal v17.4s, v31.4h, v0.4h\n"
- "ldr x9, [x8, #0x70]\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr x28, [x8, #0x78]\n"
+ "ldr x24, [x12, #0x40]\n"
+ "ldr x23, [x12, #0x48]\n"
+ "smlal v9.4s, v31.4h, v6.4h\n"
+ "smlal2 v16.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x12, #0x50]\n"
+ "ldr x19, [x12, #0x58]\n"
"smlal v15.4s, v30.4h, v0.4h\n"
- "ldr x27, [x8, #0x80]\n"
"smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x26, [x8, #0x88]\n"
- "smlal v20.4s, v28.4h, v1.4h\n"
- "ldr x25, [x8, #0x90]\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x5]\n"
+ "ldr x22, [x12, #0x78]\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x23, x15]\n"
"usubl v28.8h, v28.8b, v12.8b\n"
"smlal v15.4s, v29.4h, v1.4h\n"
- "ldr x24, [x8, #0x98]\n"
"smlal2 v10.4s, v29.8h, v1.8h\n"
- "ldr d29, [x23, x5]\n"
+ "ldr d29, [x24, x15]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "ldr x23, [x8, #0xa0]\n"
- "smlal2 v23.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x5]\n"
+ "smlal v9.4s, v27.4h, v2.4h\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x15]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
"smlal v15.4s, v26.4h, v3.4h\n"
- "ldr x22, [x8, #0xa8]\n"
"smlal2 v10.4s, v26.8h, v3.8h\n"
- "ldr d26, [x20, x5]\n"
+ "ldr d26, [x19, x15]\n"
"usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v9.4s, v24.4h, v0.4h\n"
+ "smlal2 v16.4s, v24.8h, v0.8h\n"
+ "ldr x21, [x12, #0x80]\n"
+ "ldr x19, [x12, #0x68]\n"
"smlal v15.4s, v25.4h, v4.4h\n"
- "ldr x21, [x8, #0xb0]\n"
"smlal2 v10.4s, v25.8h, v4.8h\n"
- "ldr d25, [x19, x5]\n"
+ "ldr d25, [x20, x15]\n"
"usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
+ "smlal2 v16.4s, v29.8h, v4.8h\n"
+ "ldr x20, [x12, #0x88]\n"
+ "ldr d29, [x19, x15]\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "ldr x20, [x8, #0xb8]\n"
"smlal2 v10.4s, v24.8h, v2.8h\n"
- "ldr x19, [x8, #0xc0]\n"
- "smlal v20.4s, v24.4h, v0.4h\n"
- "ldr q21, [x17, #0x0]\n"
- "smlal2 v23.4s, v24.8h, v0.8h\n"
- "ldr d24, [x9, x5]\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v20.4s, v29.4h, v4.4h\n"
- "ldr q30, [x15, #0x0]\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "ldr d29, [x10, x5]\n"
+ "ldr x19, [x12, #0x70]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v28.4h, v5.4h\n"
- "ldr q31, [x17, #0x10]\n"
- "smlal2 v23.4s, v28.8h, v5.8h\n"
- "ldr d28, [x27, x5]\n"
- "add x17, x17, #0x20\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v16.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x15]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v22.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ldr x24, [x12, #0x98]\n"
+ "ldr d24, [x19, x15]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
- "ldr q9, [x15, #0x10]\n"
- "add x15, x15, #0x20\n"
"smlal2 v10.4s, v27.8h, v5.8h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v20.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
- "ldr d27, [x28, x5]\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x23, [x12, #0x90]\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x22, x15]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v16.4s, v26.4h, v3.4h\n"
- "smlal2 v22.4s, v26.8h, v3.8h\n"
- "ldr d26, [x26, x5]\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal v22.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x12, #0xa8]\n"
+ "ldr x19, [x12, #0xa0]\n"
+ "smlal2 v21.4s, v26.8h, v3.8h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr d26, [x20, x15]\n"
"usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v23.4s, v27.4h, v4.4h\n"
+ "smlal v22.4s, v25.4h, v0.4h\n"
+ "ldr x21, [x12, #0xb0]\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x19, x15]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
"smlal v15.4s, v25.4h, v6.4h\n"
+ "ldr x19, [x12, #0xc0]\n"
+ "ldr q19, [x13, #0x0]\n"
"smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v16.4s, v25.4h, v0.4h\n"
- "smlal2 v22.4s, v25.8h, v0.8h\n"
- "ldr d25, [x25, x5]\n"
+ "smlal v22.4s, v29.4h, v4.4h\n"
+ "ldr d25, [x23, x15]\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v16.4s, v29.4h, v4.4h\n"
- "smlal2 v22.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x5]\n"
+ "smlal2 v21.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
"usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v23.4s, v26.4h, v5.4h\n"
"smlal v15.4s, v24.4h, v7.4h\n"
+ "ldr q0, [x11, #0x0]\n"
+ "ldr q4, [x13, #0x10]\n"
"smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v16.4s, v24.4h, v1.4h\n"
- "smlal2 v22.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x5]\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
- "smlal2 v18.4s, v27.8h, v4.8h\n"
- "ldr d27, [x23, x5]\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v20.4s, v28.4h, v7.4h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "smlal v17.4s, v28.4h, v1.4h\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
- "smlal v16.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x5]\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v17.4s, v26.4h, v5.4h\n"
+ "smlal v22.4s, v24.4h, v1.4h\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "ldr q31, [x11, #0x10]\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x15]\n"
"smlal2 v18.4s, v26.8h, v5.8h\n"
- "ldr d26, [x21, x5]\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v20.4s, v29.4h, v8.4h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "smlal v17.4s, v29.4h, v2.4h\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "ldr d26, [x21, x15]\n"
"smlal2 v18.4s, v29.8h, v2.8h\n"
- "ldr d29, [x19, x5]\n"
- "add x5, x5, #0x8\n"
- "smlal v16.4s, v27.4h, v7.4h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal2 v22.4s, v27.8h, v7.8h\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
- "smlal v16.4s, v24.4h, v5.4h\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v22.4s, v25.4h, v6.4h\n"
+ "smlal v23.4s, v24.4h, v3.4h\n"
+ "and v30.16b, v15.16b, v0.16b\n"
+ "tst x8, #0x7\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "smlal2 v16.4s, v28.8h, v7.8h\n"
+ "sqdmulh v10.4s, v10.4s, v4.4s\n"
+ "add x13, x13, #0x20\n"
+ "smlal2 v21.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x20, x15]\n"
"smlal2 v18.4s, v24.8h, v3.8h\n"
- "sqrdmulh v15.4s, v15.4s, v21.4s\n"
- "smlal2 v22.4s, v24.8h, v5.8h\n"
- "smlal v17.4s, v26.4h, v7.4h\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v22.4s, v27.4h, v7.4h\n"
+ "smlal v23.4s, v26.4h, v7.4h\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "add x11, x11, #0x20\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v16.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x19, x15]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal2 v21.4s, v27.8h, v7.8h\n"
"smlal2 v18.4s, v26.8h, v7.8h\n"
- "smlal v16.4s, v25.4h, v8.4h\n"
- "smlal2 v22.4s, v25.8h, v8.8h\n"
- "smlal v17.4s, v25.4h, v6.4h\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "add x15, x15, #0x8\n"
+ "smlal v22.4s, v24.4h, v5.4h\n"
+ "smlal v23.4s, v25.4h, v6.4h\n"
+ "and v28.16b, v9.16b, v0.16b\n"
+ "smlal2 v21.4s, v24.8h, v5.8h\n"
"smlal2 v18.4s, v25.8h, v6.8h\n"
- "and v26.16b, v15.16b, v30.16b\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "smlal v17.4s, v29.4h, v8.4h\n"
+ "sqdmulh v16.4s, v16.4s, v4.4s\n"
+ "smlal v22.4s, v25.4h, v8.4h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "sqdmulh v22.4s, v22.4s, v19.4s\n"
+ "smlal2 v21.4s, v25.8h, v8.8h\n"
"smlal2 v18.4s, v29.8h, v8.8h\n"
- "sqrdmulh v10.4s, v10.4s, v31.4s\n"
- "sqrdmulh v20.4s, v20.4s, v21.4s\n"
- "sqrdmulh v23.4s, v23.4s, v31.4s\n"
- "sqrdmulh v16.4s, v16.4s, v21.4s\n"
- "sqadd v15.4s, v15.4s, v26.4s\n"
- "and v8.16b, v10.16b, v9.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v30.4s\n"
- "and v4.16b, v20.16b, v30.16b\n"
+ "sqdmulh v23.4s, v23.4s, v19.4s\n"
+ "and v29.16b, v22.16b, v0.16b\n"
+ "sqdmulh v21.4s, v21.4s, v4.4s\n"
+ "and v20.16b, v23.16b, v0.16b\n"
+ "sqdmulh v18.4s, v18.4s, v4.4s\n"
+ "and v19.16b, v10.16b, v31.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v31.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v5.16b, v21.16b, v31.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v26.16b, v18.16b, v31.16b\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v28.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "and v2.16b, v23.16b, v9.16b\n"
- "and v1.16b, v16.16b, v30.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "add v15.4s, v15.4s, v11.4s\n"
- "sqadd v10.4s, v10.4s, v8.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v31.4s\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "smin v15.4s, v15.4s, v14.4s\n"
- "srshl v10.4s, v10.4s, v9.4s\n"
- "sqadd v23.4s, v23.4s, v2.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
- "srshl v20.4s, v20.4s, v30.4s\n"
- "add v10.4s, v10.4s, v11.4s\n"
- "srshl v23.4s, v23.4s, v9.4s\n"
- "sqadd v16.4s, v16.4s, v1.4s\n"
- "smin v10.4s, v10.4s, v14.4s\n"
- "add v20.4s, v20.4s, v11.4s\n"
- "add v23.4s, v23.4s, v11.4s\n"
- "smax v10.4s, v10.4s, v19.4s\n"
- "smin v20.4s, v20.4s, v14.4s\n"
- "smin v23.4s, v23.4s, v14.4s\n"
- "uzp1 v15.16b, v15.16b, v10.16b\n"
- "smax v20.4s, v20.4s, v19.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v20.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v0.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "sqadd v18.4s, v18.4s, v26.4s\n"
+ "srshl v10.4s, v10.4s, v31.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v22.4h, v22.4s\n"
+ "srshl v18.4s, v18.4s, v31.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v15.8h, v10.4s\n"
+ "sqxtn2 v9.8h, v16.4s\n"
+ "sqxtn2 v22.8h, v21.4s\n"
+ "sqxtn2 v23.8h, v18.4s\n"
+ "sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v9.8h, v9.8h, v11.8h\n"
+ "sqadd v22.8h, v22.8h, v11.8h\n"
+ "sqadd v23.8h, v23.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smax v22.8h, v22.8h, v17.8h\n"
+ "smax v23.8h, v23.8h, v17.8h\n"
+ "smin v15.8h, v15.8h, v14.8h\n"
+ "smin v9.8h, v9.8h, v14.8h\n"
+ "smin v22.8h, v22.8h, v14.8h\n"
+ "smin v23.8h, v23.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x14, x7]\n"
- "smax v23.4s, v23.4s, v19.4s\n"
- "srshl v16.4s, v16.4s, v30.4s\n"
- "and v24.16b, v22.16b, v9.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "uzp1 v20.16b, v20.16b, v23.16b\n"
- "add v16.4s, v16.4s, v11.4s\n"
- "sqrdmulh v17.4s, v17.4s, v21.4s\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "str d20, [x13, x7]\n"
- "smin v16.4s, v16.4s, v14.4s\n"
- "sqrdmulh v18.4s, v18.4s, v31.4s\n"
- "sqadd v22.4s, v22.4s, v24.4s\n"
- "and v2.16b, v17.16b, v30.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "smax v16.4s, v16.4s, v19.4s\n"
- "srshl v22.4s, v22.4s, v9.4s\n"
- "and v31.16b, v18.16b, v9.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "add v22.4s, v22.4s, v11.4s\n"
- "sqadd v17.4s, v17.4s, v2.4s\n"
- "smin v22.4s, v22.4s, v14.4s\n"
- "srshl v17.4s, v17.4s, v30.4s\n"
- "sqadd v18.4s, v18.4s, v31.4s\n"
- "smax v22.4s, v22.4s, v19.4s\n"
- "uzp1 v16.16b, v16.16b, v22.16b\n"
- "add v17.4s, v17.4s, v11.4s\n"
- "srshl v18.4s, v18.4s, v9.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d16, [x12, x7]\n"
- "smin v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v11.4s\n"
- "smax v17.4s, v17.4s, v19.4s\n"
- "smin v18.4s, v18.4s, v14.4s\n"
- "smax v18.4s, v18.4s, v19.4s\n"
- "uzp1 v17.16b, v17.16b, v18.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "str d17, [x11, x7]\n"
- "add x7, x7, #0x8\n"
+ "str d15, [x10, x14]\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str d9, [x9, x14]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d22, [x28, x14]\n"
+ "str d23, [x27, x14]\n"
+ "add x14, x14, #0x8\n"
"beq 88f\n"
- "add x6, x6, #0x48\n"
+ "add x17, x17, #0x48\n"
"3:" // Oddments
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x4, #2, 5f\n"
+ "tbz x8, #2, 5f\n"
"ld1 { v15.4s }, [x19], #0x10\n"
- "tbz x4, #1, 4f\n"
+ "tbz x8, #1, 4f\n"
"ld1 { v10.d }[0], [x19], #0x8\n"
- "tbz x4, #0, 7f\n"
+ "tbz x8, #0, 7f\n"
"ld1 { v10.s }[2], [x19]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x4, #0, 7f\n"
+ "tbz x8, #0, 7f\n"
"ld1 { v10.s }[0], [x19]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x4, #1, 6f\n"
+ "tbz x8, #1, 6f\n"
"ld1 { v15.d }[0], [x19], #0x8\n"
- "tbz x4, #0, 7f\n"
+ "tbz x8, #0, 7f\n"
"ld1 { v15.s }[2], [x19]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 7f\n"
+ "tbz x8, #0, 7f\n"
"ld1 { v15.s }[0], [x19]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "mov v20.16b, v15.16b\n"
- "ldr d0, [x6, #0x0]\n"
- "mov v23.16b, v10.16b\n"
- "ldr d1, [x6, #0x8]\n"
- "mov v16.16b, v15.16b\n"
- "ldr d2, [x6, #0x10]\n"
- "mov v22.16b, v10.16b\n"
- "ldr d3, [x6, #0x18]\n"
- "mov v17.16b, v15.16b\n"
- "ldr d4, [x6, #0x20]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "mov v9.16b, v15.16b\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr d2, [x17, #0x10]\n"
+ "ldr d3, [x17, #0x18]\n"
+ "mov v22.16b, v15.16b\n"
+ "mov v21.16b, v10.16b\n"
+ "ldr d4, [x17, #0x20]\n"
+ "ldr d5, [x17, #0x28]\n"
+ "mov v23.16b, v15.16b\n"
"mov v18.16b, v10.16b\n"
- "ldr d5, [x6, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "ldr d7, [x17, #0x38]\n"
+ "usubl v0.8h, v0.8b, v13.8b\n"
"usubl v1.8h, v1.8b, v13.8b\n"
- "ldr d6, [x6, #0x30]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ldp x26, x25, [x12, #0x0]\n"
"usubl v2.8h, v2.8b, v13.8b\n"
- "ldr d7, [x6, #0x38]\n"
"usubl v3.8h, v3.8b, v13.8b\n"
- "ldr d8, [x6, #0x40]\n"
+ "ldp x24, x23, [x12, #0x10]\n"
+ "ldp x22, x21, [x12, #0x20]\n"
"usubl v4.8h, v4.8b, v13.8b\n"
- "ldp x26, x25, [x8, #0x0]\n"
"usubl v5.8h, v5.8b, v13.8b\n"
- "ldp x24, x23, [x8, #0x10]\n"
+ "ldp x20, x19, [x12, #0x30]\n"
"usubl v6.8h, v6.8b, v13.8b\n"
"usubl v7.8h, v7.8b, v13.8b\n"
- "ldp x22, x21, [x8, #0x20]\n"
"usubl v8.8h, v8.8b, v13.8b\n"
- "ldp x20, x19, [x8, #0x30]\n"
- "add x26, x26, x5\n"
- "add x25, x25, x5\n"
- "add x24, x24, x5\n"
- "add x23, x23, x5\n"
- "add x22, x22, x5\n"
- "add x21, x21, x5\n"
- "add x20, x20, x5\n"
- "add x19, x19, x5\n"
- "tbz x4, #2, 9f\n"
+ "add x26, x26, x15\n"
+ "add x25, x25, x15\n"
+ "add x24, x24, x15\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 9f\n"
"ld1 { v31.s }[0], [x26], #0x4\n"
"ld1 { v30.s }[0], [x25], #0x4\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
@@ -713,7 +697,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v26.s }[0], [x21], #0x4\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
"ld1 { v24.s }[0], [x19], #0x4\n"
- "tbz x4, #1, 8f\n"
+ "tbz x8, #1, 8f\n"
"ld1 { v31.h }[2], [x26], #0x2\n"
"ld1 { v30.h }[2], [x25], #0x2\n"
"ld1 { v29.h }[2], [x24], #0x2\n"
@@ -722,7 +706,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v26.h }[2], [x21], #0x2\n"
"ld1 { v25.h }[2], [x20], #0x2\n"
"ld1 { v24.h }[2], [x19], #0x2\n"
- "tbz x4, #0, 11f\n"
+ "tbz x8, #0, 11f\n"
"ld1 { v31.b }[6], [x26]\n"
"ld1 { v30.b }[6], [x25]\n"
"ld1 { v29.b }[6], [x24]\n"
@@ -733,7 +717,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v24.b }[6], [x19]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x4, #0, 11f\n"
+ "tbz x8, #0, 11f\n"
"ld1 { v31.b }[4], [x26]\n"
"ld1 { v30.b }[4], [x25]\n"
"ld1 { v29.b }[4], [x24]\n"
@@ -744,7 +728,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v24.b }[4], [x19]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x4, #1, 10f\n"
+ "tbz x8, #1, 10f\n"
"ld1 { v31.h }[0], [x26], #0x2\n"
"ld1 { v30.h }[0], [x25], #0x2\n"
"ld1 { v29.h }[0], [x24], #0x2\n"
@@ -753,7 +737,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v26.h }[0], [x21], #0x2\n"
"ld1 { v25.h }[0], [x20], #0x2\n"
"ld1 { v24.h }[0], [x19], #0x2\n"
- "tbz x4, #0, 11f\n"
+ "tbz x8, #0, 11f\n"
"ld1 { v31.b }[2], [x26]\n"
"ld1 { v30.b }[2], [x25]\n"
"ld1 { v29.b }[2], [x24]\n"
@@ -764,7 +748,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v24.b }[2], [x19]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 11f\n"
+ "tbz x8, #0, 11f\n"
"ld1 { v31.b }[0], [x26]\n"
"ld1 { v30.b }[0], [x25]\n"
"ld1 { v29.b }[0], [x24]\n"
@@ -774,646 +758,636 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v25.b }[0], [x20]\n"
"ld1 { v24.b }[0], [x19]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ldr x23, [x8, #0x40]\n"
"usubl v31.8h, v31.8b, v12.8b\n"
"smlal v15.4s, v31.4h, v8.4h\n"
- "usubl v30.8h, v30.8b, v12.8b\n"
"smlal2 v10.4s, v31.8h, v8.8h\n"
- "usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v31.4h, v6.4h\n"
- "usubl v28.8h, v28.8b, v12.8b\n"
- "smlal2 v23.4s, v31.8h, v6.8h\n"
- "usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v16.4s, v31.4h, v2.4h\n"
- "usubl v26.8h, v26.8b, v12.8b\n"
- "smlal2 v22.4s, v31.8h, v2.8h\n"
- "usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v17.4s, v31.4h, v0.4h\n"
- "usubl v24.8h, v24.8b, v12.8b\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "add x23, x23, x5\n"
+ "ldr x24, [x12, #0x40]\n"
+ "usubl v30.8h, v30.8b, v12.8b\n"
"smlal v15.4s, v30.4h, v0.4h\n"
"smlal2 v10.4s, v30.8h, v0.8h\n"
- "smlal v20.4s, v28.4h, v1.4h\n"
- "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "add x24, x24, x15\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v9.4s, v31.4h, v6.4h\n"
+ "smlal2 v16.4s, v31.8h, v6.8h\n"
"smlal v15.4s, v29.4h, v1.4h\n"
"smlal2 v10.4s, v29.8h, v1.8h\n"
- "smlal v20.4s, v27.4h, v2.4h\n"
- "smlal2 v23.4s, v27.8h, v2.8h\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
"smlal v15.4s, v26.4h, v3.4h\n"
"smlal2 v10.4s, v26.8h, v3.8h\n"
- "smlal v20.4s, v24.4h, v0.4h\n"
- "smlal2 v23.4s, v24.8h, v0.8h\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v9.4s, v27.4h, v2.4h\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v4.4h\n"
"smlal2 v10.4s, v25.8h, v4.8h\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v22.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
"smlal v15.4s, v24.4h, v2.4h\n"
"smlal2 v10.4s, v24.8h, v2.8h\n"
- "tbz x4, #2, 13f\n"
- "ld1 { v29.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 12f\n"
- "ld1 { v29.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v29.b }[6], [x23]\n"
+ "smlal v9.4s, v24.4h, v0.4h\n"
+ "smlal2 v16.4s, v24.8h, v0.8h\n"
+ "tbz x8, #2, 13f\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "tbz x8, #1, 12f\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[6], [x24]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v29.b }[4], [x23]\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[4], [x24]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x4, #1, 14f\n"
- "ld1 { v29.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v29.b }[2], [x23]\n"
+ "tbz x8, #1, 14f\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[2], [x24]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v29.b }[0], [x23]\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[0], [x24]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "ldr x22, [x8, #0x48]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v29.4h, v4.4h\n"
- "smlal2 v23.4s, v29.8h, v4.8h\n"
- "add x22, x22, x5\n"
- "tbz x4, #2, 17f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 16f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "ldr x23, [x12, #0x48]\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
+ "smlal2 v16.4s, v29.8h, v4.8h\n"
+ "add x23, x23, x15\n"
+ "tbz x8, #2, 17f\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "tbz x8, #1, 16f\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[6], [x23]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[4], [x23]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x4, #1, 18f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "tbz x8, #1, 18f\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[2], [x23]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[0], [x23]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "ldr x21, [x8, #0x50]\n"
"usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v20.4s, v28.4h, v5.4h\n"
- "smlal2 v23.4s, v28.8h, v5.8h\n"
- "add x21, x21, x5\n"
- "tbz x4, #2, 21f\n"
+ "ldr x21, [x12, #0x50]\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v16.4s, v28.8h, v5.8h\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 21f\n"
"ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 20f\n"
+ "tbz x8, #1, 20f\n"
"ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 23f\n"
+ "tbz x8, #0, 23f\n"
"ld1 { v27.b }[6], [x21]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 23f\n"
+ "tbz x8, #0, 23f\n"
"ld1 { v27.b }[4], [x21]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
- "tbz x4, #1, 22f\n"
+ "tbz x8, #1, 22f\n"
"ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 23f\n"
+ "tbz x8, #0, 23f\n"
"ld1 { v27.b }[2], [x21]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 23f\n"
+ "tbz x8, #0, 23f\n"
"ld1 { v27.b }[0], [x21]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
- "ldr x20, [x8, #0x58]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
+ "ldr x19, [x12, #0x58]\n"
"smlal v15.4s, v27.4h, v5.4h\n"
"smlal2 v10.4s, v27.8h, v5.8h\n"
- "add x20, x20, x5\n"
- "smlal v20.4s, v27.4h, v3.4h\n"
- "smlal2 v23.4s, v27.8h, v3.8h\n"
- "tbz x4, #2, 25f\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 24f\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v26.b }[6], [x20]\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 25f\n"
+ "ld1 { v26.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 24f\n"
+ "ld1 { v26.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v26.b }[6], [x19]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v26.b }[4], [x20]\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v26.b }[4], [x19]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x4, #1, 26f\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v26.b }[2], [x20]\n"
+ "tbz x8, #1, 26f\n"
+ "ld1 { v26.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v26.b }[2], [x19]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v26.b }[0], [x20]\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v26.b }[0], [x19]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "ldr x19, [x8, #0x60]\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v16.4s, v26.4h, v3.4h\n"
- "smlal2 v22.4s, v26.8h, v3.8h\n"
- "add x19, x19, x5\n"
- "tbz x4, #2, 29f\n"
- "ld1 { v25.s }[0], [x19], #0x4\n"
- "tbz x4, #1, 28f\n"
- "ld1 { v25.h }[2], [x19], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v25.b }[6], [x19]\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v22.4s, v26.4h, v3.4h\n"
+ "smlal2 v21.4s, v26.8h, v3.8h\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 29f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 28f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v25.b }[4], [x19]\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x4, #1, 30f\n"
- "ld1 { v25.h }[0], [x19], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v25.b }[2], [x19]\n"
+ "tbz x8, #1, 30f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v25.b }[0], [x19]\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "ldr x10, [x8, #0x68]\n"
"usubl v25.8h, v25.8b, v12.8b\n"
+ "ldr x19, [x12, #0x68]\n"
"smlal v15.4s, v25.4h, v6.4h\n"
"smlal2 v10.4s, v25.8h, v6.8h\n"
- "add x10, x10, x5\n"
- "smlal v16.4s, v25.4h, v0.4h\n"
- "smlal2 v22.4s, v25.8h, v0.8h\n"
- "tbz x4, #2, 33f\n"
- "ld1 { v29.s }[0], [x10], #0x4\n"
- "tbz x4, #1, 32f\n"
- "ld1 { v29.h }[2], [x10], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v29.b }[6], [x10]\n"
+ "smlal v22.4s, v25.4h, v0.4h\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 33f\n"
+ "ld1 { v29.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 32f\n"
+ "ld1 { v29.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[6], [x19]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v29.b }[4], [x10]\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[4], [x19]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x4, #1, 34f\n"
- "ld1 { v29.h }[0], [x10], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v29.b }[2], [x10]\n"
+ "tbz x8, #1, 34f\n"
+ "ld1 { v29.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[2], [x19]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v29.b }[0], [x10]\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[0], [x19]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "ldr x9, [x8, #0x70]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v16.4s, v29.4h, v4.4h\n"
- "smlal2 v22.4s, v29.8h, v4.8h\n"
- "add x9, x9, x5\n"
- "tbz x4, #2, 37f\n"
- "ld1 { v24.s }[0], [x9], #0x4\n"
- "tbz x4, #1, 36f\n"
- "ld1 { v24.h }[2], [x9], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v24.b }[6], [x9]\n"
+ "ldr x19, [x12, #0x70]\n"
+ "smlal v22.4s, v29.4h, v4.4h\n"
+ "smlal2 v21.4s, v29.8h, v4.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 37f\n"
+ "ld1 { v24.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 36f\n"
+ "ld1 { v24.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[6], [x19]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v24.b }[4], [x9]\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[4], [x19]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x4, #1, 38f\n"
- "ld1 { v24.h }[0], [x9], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v24.b }[2], [x9]\n"
+ "tbz x8, #1, 38f\n"
+ "ld1 { v24.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[2], [x19]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v24.b }[0], [x9]\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[0], [x19]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "ldr x28, [x8, #0x78]\n"
"usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x22, [x12, #0x78]\n"
"smlal v15.4s, v24.4h, v7.4h\n"
"smlal2 v10.4s, v24.8h, v7.8h\n"
- "add x28, x28, x5\n"
- "smlal v16.4s, v24.4h, v1.4h\n"
- "smlal2 v22.4s, v24.8h, v1.8h\n"
- "tbz x4, #2, 41f\n"
- "ld1 { v27.s }[0], [x28], #0x4\n"
- "tbz x4, #1, 40f\n"
- "ld1 { v27.h }[2], [x28], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v27.b }[6], [x28]\n"
+ "smlal v22.4s, v24.4h, v1.4h\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "add x22, x22, x15\n"
+ "tbz x8, #2, 41f\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "tbz x8, #1, 40f\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[6], [x22]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v27.b }[4], [x28]\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[4], [x22]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x4, #1, 42f\n"
- "ld1 { v27.h }[0], [x28], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v27.b }[2], [x28]\n"
+ "tbz x8, #1, 42f\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[2], [x22]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v27.b }[0], [x28]\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[0], [x22]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "ldr x27, [x8, #0x80]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v17.4s, v27.4h, v4.4h\n"
+ "ldr x21, [x12, #0x80]\n"
+ "smlal v23.4s, v27.4h, v4.4h\n"
"smlal2 v18.4s, v27.8h, v4.8h\n"
- "add x27, x27, x5\n"
- "tbz x4, #2, 45f\n"
- "ld1 { v28.s }[0], [x27], #0x4\n"
- "tbz x4, #1, 44f\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v28.b }[6], [x27]\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 45f\n"
+ "ld1 { v28.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 44f\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[6], [x21]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v28.b }[4], [x27]\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[4], [x21]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x4, #1, 46f\n"
- "ld1 { v28.h }[0], [x27], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v28.b }[2], [x27]\n"
+ "tbz x8, #1, 46f\n"
+ "ld1 { v28.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[2], [x21]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v28.b }[0], [x27]\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[0], [x21]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "ldr x26, [x8, #0x88]\n"
"usubl v28.8h, v28.8b, v12.8b\n"
- "smlal v20.4s, v28.4h, v7.4h\n"
- "smlal2 v23.4s, v28.8h, v7.8h\n"
- "add x26, x26, x5\n"
- "smlal v17.4s, v28.4h, v1.4h\n"
+ "ldr x20, [x12, #0x88]\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "smlal2 v16.4s, v28.8h, v7.8h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
"smlal2 v18.4s, v28.8h, v1.8h\n"
- "tbz x4, #2, 49f\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
- "tbz x4, #1, 48f\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v26.b }[6], [x26]\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 49f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 48f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v26.b }[4], [x26]\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x4, #1, 50f\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v26.b }[2], [x26]\n"
+ "tbz x8, #1, 50f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v26.b }[0], [x26]\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "ldr x25, [x8, #0x90]\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v17.4s, v26.4h, v5.4h\n"
+ "ldr x23, [x12, #0x90]\n"
+ "smlal v23.4s, v26.4h, v5.4h\n"
"smlal2 v18.4s, v26.8h, v5.8h\n"
- "add x25, x25, x5\n"
- "tbz x4, #2, 53f\n"
- "ld1 { v25.s }[0], [x25], #0x4\n"
- "tbz x4, #1, 52f\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v25.b }[6], [x25]\n"
+ "add x23, x23, x15\n"
+ "tbz x8, #2, 53f\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "tbz x8, #1, 52f\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[6], [x23]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v25.b }[4], [x25]\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[4], [x23]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x4, #1, 54f\n"
- "ld1 { v25.h }[0], [x25], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v25.b }[2], [x25]\n"
+ "tbz x8, #1, 54f\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[2], [x23]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v25.b }[0], [x25]\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[0], [x23]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "ldr x24, [x8, #0x98]\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v16.4s, v25.4h, v6.4h\n"
- "smlal2 v22.4s, v25.8h, v6.8h\n"
- "add x24, x24, x5\n"
- "tbz x4, #2, 57f\n"
+ "ldr x24, [x12, #0x98]\n"
+ "smlal v22.4s, v25.4h, v6.4h\n"
+ "smlal2 v21.4s, v25.8h, v6.8h\n"
+ "add x24, x24, x15\n"
+ "tbz x8, #2, 57f\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x4, #1, 56f\n"
+ "tbz x8, #1, 56f\n"
"ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x4, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v29.b }[6], [x24]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v29.b }[4], [x24]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x4, #1, 58f\n"
+ "tbz x8, #1, 58f\n"
"ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x4, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v29.b }[2], [x24]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v29.b }[0], [x24]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "ldr x23, [x8, #0xa0]\n"
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v20.4s, v29.4h, v8.4h\n"
- "smlal2 v23.4s, v29.8h, v8.8h\n"
- "add x23, x23, x5\n"
- "smlal v17.4s, v29.4h, v2.4h\n"
+ "ldr x19, [x12, #0xa0]\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v16.4s, v29.8h, v8.8h\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
"smlal2 v18.4s, v29.8h, v2.8h\n"
- "tbz x4, #2, 61f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 60f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 61f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 60f\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[6], [x19]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[4], [x19]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x4, #1, 62f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "tbz x8, #1, 62f\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[2], [x19]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[0], [x19]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "ldr x22, [x8, #0xa8]\n"
"usubl v27.8h, v27.8b, v12.8b\n"
- "smlal v16.4s, v27.4h, v7.4h\n"
- "smlal2 v22.4s, v27.8h, v7.8h\n"
- "add x22, x22, x5\n"
- "tbz x4, #2, 65f\n"
+ "ldr x22, [x12, #0xa8]\n"
+ "smlal v22.4s, v27.4h, v7.4h\n"
+ "smlal2 v21.4s, v27.8h, v7.8h\n"
+ "add x22, x22, x15\n"
+ "tbz x8, #2, 65f\n"
"ld1 { v24.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 64f\n"
+ "tbz x8, #1, 64f\n"
"ld1 { v24.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 67f\n"
+ "tbz x8, #0, 67f\n"
"ld1 { v24.b }[6], [x22]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 67f\n"
+ "tbz x8, #0, 67f\n"
"ld1 { v24.b }[4], [x22]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x4, #1, 66f\n"
+ "tbz x8, #1, 66f\n"
"ld1 { v24.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 67f\n"
+ "tbz x8, #0, 67f\n"
"ld1 { v24.b }[2], [x22]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 67f\n"
+ "tbz x8, #0, 67f\n"
"ld1 { v24.b }[0], [x22]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "ldr x21, [x8, #0xb0]\n"
"usubl v24.8h, v24.8b, v12.8b\n"
- "smlal v16.4s, v24.4h, v5.4h\n"
- "smlal2 v22.4s, v24.8h, v5.8h\n"
- "add x21, x21, x5\n"
- "smlal v17.4s, v24.4h, v3.4h\n"
+ "ldr x21, [x12, #0xb0]\n"
+ "smlal v22.4s, v24.4h, v5.4h\n"
+ "smlal2 v21.4s, v24.8h, v5.8h\n"
+ "smlal v23.4s, v24.4h, v3.4h\n"
"smlal2 v18.4s, v24.8h, v3.8h\n"
- "tbz x4, #2, 69f\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 69f\n"
"ld1 { v26.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 68f\n"
+ "tbz x8, #1, 68f\n"
"ld1 { v26.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 71f\n"
+ "tbz x8, #0, 71f\n"
"ld1 { v26.b }[6], [x21]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 71f\n"
+ "tbz x8, #0, 71f\n"
"ld1 { v26.b }[4], [x21]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x4, #1, 70f\n"
+ "tbz x8, #1, 70f\n"
"ld1 { v26.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 71f\n"
+ "tbz x8, #0, 71f\n"
"ld1 { v26.b }[2], [x21]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 71f\n"
+ "tbz x8, #0, 71f\n"
"ld1 { v26.b }[0], [x21]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "ldr x20, [x8, #0xb8]\n"
"usubl v26.8h, v26.8b, v12.8b\n"
- "smlal v17.4s, v26.4h, v7.4h\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "smlal v23.4s, v26.4h, v7.4h\n"
"smlal2 v18.4s, v26.8h, v7.8h\n"
- "add x20, x20, x5\n"
- "tbz x4, #2, 73f\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 73f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 72f\n"
+ "tbz x8, #1, 72f\n"
"ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 75f\n"
+ "tbz x8, #0, 75f\n"
"ld1 { v25.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 75f\n"
+ "tbz x8, #0, 75f\n"
"ld1 { v25.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x4, #1, 74f\n"
+ "tbz x8, #1, 74f\n"
"ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 75f\n"
+ "tbz x8, #0, 75f\n"
"ld1 { v25.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 75f\n"
+ "tbz x8, #0, 75f\n"
"ld1 { v25.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "ldr x19, [x8, #0xc0]\n"
"usubl v25.8h, v25.8b, v12.8b\n"
- "smlal v16.4s, v25.4h, v8.4h\n"
- "smlal2 v22.4s, v25.8h, v8.8h\n"
- "add x19, x19, x5\n"
- "smlal v17.4s, v25.4h, v6.4h\n"
+ "ldr x19, [x12, #0xc0]\n"
+ "smlal v22.4s, v25.4h, v8.4h\n"
+ "smlal2 v21.4s, v25.8h, v8.8h\n"
+ "smlal v23.4s, v25.4h, v6.4h\n"
"smlal2 v18.4s, v25.8h, v6.8h\n"
- "tbz x4, #2, 77f\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 77f\n"
"ld1 { v29.s }[0], [x19], #0x4\n"
- "tbz x4, #1, 76f\n"
+ "tbz x8, #1, 76f\n"
"ld1 { v29.h }[2], [x19], #0x2\n"
- "tbz x4, #0, 79f\n"
+ "tbz x8, #0, 79f\n"
"ld1 { v29.b }[6], [x19]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 79f\n"
+ "tbz x8, #0, 79f\n"
"ld1 { v29.b }[4], [x19]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x4, #1, 78f\n"
+ "tbz x8, #1, 78f\n"
"ld1 { v29.h }[0], [x19], #0x2\n"
- "tbz x4, #0, 79f\n"
+ "tbz x8, #0, 79f\n"
"ld1 { v29.b }[2], [x19]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 79f\n"
+ "tbz x8, #0, 79f\n"
"ld1 { v29.b }[0], [x19]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
"usubl v29.8h, v29.8b, v12.8b\n"
- "smlal v17.4s, v29.4h, v8.4h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
"smlal2 v18.4s, v29.8h, v8.8h\n"
- "tbz x4, #2, 81f\n"
- "ld1 { v21.4s }, [x17], #0x10\n"
- "ld1 { v30.4s }, [x15], #0x10\n"
- "tbz x4, #1, 80f\n"
- "ld1 { v31.d }[0], [x17], #0x8\n"
- "ld1 { v9.d }[0], [x15], #0x8\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v31.s }[2], [x17]\n"
- "ld1 { v9.s }[2], [x15]\n"
+ "tbz x8, #2, 81f\n"
+ "ld1 { v19.4s }, [x13], #0x10\n"
+ "ld1 { v0.4s }, [x11], #0x10\n"
+ "tbz x8, #1, 80f\n"
+ "ld1 { v4.d }[0], [x13], #0x8\n"
+ "ld1 { v31.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v4.s }[2], [x13]\n"
+ "ld1 { v31.s }[2], [x11]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v31.s }[0], [x17]\n"
- "ld1 { v9.s }[0], [x15]\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v4.s }[0], [x13]\n"
+ "ld1 { v31.s }[0], [x11]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x4, #1, 82f\n"
- "ld1 { v21.d }[0], [x17], #0x8\n"
- "ld1 { v30.d }[0], [x15], #0x8\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v21.s }[2], [x17]\n"
- "ld1 { v30.s }[2], [x15]\n"
+ "tbz x8, #1, 82f\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v0.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v0.s }[2], [x11]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v21.s }[0], [x17]\n"
- "ld1 { v30.s }[0], [x15]\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v0.s }[0], [x11]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v21.4s\n"
- "add x14, x14, x7\n"
- "sqrdmulh v10.4s, v10.4s, v31.4s\n"
- "add x13, x13, x7\n"
- "sqrdmulh v20.4s, v20.4s, v21.4s\n"
- "add x12, x12, x7\n"
- "sqrdmulh v23.4s, v23.4s, v31.4s\n"
- "add x11, x11, x7\n"
- "sqrdmulh v16.4s, v16.4s, v21.4s\n"
- "and v26.16b, v15.16b, v30.16b\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "and v8.16b, v10.16b, v9.16b\n"
- "and v4.16b, v20.16b, v30.16b\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "and v2.16b, v23.16b, v9.16b\n"
- "and v1.16b, v16.16b, v30.16b\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "add x10, x10, x14\n"
+ "add x9, x9, x14\n"
+ "sqdmulh v22.4s, v22.4s, v19.4s\n"
+ "sqdmulh v23.4s, v23.4s, v19.4s\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "and v30.16b, v15.16b, v0.16b\n"
+ "sqdmulh v10.4s, v10.4s, v4.4s\n"
+ "and v28.16b, v9.16b, v0.16b\n"
+ "sqdmulh v16.4s, v16.4s, v4.4s\n"
+ "and v29.16b, v22.16b, v0.16b\n"
+ "sqdmulh v21.4s, v21.4s, v4.4s\n"
+ "and v20.16b, v23.16b, v0.16b\n"
+ "sqdmulh v18.4s, v18.4s, v4.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v19.16b, v10.16b, v31.16b\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v31.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v5.16b, v21.16b, v31.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v26.16b, v18.16b, v31.16b\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v28.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v31.4s\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v26.4s\n"
- "sqrdmulh v17.4s, v17.4s, v21.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sqrdmulh v18.4s, v18.4s, v31.4s\n"
- "sqadd v10.4s, v10.4s, v8.4s\n"
- "sqadd v20.4s, v20.4s, v4.4s\n"
- "srshl v15.4s, v15.4s, v30.4s\n"
- "sqadd v23.4s, v23.4s, v2.4s\n"
- "srshl v10.4s, v10.4s, v9.4s\n"
- "srshl v20.4s, v20.4s, v30.4s\n"
- "add v15.4s, v15.4s, v11.4s\n"
- "srshl v23.4s, v23.4s, v9.4s\n"
- "add v10.4s, v10.4s, v11.4s\n"
- "smin v15.4s, v15.4s, v14.4s\n"
- "add v20.4s, v20.4s, v11.4s\n"
- "smin v10.4s, v10.4s, v14.4s\n"
- "smax v15.4s, v15.4s, v19.4s\n"
- "smin v20.4s, v20.4s, v14.4s\n"
- "smax v10.4s, v10.4s, v19.4s\n"
- "add v23.4s, v23.4s, v11.4s\n"
- "smax v20.4s, v20.4s, v19.4s\n"
- "uzp1 v15.16b, v15.16b, v10.16b\n"
- "smin v23.4s, v23.4s, v14.4s\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v20.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v0.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "sqadd v18.4s, v18.4s, v26.4s\n"
+ "srshl v10.4s, v10.4s, v31.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v22.4h, v22.4s\n"
+ "srshl v18.4s, v18.4s, v31.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v15.8h, v10.4s\n"
+ "sqxtn2 v9.8h, v16.4s\n"
+ "sqxtn2 v22.8h, v21.4s\n"
+ "sqxtn2 v23.8h, v18.4s\n"
+ "sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v9.8h, v9.8h, v11.8h\n"
+ "sqadd v22.8h, v22.8h, v11.8h\n"
+ "sqadd v23.8h, v23.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smax v22.8h, v22.8h, v17.8h\n"
+ "smax v23.8h, v23.8h, v17.8h\n"
+ "smin v15.8h, v15.8h, v14.8h\n"
+ "smin v9.8h, v9.8h, v14.8h\n"
+ "smin v22.8h, v22.8h, v14.8h\n"
+ "smin v23.8h, v23.8h, v14.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "sqadd v16.4s, v16.4s, v1.4s\n"
- "smax v23.4s, v23.4s, v19.4s\n"
- "and v24.16b, v22.16b, v9.16b\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "uzp1 v20.16b, v20.16b, v23.16b\n"
- "srshl v16.4s, v16.4s, v30.4s\n"
- "and v2.16b, v17.16b, v30.16b\n"
- "sshr v2.4s, v2.4s, #0x1f\n"
- "uzp1 v20.16b, v20.16b, v20.16b\n"
- "add v16.4s, v16.4s, v11.4s\n"
- "sqadd v22.4s, v22.4s, v24.4s\n"
- "and v31.16b, v18.16b, v9.16b\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "smin v16.4s, v16.4s, v14.4s\n"
- "srshl v22.4s, v22.4s, v9.4s\n"
- "sqadd v17.4s, v17.4s, v2.4s\n"
- "smax v16.4s, v16.4s, v19.4s\n"
- "add v22.4s, v22.4s, v11.4s\n"
- "srshl v17.4s, v17.4s, v30.4s\n"
- "sqadd v18.4s, v18.4s, v31.4s\n"
- "smin v22.4s, v22.4s, v14.4s\n"
- "add v17.4s, v17.4s, v11.4s\n"
- "srshl v18.4s, v18.4s, v9.4s\n"
- "smax v22.4s, v22.4s, v19.4s\n"
- "smin v17.4s, v17.4s, v14.4s\n"
- "uzp1 v16.16b, v16.16b, v22.16b\n"
- "add v18.4s, v18.4s, v11.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "smax v17.4s, v17.4s, v19.4s\n"
- "smin v18.4s, v18.4s, v14.4s\n"
- "smax v18.4s, v18.4s, v19.4s\n"
- "uzp1 v17.16b, v17.16b, v18.16b\n"
- "uzp1 v17.16b, v17.16b, v17.16b\n"
- "tbz x4, #2, 85f\n"
- "st1 { v15.s }[0], [x14], #0x4\n"
- "st1 { v20.s }[0], [x13], #0x4\n"
- "st1 { v16.s }[0], [x12], #0x4\n"
- "st1 { v17.s }[0], [x11], #0x4\n"
- "tbz x4, #1, 84f\n"
- "st1 { v15.h }[2], [x14], #0x2\n"
- "st1 { v20.h }[2], [x13], #0x2\n"
- "st1 { v16.h }[2], [x12], #0x2\n"
- "st1 { v17.h }[2], [x11], #0x2\n"
- "tbz x4, #0, 87f\n"
- "st1 { v15.b }[6], [x14], #0x1\n"
- "st1 { v20.b }[6], [x13], #0x1\n"
- "st1 { v16.b }[6], [x12], #0x1\n"
- "st1 { v17.b }[6], [x11], #0x1\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x8, #2, 85f\n"
+ "st1 { v15.s }[0], [x10], #0x4\n"
+ "st1 { v9.s }[0], [x9], #0x4\n"
+ "st1 { v22.s }[0], [x28], #0x4\n"
+ "st1 { v23.s }[0], [x27], #0x4\n"
+ "tbz x8, #1, 84f\n"
+ "st1 { v15.h }[2], [x10], #0x2\n"
+ "st1 { v9.h }[2], [x9], #0x2\n"
+ "st1 { v22.h }[2], [x28], #0x2\n"
+ "st1 { v23.h }[2], [x27], #0x2\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[6], [x10], #0x1\n"
+ "st1 { v9.b }[6], [x9], #0x1\n"
+ "st1 { v22.b }[6], [x28], #0x1\n"
+ "st1 { v23.b }[6], [x27], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "st1 { v15.b }[4], [x14], #0x1\n"
- "st1 { v20.b }[4], [x13], #0x1\n"
- "st1 { v16.b }[4], [x12], #0x1\n"
- "st1 { v17.b }[4], [x11], #0x1\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[4], [x10], #0x1\n"
+ "st1 { v9.b }[4], [x9], #0x1\n"
+ "st1 { v22.b }[4], [x28], #0x1\n"
+ "st1 { v23.b }[4], [x27], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
- "tbz x4, #1, 86f\n"
- "st1 { v15.h }[0], [x14], #0x2\n"
- "st1 { v20.h }[0], [x13], #0x2\n"
- "st1 { v16.h }[0], [x12], #0x2\n"
- "st1 { v17.h }[0], [x11], #0x2\n"
- "tbz x4, #0, 87f\n"
- "st1 { v15.b }[2], [x14], #0x1\n"
- "st1 { v20.b }[2], [x13], #0x1\n"
- "st1 { v16.b }[2], [x12], #0x1\n"
- "st1 { v17.b }[2], [x11], #0x1\n"
+ "tbz x8, #1, 86f\n"
+ "st1 { v15.h }[0], [x10], #0x2\n"
+ "st1 { v9.h }[0], [x9], #0x2\n"
+ "st1 { v22.h }[0], [x28], #0x2\n"
+ "st1 { v23.h }[0], [x27], #0x2\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[2], [x10], #0x1\n"
+ "st1 { v9.b }[2], [x9], #0x1\n"
+ "st1 { v22.b }[2], [x28], #0x1\n"
+ "st1 { v23.b }[2], [x27], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "st1 { v15.b }[0], [x14], #0x1\n"
- "st1 { v20.b }[0], [x13], #0x1\n"
- "st1 { v16.b }[0], [x12], #0x1\n"
- "st1 { v17.b }[0], [x11], #0x1\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[0], [x10], #0x1\n"
+ "st1 { v9.b }[0], [x9], #0x1\n"
+ "st1 { v22.b }[0], [x28], #0x1\n"
+ "st1 { v23.b }[0], [x27], #0x1\n"
"87:" // Oddments: Bit 2: End
-
"88:" // End
-
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
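The "Oddments" blocks that close out the kernel above handle the final n_channels % 8 channels a few bytes at a time: bits 2, 1 and 0 of the remaining-channel count select at most one 4-byte, one 2-byte and one 1-byte lane transfer, which is exactly what the tbz/ld1 ladders implement on the load side and the tbz/st1 ladders mirror on the store side. A scalar sketch of the same idea, assuming an 8-byte staging buffer; the helper name is illustrative, not a symbol from this patch:

    #include <cstdint>
    #include <cstring>

    // Load up to 7 trailing channels, mirroring the tbz #2 / #1 / #0 ladder.
    static void load_oddments(uint8_t dst[8], const uint8_t *src, unsigned int rem)
    {
        unsigned int i = 0;
        if (rem & 4) { std::memcpy(dst + i, src + i, 4); i += 4; } // tbz x8, #2
        if (rem & 2) { std::memcpy(dst + i, src + i, 2); i += 2; } // tbz x8, #1
        if (rem & 1) { dst[i] = src[i]; }                          // tbz x8, #0
    }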
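The requantisation tail of the kernel above is also restructured: instead of adding the output offset and clamping while still in 32 bits, the new code multiplies with sqdmulh, applies the usual sign fix-up so the following srshl rounding shift breaks ties consistently, narrows to 16 bits with sqxtn/sqxtn2, and only then adds the c_offset and clamps before uzp1 packs down to bytes. Below is a scalar model of one lane, assuming per-channel right shifts are stored as negative values (which the srshl usage implies); the function is a hypothetical illustration, not kernel API:

    #include <algorithm>
    #include <cstdint>

    static inline uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                          int16_t c_offset, int16_t minval, int16_t maxval)
    {
        // sqdmulh: high 32 bits of the saturating doubling multiply
        // (the INT32_MIN * INT32_MIN saturation edge case is elided here).
        int32_t hi = static_cast<int32_t>((static_cast<int64_t>(acc) * mul * 2) >> 32);

        // and + sshr #31 + sqadd: nudge negative lanes down by one before a
        // rounding right shift, adjusting how ties are broken.
        if (hi < 0 && shift < 0 && hi != INT32_MIN) hi -= 1;

        // srshl with a negative shift amount: rounding arithmetic right shift.
        int n = -shift;
        int32_t shifted = (n > 0)
            ? static_cast<int32_t>((hi + (int64_t{1} << (n - 1))) >> n)
            : hi;

        // sqxtn, sqadd, smax/smin, uzp1: saturate to 16 bits, add the output
        // offset, clamp to the quantised range, keep the low byte.
        int32_t v = std::clamp<int32_t>(shifted, -32768, 32767) + c_offset;
        v = std::clamp<int32_t>(v, minval, maxval);
        return static_cast<uint8_t>(v);
    }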
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 73de9650c3..11e993c51c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,37 +36,24 @@ namespace depthwise {
void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
-struct a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+class a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_5x5_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_5x5_mla::get_packed_size;
+ a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
- kern_type kernel = a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
- a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
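The header change above is representative of the refactor applied across the patch: each kernel's struct of typedefs and constexpr geometry becomes a thin subclass of DepthwiseDepthfirstStrategy, with the geometry handed to the base constructor (output rows/cols, kernel rows/cols, stride rows/cols) and the remaining per-kernel details exposed through virtuals. Assuming the headers introduced by this patch, a caller can then work against the common base; the sketch below is illustrative, not code from the patch:

    // Geometry and the kernel pointer are reached through the shared base
    // instead of per-struct constexpr members.
    using Strategy = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;

    void describe(const Strategy &strat)
    {
        Strategy::KernelType kernel = strat.get_kernel(); // was: kern_type kernel member
        arm_gemm::VLType vl = strat.get_vl_type();        // was: constexpr static vl_type
        (void)kernel;
        (void)vl;
    }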
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index b42f29ab0d..6934dffc98 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const uint8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const uint8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -111,2096 +111,2070 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
- "mov x10, #0x0\n"
- "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x1, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "add x25, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "lsr x19, x4, #0x3\n"
- "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x13, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v7.16b }, [x13]\n"
- "add x8, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v13.16b }, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1r { v19.4s }, [x8]\n"
- "add x8, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v16.4s }, [x20]\n"
- "ld1r { v12.4s }, [x8]\n"
- "ldp x17, x16, [x21, #0x0]\n"
- "ldp x6, x8, [x21, #0x10]\n"
- "cbz x19, 3f\n"
- "subs x19, x19, #0x1\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x12, #0x0]\n"
- "mov v18.16b, v15.16b\n"
- "ldr q20, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
- "mov v11.16b, v15.16b\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "add x17, x10, %[offsetof_Requantize32_a_offset]\n"
+ "add x9, x10, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x4, x10, %[offsetof_Requantize32_c_offset]\n"
+ "add x14, x10, %[offsetof_Requantize32_minval]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x5, x10, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v9.16b }, [x17]\n"
+ "ld1r { v14.16b }, [x9]\n"
+ "lsr x3, x0, #0x3\n"
+ "ld1r { v18.8h }, [x4]\n"
+ "ld1r { v11.8h }, [x14]\n"
+ "mov x24, #0x0\n"
+ "mov x22, #0x0\n"
+ "ld1r { v13.8h }, [x5]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x20, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x1, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x16, x8, [x25, #0x0]\n"
+ "ldp x4, x7, [x25, #0x10]\n"
+ "cbz x3, 3f\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x19, #0x0]\n"
+ "subs x3, x3, #0x1\n"
+ "mov v17.16b, v15.16b\n"
+ "ldr q16, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d0, [x23, #0x0]\n"
+ "ldr d1, [x23, #0x8]\n"
+ "ldr d2, [x23, #0x10]\n"
+ "mov v8.16b, v16.16b\n"
"mov v10.16b, v15.16b\n"
- "ldr d0, [x3, #0x0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "mov v5.16b, v20.16b\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v8.16b, v20.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "mov v9.16b, v20.16b\n"
- "ldr d3, [x3, #0x18]\n"
- "ldr d4, [x3, #0x20]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "ldp x28, x27, [x25, #0x0]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "ldp x26, x13, [x25, #0x10]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "ldp x24, x23, [x25, #0x20]\n"
- "ldp x22, x21, [x25, #0x30]\n"
- "ldp x20, x0, [x25, #0x40]\n"
- "ldr d31, [x28, x10]\n"
- "usubl v31.8h, v31.8b, v7.8b\n"
- "ldr d30, [x27, x10]\n"
- "ldr d29, [x26, x10]\n"
- "usubl v30.8h, v30.8b, v7.8b\n"
- "ldr d28, [x13, x10]\n"
- "ldr d27, [x24, x10]\n"
- "usubl v29.8h, v29.8b, v7.8b\n"
- "ldr d23, [x23, x10]\n"
- "usubl v28.8h, v28.8b, v7.8b\n"
- "ldr d25, [x22, x10]\n"
- "ldr d24, [x21, x10]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "ldr d26, [x20, x10]\n"
- "usubl v23.8h, v23.8b, v7.8b\n"
- "ldr d22, [x0, x10]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "usubl v26.8h, v26.8b, v7.8b\n"
- "usubl v22.8h, v22.8b, v7.8b\n"
+ "ldr d3, [x23, #0x18]\n"
+ "ldr d4, [x23, #0x20]\n"
+ "mov v7.16b, v16.16b\n"
+ "mov v6.16b, v15.16b\n"
+ "ldp x28, x6, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "mov v5.16b, v16.16b\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "ldp x5, x2, [x20, #0x20]\n"
+ "ldp x27, x21, [x20, #0x30]\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "ldp x12, x19, [x20, #0x40]\n"
+ "ldr d31, [x28, x24]\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr d30, [x6, x24]\n"
+ "ldr d29, [x26, x24]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "ldr d28, [x25, x24]\n"
+ "ldr d27, [x5, x24]\n"
+ "usubl v29.8h, v29.8b, v9.8b\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "ldr d23, [x2, x24]\n"
+ "ldr d25, [x27, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "ldr d24, [x21, x24]\n"
+ "ldr d26, [x12, x24]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "ldr d22, [x19, x24]\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
"beq 2f\n"
"1:" // Loop
"smlal v15.4s, v31.4h, v0.4h\n"
- "ldr x20, [x25, #0x50]\n"
- "subs x19, x19, #0x1\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ldr x28, [x25, #0x58]\n"
- "smlal v18.4s, v30.4h, v0.4h\n"
- "ldr x0, [x25, #0x60]\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x10]\n"
- "usubl v31.8h, v31.8b, v7.8b\n"
- "smlal v11.4s, v29.4h, v0.4h\n"
- "ldr x7, [x25, #0x68]\n"
- "smlal2 v8.4s, v29.8h, v0.8h\n"
- "ldr x26, [x25, #0x70]\n"
- "smlal v10.4s, v28.4h, v0.4h\n"
- "ldr x23, [x25, #0x78]\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "ldr x19, [x20, #0x50]\n"
+ "ldr d31, [x19, x24]\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal v10.4s, v29.4h, v0.4h\n"
+ "ldr x15, [x20, #0x58]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v6.4s, v28.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x20, #0x60]\n"
+ "ldr x27, [x20, #0x68]\n"
+ "smlal2 v7.4s, v29.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "ldr x20, [x25, #0x80]\n"
- "smlal2 v20.4s, v30.8h, v1.8h\n"
- "ldr d30, [x28, x10]\n"
- "usubl v30.8h, v30.8b, v7.8b\n"
- "smlal v18.4s, v27.4h, v1.4h\n"
- "ldr x22, [x25, #0x88]\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "ldr x13, [x25, #0x90]\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "ldr x21, [x25, #0x98]\n"
- "smlal2 v8.4s, v28.8h, v1.8h\n"
- "ldr x14, [x25, #0xa0]\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
- "ldr x11, [x25, #0xa8]\n"
- "smlal2 v9.4s, v23.8h, v1.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x5, [x20, #0x70]\n"
+ "ldr x11, [x20, #0x78]\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x15, x24]\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "ldr d0, [x23, #0x28]\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v27.8h, v1.8h\n"
+ "ldr x12, [x20, #0x80]\n"
+ "ldr x26, [x20, #0x88]\n"
+ "smlal2 v7.4s, v28.8h, v1.8h\n"
"smlal v15.4s, v27.4h, v2.4h\n"
- "ldr x24, [x25, #0xb0]\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "ldr d27, [x0, x10]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal v18.4s, v25.4h, v2.4h\n"
- "ldr x0, [x25, #0xb8]\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr x15, [x25, #0xc0]\n"
- "smlal v11.4s, v23.4h, v2.4h\n"
- "ldr x9, [x25, #0xc8]\n"
- "smlal2 v8.4s, v23.8h, v2.8h\n"
- "ldr x27, [x25, #0xd0]\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
- "ldr x28, [x25, #0xd8]\n"
- "smlal2 v9.4s, v31.8h, v2.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x14, [x20, #0x90]\n"
+ "ldr x15, [x20, #0x98]\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x19, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal v10.4s, v23.4h, v2.4h\n"
+ "ldr d1, [x23, #0x30]\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "ldr x21, [x20, #0xa0]\n"
+ "ldr x2, [x20, #0xa8]\n"
+ "smlal2 v7.4s, v23.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "ldr q6, [x2, #0x0]\n"
- "smlal2 v20.4s, v25.8h, v3.8h\n"
- "ldr d25, [x7, x10]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "ldr x12, [x25, #0xe0]\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "ldr q21, [x5, #0x0]\n"
- "smlal v11.4s, v31.4h, v3.4h\n"
- "ldr q17, [x2, #0x10]\n"
- "add x2, x2, #0x20\n"
- "smlal2 v8.4s, v31.8h, v3.8h\n"
- "ldr q14, [x5, #0x10]\n"
- "add x5, x5, #0x20\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
- "smlal2 v9.4s, v30.8h, v3.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "ldr x13, [x20, #0xb0]\n"
+ "ldr x9, [x20, #0xb8]\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x27, x24]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v10.4s, v31.4h, v3.4h\n"
+ "ldr d2, [x23, #0x38]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "ldr x19, [x20, #0xc0]\n"
+ "ldr x28, [x20, #0xc8]\n"
+ "smlal2 v7.4s, v31.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v20.4s, v24.8h, v4.8h\n"
- "ldr d24, [x26, x10]\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "ldr x7, [x25, #0xe8]\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "ldr d27, [x23, x10]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal v11.4s, v30.4h, v4.4h\n"
- "ldr x26, [x25, #0xf0]\n"
- "smlal2 v8.4s, v30.8h, v4.8h\n"
- "smlal v10.4s, v26.4h, v4.4h\n"
- "smlal2 v9.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
+ "ldr x6, [x20, #0xd0]\n"
+ "ldr x27, [x20, #0xd8]\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x5, x24]\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal v10.4s, v30.4h, v4.4h\n"
+ "ldr d3, [x23, #0x40]\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x11, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v7.4s, v30.8h, v4.8h\n"
"smlal v15.4s, v29.4h, v0.4h\n"
- "smlal2 v20.4s, v29.8h, v0.8h\n"
- "smlal v18.4s, v28.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "smlal v11.4s, v22.4h, v0.4h\n"
- "smlal2 v8.4s, v22.8h, v0.8h\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
- "smlal2 v9.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
+ "ldr x11, [x20, #0xe0]\n"
+ "ldr x17, [x20, #0xe8]\n"
+ "smlal2 v16.4s, v29.8h, v0.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x23, #0x48]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal v10.4s, v22.4h, v0.4h\n"
+ "ldr x5, [x20, #0xf0]\n"
+ "ldr q12, [x10, #0x0]\n"
+ "smlal v6.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v28.8h, v0.8h\n"
+ "ldr q19, [x1, #0x0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
"smlal v15.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x10]\n"
- "usubl v28.8h, v28.8b, v7.8b\n"
- "smlal v18.4s, v23.4h, v1.4h\n"
- "ldr x23, [x25, #0xf8]\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v1.4h\n"
- "smlal2 v8.4s, v25.8h, v1.8h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
- "smlal2 v9.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr q29, [x1, #0x10]\n"
+ "subs x3, x3, #0x1\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "ldr d28, [x26, x24]\n"
+ "ldr d0, [x23, #0x50]\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "smlal v10.4s, v25.4h, v1.4h\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "ldr x25, [x20, #0xf8]\n"
+ "smlal v6.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "add x10, x10, #0x20\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
"smlal v15.4s, v23.4h, v2.4h\n"
- "smlal2 v20.4s, v23.8h, v2.8h\n"
- "ldr d23, [x20, x10]\n"
- "usubl v23.8h, v23.8b, v7.8b\n"
- "smlal v18.4s, v31.4h, v2.4h\n"
- "ldr x22, [x25, #0x100]\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v8.4s, v24.8h, v2.8h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v9.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
+ "add x1, x1, #0x20\n"
+ "smlal2 v16.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x12, x24]\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "smlal v10.4s, v24.4h, v2.4h\n"
+ "ldr d1, [x23, #0x58]\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "ldr x26, [x20, #0x100]\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
"smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v20.4s, v31.8h, v3.8h\n"
- "ldr d31, [x13, x10]\n"
- "usubl v31.8h, v31.8b, v7.8b\n"
- "smlal v18.4s, v30.4h, v3.4h\n"
- "ldr x20, [x25, #0x108]\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v8.4s, v27.8h, v3.8h\n"
- "smlal v10.4s, v23.4h, v3.4h\n"
- "smlal2 v9.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v16.4s, v31.8h, v3.8h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "ldr d31, [x14, x24]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "ldr d2, [x23, #0x60]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "ldr x12, [x20, #0x108]\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
"smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v20.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x10]\n"
- "usubl v30.8h, v30.8b, v7.8b\n"
- "smlal v18.4s, v26.4h, v4.4h\n"
- "ldr x13, [x25, #0x110]\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x10]\n"
- "usubl v26.8h, v26.8b, v7.8b\n"
- "smlal v11.4s, v23.4h, v4.4h\n"
- "ldr x21, [x25, #0x118]\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v16.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x15, x24]\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal v10.4s, v23.4h, v4.4h\n"
+ "ldr d3, [x23, #0x68]\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x21, x24]\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal2 v7.4s, v23.8h, v4.8h\n"
"smlal v15.4s, v22.4h, v0.4h\n"
- "smlal2 v20.4s, v22.8h, v0.8h\n"
- "ldr d22, [x0, x10]\n"
- "usubl v22.8h, v22.8b, v7.8b\n"
- "smlal v18.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v8.4s, v31.8h, v0.8h\n"
- "smlal v10.4s, v30.4h, v0.4h\n"
- "smlal2 v9.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
+ "ldr x14, [x20, #0x110]\n"
+ "ldr x21, [x20, #0x118]\n"
+ "smlal2 v16.4s, v22.8h, v0.8h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x23, #0x70]\n"
+ "ldr d22, [x9, x24]\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "smlal v10.4s, v31.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
"smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v20.4s, v25.8h, v1.8h\n"
- "ldr d25, [x11, x10]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal v18.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "smlal2 v8.4s, v30.8h, v1.8h\n"
- "smlal v10.4s, v26.4h, v1.4h\n"
- "smlal2 v9.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v16.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x2, x24]\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal v10.4s, v30.4h, v1.4h\n"
+ "ldr d0, [x23, #0x78]\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v20.4s, v24.8h, v2.8h\n"
- "ldr d24, [x24, x10]\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v18.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v8.4s, v26.8h, v2.8h\n"
- "smlal v10.4s, v25.4h, v2.4h\n"
- "smlal2 v9.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v16.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x13, x24]\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal v10.4s, v26.4h, v2.4h\n"
+ "ldr d1, [x23, #0x80]\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
"smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "ldr d27, [x15, x10]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal v18.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "smlal v10.4s, v24.4h, v3.4h\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr d27, [x19, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "ldr d2, [x23, #0x88]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v8.4s, v23.8h, v3.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
"smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v20.4s, v23.8h, v4.8h\n"
- "ldr d23, [x9, x10]\n"
- "usubl v23.8h, v23.8b, v7.8b\n"
- "smlal v18.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d28, [x12, x10]\n"
- "usubl v28.8h, v28.8b, v7.8b\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "smlal v10.4s, v22.4h, v4.4h\n"
- "smlal2 v9.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v16.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x28, x24]\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "ldr d3, [x23, #0x90]\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v22.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x11, x24]\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
"smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ldr d31, [x27, x10]\n"
- "usubl v31.8h, v31.8b, v7.8b\n"
- "smlal v18.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v8.4s, v27.8h, v0.8h\n"
- "smlal v10.4s, v23.4h, v0.4h\n"
- "smlal2 v9.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x6, x24]\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal v10.4s, v27.4h, v0.4h\n"
+ "ldr d4, [x23, #0x98]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v20.4s, v30.8h, v1.8h\n"
- "ldr d30, [x28, x10]\n"
- "usubl v30.8h, v30.8b, v7.8b\n"
- "smlal v18.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "smlal v10.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x27, x24]\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "ldr d0, [x23, #0xa0]\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v31.4h, v1.4h\n"
+ "smlal2 v8.4s, v26.8h, v1.8h\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
"smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v20.4s, v26.8h, v2.8h\n"
- "ldr d26, [x7, x10]\n"
- "usubl v26.8h, v26.8b, v7.8b\n"
- "smlal v18.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "smlal v10.4s, v30.4h, v2.4h\n"
- "smlal2 v9.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v16.4s, v26.8h, v2.8h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "ldr d26, [x17, x24]\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "ldr d1, [x23, #0xa8]\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v20.4s, v25.8h, v3.8h\n"
- "ldr d25, [x26, x10]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "smlal v10.4s, v28.4h, v3.4h\n"
- "smlal2 v9.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d25, [x5, x24]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "ldr d2, [x23, #0xb0]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v28.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v20.4s, v24.8h, v4.8h\n"
- "ldr d24, [x23, x10]\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v18.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "smlal v10.4s, v26.4h, v4.4h\n"
- "smlal2 v9.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "add x3, x3, #0xc8\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x25, x24]\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "ldr d3, [x23, #0xb8]\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v7.4s, v28.8h, v4.8h\n"
"smlal v15.4s, v27.4h, v0.4h\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "smlal2 v20.4s, v27.8h, v0.8h\n"
- "ldr d27, [x22, x10]\n"
- "smlal v18.4s, v23.4h, v0.4h\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
- "ldr d25, [x20, x10]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v9.4s, v24.8h, v0.8h\n"
+ "smlal2 v16.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x26, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v8.4s, v22.8h, v4.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x23, #0xc0]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "add x23, x23, #0xc8\n"
+ "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal2 v7.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x12, x24]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal2 v8.4s, v23.8h, v0.8h\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
"smlal v15.4s, v23.4h, v1.4h\n"
- "smlal2 v20.4s, v23.8h, v1.8h\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "ldr d24, [x13, x10]\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v27.4h, v1.4h\n"
- "smlal2 v9.4s, v27.8h, v1.8h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x14, x24]\n"
+ "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal2 v8.4s, v31.8h, v1.8h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
"smlal v15.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "smlal v18.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "smlal v11.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x10]\n"
- "add x10, x10, #0x8\n"
- "smlal v10.4s, v25.4h, v2.4h\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x24]\n"
+ "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v8.4s, v30.8h, v2.8h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "add x24, x24, #0x8\n"
"smlal v15.4s, v30.4h, v3.4h\n"
- "smlal2 v20.4s, v30.8h, v3.8h\n"
- "smlal v18.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "smlal v10.4s, v24.4h, v3.4h\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v16.4s, v30.8h, v3.8h\n"
+ "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
"smlal v15.4s, v28.4h, v4.4h\n"
- "smlal2 v20.4s, v28.8h, v4.8h\n"
- "smlal v18.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "smlal v10.4s, v27.4h, v4.4h\n"
- "smlal2 v9.4s, v27.8h, v4.8h\n"
- "sqrdmulh v15.4s, v15.4s, v6.4s\n"
- "sqrdmulh v20.4s, v20.4s, v17.4s\n"
- "sqrdmulh v18.4s, v18.4s, v6.4s\n"
- "sqrdmulh v5.4s, v5.4s, v17.4s\n"
- "and v1.16b, v15.16b, v21.16b\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "and v29.16b, v20.16b, v14.16b\n"
- "and v3.16b, v18.16b, v21.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v2.16b, v5.16b, v14.16b\n"
- "sqrdmulh v11.4s, v11.4s, v6.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "sqdmulh v15.4s, v15.4s, v12.4s\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
+ "sqdmulh v17.4s, v17.4s, v12.4s\n"
+ "smlal2 v16.4s, v28.8h, v4.8h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "sqdmulh v10.4s, v10.4s, v12.4s\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "sqdmulh v6.4s, v6.4s, v12.4s\n"
+ "and v23.16b, v15.16b, v19.16b\n"
+ "sqdmulh v16.4s, v16.4s, v20.4s\n"
+ "and v22.16b, v17.16b, v19.16b\n"
+ "sqdmulh v8.4s, v8.4s, v20.4s\n"
+ "and v21.16b, v10.16b, v19.16b\n"
+ "sqdmulh v7.4s, v7.4s, v20.4s\n"
+ "and v26.16b, v6.16b, v19.16b\n"
+ "sqdmulh v5.4s, v5.4s, v20.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v2.16b, v8.16b, v29.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v7.16b, v29.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v25.16b, v5.16b, v29.16b\n"
+ "sqadd v15.4s, v15.4s, v23.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
"sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v1.4s\n"
- "sqrdmulh v10.4s, v10.4s, v6.4s\n"
- "and v0.16b, v11.16b, v21.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sqadd v18.4s, v18.4s, v3.4s\n"
- "sqadd v5.4s, v5.4s, v2.4s\n"
- "and v27.16b, v8.16b, v14.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "add v15.4s, v15.4s, v19.4s\n"
- "srshl v20.4s, v20.4s, v14.4s\n"
- "srshl v18.4s, v18.4s, v21.4s\n"
- "srshl v5.4s, v5.4s, v14.4s\n"
- "smin v15.4s, v15.4s, v12.4s\n"
- "add v20.4s, v20.4s, v19.4s\n"
- "add v18.4s, v18.4s, v19.4s\n"
- "smax v15.4s, v15.4s, v16.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smin v18.4s, v18.4s, v12.4s\n"
- "add v5.4s, v5.4s, v19.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "uzp1 v15.16b, v15.16b, v20.16b\n"
- "sqadd v11.4s, v11.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v17.4s, v17.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v2.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v7.4s, v7.4s, v3.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "sqadd v5.4s, v5.4s, v25.4s\n"
+ "srshl v16.4s, v16.4s, v29.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v7.4s, v7.4s, v29.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v15.8h, v16.4s\n"
+ "sqxtn2 v17.8h, v8.4s\n"
+ "sqxtn2 v10.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v15.8h, v15.8h, v18.8h\n"
+ "sqadd v17.8h, v17.8h, v18.8h\n"
+ "sqadd v10.8h, v10.8h, v18.8h\n"
+ "sqadd v6.8h, v6.8h, v18.8h\n"
+ "smax v15.8h, v15.8h, v11.8h\n"
+ "smax v17.8h, v17.8h, v11.8h\n"
+ "smax v10.8h, v10.8h, v11.8h\n"
+ "smax v6.8h, v6.8h, v11.8h\n"
+ "smin v15.8h, v15.8h, v13.8h\n"
+ "smin v17.8h, v17.8h, v13.8h\n"
+ "smin v10.8h, v10.8h, v13.8h\n"
+ "smin v6.8h, v6.8h, v13.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x17, x1]\n"
- "smax v5.4s, v5.4s, v16.4s\n"
- "sqadd v8.4s, v8.4s, v27.4s\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "and v30.16b, v10.16b, v21.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "uzp1 v18.16b, v18.16b, v5.16b\n"
- "add v11.4s, v11.4s, v19.4s\n"
- "srshl v8.4s, v8.4s, v14.4s\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
- "str d18, [x16, x1]\n"
- "smin v11.4s, v11.4s, v12.4s\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "add v8.4s, v8.4s, v19.4s\n"
- "sqadd v10.4s, v10.4s, v30.4s\n"
- "smax v11.4s, v11.4s, v16.4s\n"
- "smin v8.4s, v8.4s, v12.4s\n"
- "and v6.16b, v9.16b, v14.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "smax v8.4s, v8.4s, v16.4s\n"
- "srshl v10.4s, v10.4s, v21.4s\n"
- "uzp1 v11.16b, v11.16b, v8.16b\n"
- "add v10.4s, v10.4s, v19.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "str d11, [x6, x1]\n"
- "smin v10.4s, v10.4s, v12.4s\n"
- "sqadd v9.4s, v9.4s, v6.4s\n"
- "smax v10.4s, v10.4s, v16.4s\n"
- "srshl v9.4s, v9.4s, v14.4s\n"
- "add v9.4s, v9.4s, v19.4s\n"
- "smin v9.4s, v9.4s, v12.4s\n"
- "smax v9.4s, v9.4s, v16.4s\n"
- "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d15, [x16, x22]\n"
"uzp1 v10.16b, v10.16b, v10.16b\n"
- "str d10, [x8, x1]\n"
- "add x1, x1, #0x8\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x12, #0x0]\n"
- "mov v18.16b, v15.16b\n"
- "ldr q20, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
- "mov v11.16b, v15.16b\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d17, [x8, x22]\n"
+ "str d10, [x4, x22]\n"
+ "str d6, [x7, x22]\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x19, #0x0]\n"
+ "add x22, x22, #0x8\n"
+ "ldr q16, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d0, [x23, #0x0]\n"
+ "ldr d1, [x23, #0x8]\n"
+ "ldr d2, [x23, #0x10]\n"
+ "mov v17.16b, v15.16b\n"
+ "mov v8.16b, v16.16b\n"
+ "ldr d3, [x23, #0x18]\n"
+ "ldr d4, [x23, #0x20]\n"
"mov v10.16b, v15.16b\n"
- "ldr d0, [x3, #0x0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "mov v5.16b, v20.16b\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v8.16b, v20.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "mov v9.16b, v20.16b\n"
- "ldr d3, [x3, #0x18]\n"
- "ldr d4, [x3, #0x20]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "ldp x28, x27, [x25, #0x0]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "ldp x26, x13, [x25, #0x10]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "ldp x24, x23, [x25, #0x20]\n"
- "ldp x22, x21, [x25, #0x30]\n"
- "ldp x20, x0, [x25, #0x40]\n"
- "ldr d31, [x28, x10]\n"
- "usubl v31.8h, v31.8b, v7.8b\n"
- "ldr d30, [x27, x10]\n"
- "ldr d29, [x26, x10]\n"
- "usubl v30.8h, v30.8b, v7.8b\n"
- "ldr d28, [x13, x10]\n"
- "ldr d27, [x24, x10]\n"
- "usubl v29.8h, v29.8b, v7.8b\n"
- "ldr d23, [x23, x10]\n"
- "usubl v28.8h, v28.8b, v7.8b\n"
- "ldr d25, [x22, x10]\n"
- "ldr d24, [x21, x10]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "ldr d26, [x20, x10]\n"
- "usubl v23.8h, v23.8b, v7.8b\n"
- "ldr d22, [x0, x10]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "usubl v26.8h, v26.8b, v7.8b\n"
- "usubl v22.8h, v22.8b, v7.8b\n"
+ "mov v7.16b, v16.16b\n"
+ "ldp x28, x6, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "mov v6.16b, v15.16b\n"
+ "mov v5.16b, v16.16b\n"
+ "ldp x5, x2, [x20, #0x20]\n"
+ "ldp x27, x21, [x20, #0x30]\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "ldp x12, x19, [x20, #0x40]\n"
+ "ldr d31, [x28, x24]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "ldr d30, [x6, x24]\n"
+ "ldr d29, [x26, x24]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ldr d28, [x25, x24]\n"
+ "ldr d27, [x5, x24]\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "usubl v29.8h, v29.8b, v9.8b\n"
+ "ldr d23, [x2, x24]\n"
+ "ldr d25, [x27, x24]\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "ldr d24, [x21, x24]\n"
+ "ldr d26, [x12, x24]\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "ldr d22, [x19, x24]\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
"bgt 1b\n"
"2:" // Tail
"smlal v15.4s, v31.4h, v0.4h\n"
- "ldr x20, [x25, #0x50]\n"
- "tst x4, #0x7\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ldr x28, [x25, #0x58]\n"
- "smlal v18.4s, v30.4h, v0.4h\n"
- "ldr x0, [x25, #0x60]\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x10]\n"
- "usubl v31.8h, v31.8b, v7.8b\n"
- "smlal v11.4s, v29.4h, v0.4h\n"
- "ldr x7, [x25, #0x68]\n"
- "smlal2 v8.4s, v29.8h, v0.8h\n"
- "ldr x26, [x25, #0x70]\n"
- "smlal v10.4s, v28.4h, v0.4h\n"
- "ldr x23, [x25, #0x78]\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "ldr d0, [x3, #0x28]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "ldr x19, [x20, #0x50]\n"
+ "ldr d31, [x19, x24]\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal v10.4s, v29.4h, v0.4h\n"
+ "ldr x15, [x20, #0x58]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v6.4s, v28.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x20, #0x60]\n"
+ "ldr x27, [x20, #0x68]\n"
+ "smlal2 v7.4s, v29.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "ldr x20, [x25, #0x80]\n"
- "smlal2 v20.4s, v30.8h, v1.8h\n"
- "ldr d30, [x28, x10]\n"
- "usubl v30.8h, v30.8b, v7.8b\n"
- "smlal v18.4s, v27.4h, v1.4h\n"
- "ldr x22, [x25, #0x88]\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "ldr x13, [x25, #0x90]\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "ldr x21, [x25, #0x98]\n"
- "smlal2 v8.4s, v28.8h, v1.8h\n"
- "ldr x14, [x25, #0xa0]\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
- "ldr x11, [x25, #0xa8]\n"
- "smlal2 v9.4s, v23.8h, v1.8h\n"
- "ldr d1, [x3, #0x30]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr x5, [x20, #0x70]\n"
+ "ldr x11, [x20, #0x78]\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x15, x24]\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "ldr d0, [x23, #0x28]\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v27.8h, v1.8h\n"
+ "ldr x12, [x20, #0x80]\n"
+ "ldr x26, [x20, #0x88]\n"
+ "smlal2 v7.4s, v28.8h, v1.8h\n"
"smlal v15.4s, v27.4h, v2.4h\n"
- "ldr x24, [x25, #0xb0]\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "ldr d27, [x0, x10]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal v18.4s, v25.4h, v2.4h\n"
- "ldr x0, [x25, #0xb8]\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr x15, [x25, #0xc0]\n"
- "smlal v11.4s, v23.4h, v2.4h\n"
- "ldr x9, [x25, #0xc8]\n"
- "smlal2 v8.4s, v23.8h, v2.8h\n"
- "ldr x27, [x25, #0xd0]\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
- "ldr x28, [x25, #0xd8]\n"
- "smlal2 v9.4s, v31.8h, v2.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
+ "ldr x14, [x20, #0x90]\n"
+ "ldr x15, [x20, #0x98]\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x19, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal v10.4s, v23.4h, v2.4h\n"
+ "ldr d1, [x23, #0x30]\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "ldr x21, [x20, #0xa0]\n"
+ "ldr x2, [x20, #0xa8]\n"
+ "smlal2 v7.4s, v23.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "ldr x12, [x25, #0xe0]\n"
- "smlal2 v20.4s, v25.8h, v3.8h\n"
- "ldr d25, [x7, x10]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "ldr x7, [x25, #0xe8]\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "ldr q6, [x2, #0x0]\n"
- "smlal v11.4s, v31.4h, v3.4h\n"
- "ldr q21, [x5, #0x0]\n"
- "smlal2 v8.4s, v31.8h, v3.8h\n"
- "ldr q17, [x2, #0x10]\n"
- "add x2, x2, #0x20\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
- "ldr q14, [x5, #0x10]\n"
- "add x5, x5, #0x20\n"
- "smlal2 v9.4s, v30.8h, v3.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "ldr x13, [x20, #0xb0]\n"
+ "ldr x9, [x20, #0xb8]\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x27, x24]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v10.4s, v31.4h, v3.4h\n"
+ "ldr d2, [x23, #0x38]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "ldr x19, [x20, #0xc0]\n"
+ "ldr x28, [x20, #0xc8]\n"
+ "smlal2 v7.4s, v31.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v20.4s, v24.8h, v4.8h\n"
- "ldr d24, [x26, x10]\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "ldr x26, [x25, #0xf0]\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
- "ldr d27, [x23, x10]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal v11.4s, v30.4h, v4.4h\n"
- "ldr x23, [x25, #0xf8]\n"
- "smlal2 v8.4s, v30.8h, v4.8h\n"
- "smlal v10.4s, v26.4h, v4.4h\n"
- "smlal2 v9.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
+ "ldr x6, [x20, #0xd0]\n"
+ "ldr x27, [x20, #0xd8]\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x5, x24]\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal v10.4s, v30.4h, v4.4h\n"
+ "ldr d3, [x23, #0x40]\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x11, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v7.4s, v30.8h, v4.8h\n"
"smlal v15.4s, v29.4h, v0.4h\n"
- "smlal2 v20.4s, v29.8h, v0.8h\n"
- "smlal v18.4s, v28.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "smlal v11.4s, v22.4h, v0.4h\n"
- "smlal2 v8.4s, v22.8h, v0.8h\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
- "smlal2 v9.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
+ "ldr x11, [x20, #0xe0]\n"
+ "ldr x17, [x20, #0xe8]\n"
+ "smlal2 v16.4s, v29.8h, v0.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x23, #0x48]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal v10.4s, v22.4h, v0.4h\n"
+ "ldr x5, [x20, #0xf0]\n"
+ "ldr x25, [x20, #0xf8]\n"
+ "smlal v6.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v28.8h, v0.8h\n"
+ "ldr q12, [x10, #0x0]\n"
+ "ldr q19, [x1, #0x0]\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
"smlal v15.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x10]\n"
- "usubl v28.8h, v28.8b, v7.8b\n"
- "smlal v18.4s, v23.4h, v1.4h\n"
- "ldr x22, [x25, #0x100]\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v1.4h\n"
- "smlal2 v8.4s, v25.8h, v1.8h\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
- "smlal2 v9.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
+ "ldr q20, [x10, #0x10]\n"
+ "ldr q29, [x1, #0x10]\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "ldr d28, [x26, x24]\n"
+ "ldr d0, [x23, #0x50]\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "smlal v10.4s, v25.4h, v1.4h\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "ldr x26, [x20, #0x100]\n"
+ "smlal v6.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "tst x0, #0x7\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
"smlal v15.4s, v23.4h, v2.4h\n"
- "smlal2 v20.4s, v23.8h, v2.8h\n"
- "ldr d23, [x20, x10]\n"
- "usubl v23.8h, v23.8b, v7.8b\n"
- "smlal v18.4s, v31.4h, v2.4h\n"
- "ldr x20, [x25, #0x108]\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v8.4s, v24.8h, v2.8h\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "smlal2 v9.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
+ "add x10, x10, #0x20\n"
+ "add x1, x1, #0x20\n"
+ "smlal2 v16.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x12, x24]\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "smlal v10.4s, v24.4h, v2.4h\n"
+ "ldr d1, [x23, #0x58]\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "ldr x12, [x20, #0x108]\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
"smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v20.4s, v31.8h, v3.8h\n"
- "ldr d31, [x13, x10]\n"
- "usubl v31.8h, v31.8b, v7.8b\n"
- "smlal v18.4s, v30.4h, v3.4h\n"
- "ldr x13, [x25, #0x110]\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v8.4s, v27.8h, v3.8h\n"
- "smlal v10.4s, v23.4h, v3.4h\n"
- "smlal2 v9.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v16.4s, v31.8h, v3.8h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "ldr d31, [x14, x24]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "ldr d2, [x23, #0x60]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "ldr x14, [x20, #0x110]\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
"smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v20.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x10]\n"
- "usubl v30.8h, v30.8b, v7.8b\n"
- "smlal v18.4s, v26.4h, v4.4h\n"
- "ldr x21, [x25, #0x118]\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x10]\n"
- "usubl v26.8h, v26.8b, v7.8b\n"
- "smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v16.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x15, x24]\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal v10.4s, v23.4h, v4.4h\n"
+ "ldr d3, [x23, #0x68]\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x21, x24]\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal2 v7.4s, v23.8h, v4.8h\n"
"smlal v15.4s, v22.4h, v0.4h\n"
- "smlal2 v20.4s, v22.8h, v0.8h\n"
- "ldr d22, [x0, x10]\n"
- "usubl v22.8h, v22.8b, v7.8b\n"
- "smlal v18.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v8.4s, v31.8h, v0.8h\n"
- "smlal v10.4s, v30.4h, v0.4h\n"
- "smlal2 v9.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
+ "ldr x21, [x20, #0x118]\n"
+ "smlal2 v16.4s, v22.8h, v0.8h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x23, #0x70]\n"
+ "ldr d22, [x9, x24]\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "smlal v10.4s, v31.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
"smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v20.4s, v25.8h, v1.8h\n"
- "ldr d25, [x11, x10]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal v18.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "smlal2 v8.4s, v30.8h, v1.8h\n"
- "smlal v10.4s, v26.4h, v1.4h\n"
- "smlal2 v9.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v16.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x2, x24]\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal v10.4s, v30.4h, v1.4h\n"
+ "ldr d0, [x23, #0x78]\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v20.4s, v24.8h, v2.8h\n"
- "ldr d24, [x24, x10]\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v18.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v8.4s, v26.8h, v2.8h\n"
- "smlal v10.4s, v25.4h, v2.4h\n"
- "smlal2 v9.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v16.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x13, x24]\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal v10.4s, v26.4h, v2.4h\n"
+ "ldr d1, [x23, #0x80]\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
"smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "ldr d27, [x15, x10]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal v18.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "smlal v10.4s, v24.4h, v3.4h\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr d27, [x19, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "ldr d2, [x23, #0x88]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v8.4s, v23.8h, v3.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
"smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v20.4s, v23.8h, v4.8h\n"
- "ldr d23, [x9, x10]\n"
- "usubl v23.8h, v23.8b, v7.8b\n"
- "smlal v18.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d28, [x12, x10]\n"
- "usubl v28.8h, v28.8b, v7.8b\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "smlal v10.4s, v22.4h, v4.4h\n"
- "smlal2 v9.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v16.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x28, x24]\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "ldr d3, [x23, #0x90]\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v22.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x11, x24]\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
"smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "ldr d31, [x27, x10]\n"
- "usubl v31.8h, v31.8b, v7.8b\n"
- "smlal v18.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v8.4s, v27.8h, v0.8h\n"
- "smlal v10.4s, v23.4h, v0.4h\n"
- "smlal2 v9.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x6, x24]\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal v10.4s, v27.4h, v0.4h\n"
+ "ldr d4, [x23, #0x98]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v20.4s, v30.8h, v1.8h\n"
- "ldr d30, [x28, x10]\n"
- "usubl v30.8h, v30.8b, v7.8b\n"
- "smlal v18.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "smlal v10.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x27, x24]\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "ldr d0, [x23, #0xa0]\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v31.4h, v1.4h\n"
+ "smlal2 v8.4s, v26.8h, v1.8h\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
"smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v20.4s, v26.8h, v2.8h\n"
- "ldr d26, [x7, x10]\n"
- "usubl v26.8h, v26.8b, v7.8b\n"
- "smlal v18.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "smlal v10.4s, v30.4h, v2.4h\n"
- "smlal2 v9.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
+ "smlal2 v16.4s, v26.8h, v2.8h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "ldr d26, [x17, x24]\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "ldr d1, [x23, #0xa8]\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v20.4s, v25.8h, v3.8h\n"
- "ldr d25, [x26, x10]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "smlal v10.4s, v28.4h, v3.4h\n"
- "smlal2 v9.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d25, [x5, x24]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "ldr d2, [x23, #0xb0]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v28.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v20.4s, v24.8h, v4.8h\n"
- "ldr d24, [x23, x10]\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v18.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "smlal v10.4s, v26.4h, v4.4h\n"
- "smlal2 v9.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x25, x24]\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "ldr d3, [x23, #0xb8]\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v7.4s, v28.8h, v4.8h\n"
"smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v20.4s, v27.8h, v0.8h\n"
- "ldr d27, [x22, x10]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal v18.4s, v23.4h, v0.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
- "ldr d25, [x20, x10]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "smlal2 v9.4s, v24.8h, v0.8h\n"
+ "smlal2 v16.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x26, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v8.4s, v22.8h, v4.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x23, #0xc0]\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal2 v7.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x12, x24]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal2 v8.4s, v23.8h, v0.8h\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
"smlal v15.4s, v23.4h, v1.4h\n"
- "smlal2 v20.4s, v23.8h, v1.8h\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "ldr d24, [x13, x10]\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v27.4h, v1.4h\n"
- "smlal2 v9.4s, v27.8h, v1.8h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x14, x24]\n"
+ "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal2 v8.4s, v31.8h, v1.8h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
"smlal v15.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "smlal v18.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "smlal v11.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x10]\n"
- "add x10, x10, #0x8\n"
- "smlal v10.4s, v25.4h, v2.4h\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal2 v9.4s, v25.8h, v2.8h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x24]\n"
+ "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v8.4s, v30.8h, v2.8h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "add x24, x24, #0x8\n"
"smlal v15.4s, v30.4h, v3.4h\n"
- "smlal2 v20.4s, v30.8h, v3.8h\n"
- "smlal v18.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "smlal v10.4s, v24.4h, v3.4h\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v16.4s, v30.8h, v3.8h\n"
+ "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
"smlal v15.4s, v28.4h, v4.4h\n"
- "smlal2 v20.4s, v28.8h, v4.8h\n"
- "smlal v18.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "smlal v10.4s, v27.4h, v4.4h\n"
- "smlal2 v9.4s, v27.8h, v4.8h\n"
- "sqrdmulh v15.4s, v15.4s, v6.4s\n"
- "sqrdmulh v20.4s, v20.4s, v17.4s\n"
- "sqrdmulh v18.4s, v18.4s, v6.4s\n"
- "sqrdmulh v5.4s, v5.4s, v17.4s\n"
- "and v1.16b, v15.16b, v21.16b\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "and v29.16b, v20.16b, v14.16b\n"
- "and v3.16b, v18.16b, v21.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v2.16b, v5.16b, v14.16b\n"
- "sqrdmulh v11.4s, v11.4s, v6.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "sqdmulh v15.4s, v15.4s, v12.4s\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
+ "sqdmulh v17.4s, v17.4s, v12.4s\n"
+ "smlal2 v16.4s, v28.8h, v4.8h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "sqdmulh v10.4s, v10.4s, v12.4s\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "sqdmulh v6.4s, v6.4s, v12.4s\n"
+ "and v23.16b, v15.16b, v19.16b\n"
+ "sqdmulh v16.4s, v16.4s, v20.4s\n"
+ "and v22.16b, v17.16b, v19.16b\n"
+ "sqdmulh v8.4s, v8.4s, v20.4s\n"
+ "and v21.16b, v10.16b, v19.16b\n"
+ "sqdmulh v7.4s, v7.4s, v20.4s\n"
+ "and v26.16b, v6.16b, v19.16b\n"
+ "sqdmulh v5.4s, v5.4s, v20.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v2.16b, v8.16b, v29.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v7.16b, v29.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v25.16b, v5.16b, v29.16b\n"
+ "sqadd v15.4s, v15.4s, v23.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
"sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v1.4s\n"
- "sqrdmulh v10.4s, v10.4s, v6.4s\n"
- "and v0.16b, v11.16b, v21.16b\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "srshl v15.4s, v15.4s, v21.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sqadd v18.4s, v18.4s, v3.4s\n"
- "sqadd v5.4s, v5.4s, v2.4s\n"
- "and v27.16b, v8.16b, v14.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "add v15.4s, v15.4s, v19.4s\n"
- "srshl v20.4s, v20.4s, v14.4s\n"
- "srshl v18.4s, v18.4s, v21.4s\n"
- "srshl v5.4s, v5.4s, v14.4s\n"
- "smin v15.4s, v15.4s, v12.4s\n"
- "add v20.4s, v20.4s, v19.4s\n"
- "add v18.4s, v18.4s, v19.4s\n"
- "smax v15.4s, v15.4s, v16.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smin v18.4s, v18.4s, v12.4s\n"
- "add v5.4s, v5.4s, v19.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "smin v5.4s, v5.4s, v12.4s\n"
- "uzp1 v15.16b, v15.16b, v20.16b\n"
- "sqadd v11.4s, v11.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v17.4s, v17.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v2.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v7.4s, v7.4s, v3.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "sqadd v5.4s, v5.4s, v25.4s\n"
+ "srshl v16.4s, v16.4s, v29.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v7.4s, v7.4s, v29.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v15.8h, v16.4s\n"
+ "sqxtn2 v17.8h, v8.4s\n"
+ "sqxtn2 v10.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v15.8h, v15.8h, v18.8h\n"
+ "sqadd v17.8h, v17.8h, v18.8h\n"
+ "sqadd v10.8h, v10.8h, v18.8h\n"
+ "sqadd v6.8h, v6.8h, v18.8h\n"
+ "smax v15.8h, v15.8h, v11.8h\n"
+ "smax v17.8h, v17.8h, v11.8h\n"
+ "smax v10.8h, v10.8h, v11.8h\n"
+ "smax v6.8h, v6.8h, v11.8h\n"
+ "smin v15.8h, v15.8h, v13.8h\n"
+ "smin v17.8h, v17.8h, v13.8h\n"
+ "smin v10.8h, v10.8h, v13.8h\n"
+ "smin v6.8h, v6.8h, v13.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x17, x1]\n"
- "smax v5.4s, v5.4s, v16.4s\n"
- "sqadd v8.4s, v8.4s, v27.4s\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "and v30.16b, v10.16b, v21.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "uzp1 v18.16b, v18.16b, v5.16b\n"
- "add v11.4s, v11.4s, v19.4s\n"
- "srshl v8.4s, v8.4s, v14.4s\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
- "str d18, [x16, x1]\n"
- "smin v11.4s, v11.4s, v12.4s\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "add v8.4s, v8.4s, v19.4s\n"
- "sqadd v10.4s, v10.4s, v30.4s\n"
- "smax v11.4s, v11.4s, v16.4s\n"
- "smin v8.4s, v8.4s, v12.4s\n"
- "and v6.16b, v9.16b, v14.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "smax v8.4s, v8.4s, v16.4s\n"
- "srshl v10.4s, v10.4s, v21.4s\n"
- "uzp1 v11.16b, v11.16b, v8.16b\n"
- "add v10.4s, v10.4s, v19.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "str d11, [x6, x1]\n"
- "smin v10.4s, v10.4s, v12.4s\n"
- "sqadd v9.4s, v9.4s, v6.4s\n"
- "smax v10.4s, v10.4s, v16.4s\n"
- "srshl v9.4s, v9.4s, v14.4s\n"
- "add v9.4s, v9.4s, v19.4s\n"
- "smin v9.4s, v9.4s, v12.4s\n"
- "smax v9.4s, v9.4s, v16.4s\n"
- "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d15, [x16, x22]\n"
"uzp1 v10.16b, v10.16b, v10.16b\n"
- "str d10, [x8, x1]\n"
- "add x1, x1, #0x8\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d17, [x8, x22]\n"
+ "str d10, [x4, x22]\n"
+ "str d6, [x7, x22]\n"
+ "add x22, x22, #0x8\n"
"beq 124f\n"
- "add x3, x3, #0xc8\n"
+ "add x23, x23, #0xc8\n"
"3:" // Oddments
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x4, #2, 5f\n"
- "ld1 { v15.4s }, [x12], #0x10\n"
- "tbz x4, #1, 4f\n"
- "ld1 { v20.d }[0], [x12], #0x8\n"
- "tbz x4, #0, 7f\n"
- "ld1 { v20.s }[2], [x12]\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x0, #2, 5f\n"
+ "ld1 { v15.4s }, [x19], #0x10\n"
+ "tbz x0, #1, 4f\n"
+ "ld1 { v16.d }[0], [x19], #0x8\n"
+ "tbz x0, #0, 7f\n"
+ "ld1 { v16.s }[2], [x19]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x4, #0, 7f\n"
- "ld1 { v20.s }[0], [x12]\n"
+ "tbz x0, #0, 7f\n"
+ "ld1 { v16.s }[0], [x19]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x4, #1, 6f\n"
- "ld1 { v15.d }[0], [x12], #0x8\n"
- "tbz x4, #0, 7f\n"
- "ld1 { v15.s }[2], [x12]\n"
+ "tbz x0, #1, 6f\n"
+ "ld1 { v15.d }[0], [x19], #0x8\n"
+ "tbz x0, #0, 7f\n"
+ "ld1 { v15.s }[2], [x19]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 7f\n"
- "ld1 { v15.s }[0], [x12]\n"
+ "tbz x0, #0, 7f\n"
+ "ld1 { v15.s }[0], [x19]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "mov v18.16b, v15.16b\n"
- "ldr d0, [x3, #0x0]\n"
- "mov v5.16b, v20.16b\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v11.16b, v15.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "mov v8.16b, v20.16b\n"
- "ldr d3, [x3, #0x18]\n"
+ "ldr d0, [x23, #0x0]\n"
+ "ldr d1, [x23, #0x8]\n"
+ "mov v17.16b, v15.16b\n"
+ "mov v8.16b, v16.16b\n"
+ "ldr d2, [x23, #0x10]\n"
+ "ldr d3, [x23, #0x18]\n"
"mov v10.16b, v15.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "mov v9.16b, v20.16b\n"
- "ldp x28, x27, [x25, #0x0]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "ldp x26, x13, [x25, #0x10]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "ldp x24, x23, [x25, #0x20]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "ldp x22, x21, [x25, #0x30]\n"
- "ldp x20, x0, [x25, #0x40]\n"
- "add x28, x28, x10\n"
- "add x27, x27, x10\n"
- "add x26, x26, x10\n"
- "add x13, x13, x10\n"
- "add x24, x24, x10\n"
- "add x23, x23, x10\n"
- "add x22, x22, x10\n"
- "add x21, x21, x10\n"
- "add x20, x20, x10\n"
- "add x0, x0, x10\n"
- "tbz x4, #2, 9f\n"
+ "mov v7.16b, v16.16b\n"
+ "ldr d4, [x23, #0x20]\n"
+ "ldp x28, x6, [x20, #0x0]\n"
+ "mov v6.16b, v15.16b\n"
+ "mov v5.16b, v16.16b\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x5, x2, [x20, #0x20]\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "ldp x27, x21, [x20, #0x30]\n"
+ "ldp x12, x19, [x20, #0x40]\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "add x28, x28, x24\n"
+ "add x6, x6, x24\n"
+ "add x26, x26, x24\n"
+ "add x25, x25, x24\n"
+ "add x5, x5, x24\n"
+ "add x2, x2, x24\n"
+ "add x27, x27, x24\n"
+ "add x21, x21, x24\n"
+ "add x12, x12, x24\n"
+ "add x19, x19, x24\n"
+ "tbz x0, #2, 9f\n"
"ld1 { v31.s }[0], [x28], #0x4\n"
- "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x6], #0x4\n"
"ld1 { v29.s }[0], [x26], #0x4\n"
- "ld1 { v28.s }[0], [x13], #0x4\n"
- "ld1 { v27.s }[0], [x24], #0x4\n"
- "ld1 { v23.s }[0], [x23], #0x4\n"
- "ld1 { v25.s }[0], [x22], #0x4\n"
+ "ld1 { v28.s }[0], [x25], #0x4\n"
+ "ld1 { v27.s }[0], [x5], #0x4\n"
+ "ld1 { v23.s }[0], [x2], #0x4\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
"ld1 { v24.s }[0], [x21], #0x4\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
- "ld1 { v22.s }[0], [x0], #0x4\n"
- "tbz x4, #1, 8f\n"
+ "ld1 { v26.s }[0], [x12], #0x4\n"
+ "ld1 { v22.s }[0], [x19], #0x4\n"
+ "tbz x0, #1, 8f\n"
"ld1 { v31.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x6], #0x2\n"
"ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x13], #0x2\n"
- "ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
- "ld1 { v25.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v27.h }[2], [x5], #0x2\n"
+ "ld1 { v23.h }[2], [x2], #0x2\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
"ld1 { v24.h }[2], [x21], #0x2\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
- "ld1 { v22.h }[2], [x0], #0x2\n"
- "tbz x4, #0, 11f\n"
+ "ld1 { v26.h }[2], [x12], #0x2\n"
+ "ld1 { v22.h }[2], [x19], #0x2\n"
+ "tbz x0, #0, 11f\n"
"ld1 { v31.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x6]\n"
"ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x13]\n"
- "ld1 { v27.b }[6], [x24]\n"
- "ld1 { v23.b }[6], [x23]\n"
- "ld1 { v25.b }[6], [x22]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v27.b }[6], [x5]\n"
+ "ld1 { v23.b }[6], [x2]\n"
+ "ld1 { v25.b }[6], [x27]\n"
"ld1 { v24.b }[6], [x21]\n"
- "ld1 { v26.b }[6], [x20]\n"
- "ld1 { v22.b }[6], [x0]\n"
+ "ld1 { v26.b }[6], [x12]\n"
+ "ld1 { v22.b }[6], [x19]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x4, #0, 11f\n"
+ "tbz x0, #0, 11f\n"
"ld1 { v31.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x6]\n"
"ld1 { v29.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x13]\n"
- "ld1 { v27.b }[4], [x24]\n"
- "ld1 { v23.b }[4], [x23]\n"
- "ld1 { v25.b }[4], [x22]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v27.b }[4], [x5]\n"
+ "ld1 { v23.b }[4], [x2]\n"
+ "ld1 { v25.b }[4], [x27]\n"
"ld1 { v24.b }[4], [x21]\n"
- "ld1 { v26.b }[4], [x20]\n"
- "ld1 { v22.b }[4], [x0]\n"
+ "ld1 { v26.b }[4], [x12]\n"
+ "ld1 { v22.b }[4], [x19]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x4, #1, 10f\n"
+ "tbz x0, #1, 10f\n"
"ld1 { v31.h }[0], [x28], #0x2\n"
- "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x6], #0x2\n"
"ld1 { v29.h }[0], [x26], #0x2\n"
- "ld1 { v28.h }[0], [x13], #0x2\n"
- "ld1 { v27.h }[0], [x24], #0x2\n"
- "ld1 { v23.h }[0], [x23], #0x2\n"
- "ld1 { v25.h }[0], [x22], #0x2\n"
+ "ld1 { v28.h }[0], [x25], #0x2\n"
+ "ld1 { v27.h }[0], [x5], #0x2\n"
+ "ld1 { v23.h }[0], [x2], #0x2\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
"ld1 { v24.h }[0], [x21], #0x2\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
- "ld1 { v22.h }[0], [x0], #0x2\n"
- "tbz x4, #0, 11f\n"
+ "ld1 { v26.h }[0], [x12], #0x2\n"
+ "ld1 { v22.h }[0], [x19], #0x2\n"
+ "tbz x0, #0, 11f\n"
"ld1 { v31.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x6]\n"
"ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x13]\n"
- "ld1 { v27.b }[2], [x24]\n"
- "ld1 { v23.b }[2], [x23]\n"
- "ld1 { v25.b }[2], [x22]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v27.b }[2], [x5]\n"
+ "ld1 { v23.b }[2], [x2]\n"
+ "ld1 { v25.b }[2], [x27]\n"
"ld1 { v24.b }[2], [x21]\n"
- "ld1 { v26.b }[2], [x20]\n"
- "ld1 { v22.b }[2], [x0]\n"
+ "ld1 { v26.b }[2], [x12]\n"
+ "ld1 { v22.b }[2], [x19]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 11f\n"
+ "tbz x0, #0, 11f\n"
"ld1 { v31.b }[0], [x28]\n"
- "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x6]\n"
"ld1 { v29.b }[0], [x26]\n"
- "ld1 { v28.b }[0], [x13]\n"
- "ld1 { v27.b }[0], [x24]\n"
- "ld1 { v23.b }[0], [x23]\n"
- "ld1 { v25.b }[0], [x22]\n"
+ "ld1 { v28.b }[0], [x25]\n"
+ "ld1 { v27.b }[0], [x5]\n"
+ "ld1 { v23.b }[0], [x2]\n"
+ "ld1 { v25.b }[0], [x27]\n"
"ld1 { v24.b }[0], [x21]\n"
- "ld1 { v26.b }[0], [x20]\n"
- "ld1 { v22.b }[0], [x0]\n"
+ "ld1 { v26.b }[0], [x12]\n"
+ "ld1 { v22.b }[0], [x19]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "ldr x20, [x25, #0x50]\n"
- "usubl v31.8h, v31.8b, v7.8b\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
"smlal v15.4s, v31.4h, v0.4h\n"
- "usubl v30.8h, v30.8b, v7.8b\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "usubl v29.8h, v29.8b, v7.8b\n"
- "usubl v28.8h, v28.8b, v7.8b\n"
- "smlal v18.4s, v30.4h, v0.4h\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "usubl v23.8h, v23.8b, v7.8b\n"
- "smlal v11.4s, v29.4h, v0.4h\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal2 v8.4s, v29.8h, v0.8h\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v28.4h, v0.4h\n"
- "usubl v26.8h, v26.8b, v7.8b\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "usubl v22.8h, v22.8b, v7.8b\n"
+ "ldr x19, [x20, #0x50]\n"
+ "usubl v29.8h, v29.8b, v9.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "smlal v10.4s, v29.4h, v0.4h\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "add x19, x19, x24\n"
+ "smlal2 v7.4s, v29.8h, v0.8h\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v6.4s, v28.4h, v0.4h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v20.4s, v30.8h, v1.8h\n"
- "add x20, x20, x10\n"
- "smlal v18.4s, v27.4h, v1.4h\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "smlal2 v8.4s, v28.8h, v1.8h\n"
- "smlal v10.4s, v23.4h, v1.4h\n"
- "smlal2 v9.4s, v23.8h, v1.8h\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal2 v8.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal2 v7.4s, v28.8h, v1.8h\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v6.4s, v23.4h, v1.4h\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
"smlal v15.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "smlal v18.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "smlal v11.4s, v23.4h, v2.4h\n"
- "smlal2 v8.4s, v23.8h, v2.8h\n"
- "tbz x4, #2, 13f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 12f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "smlal v10.4s, v23.4h, v2.4h\n"
+ "smlal2 v7.4s, v23.8h, v2.8h\n"
+ "tbz x0, #2, 13f\n"
+ "ld1 { v31.s }[0], [x19], #0x4\n"
+ "tbz x0, #1, 12f\n"
+ "ld1 { v31.h }[2], [x19], #0x2\n"
+ "tbz x0, #0, 15f\n"
+ "ld1 { v31.b }[6], [x19]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "tbz x0, #0, 15f\n"
+ "ld1 { v31.b }[4], [x19]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x4, #1, 14f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "tbz x0, #1, 14f\n"
+ "ld1 { v31.h }[0], [x19], #0x2\n"
+ "tbz x0, #0, 15f\n"
+ "ld1 { v31.b }[2], [x19]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "tbz x0, #0, 15f\n"
+ "ld1 { v31.b }[0], [x19]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ldr x15, [x20, #0x58]\n"
+ "smlal v6.4s, v31.4h, v2.4h\n"
+ "smlal2 v5.4s, v31.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "ldr x28, [x25, #0x58]\n"
- "usubl v31.8h, v31.8b, v7.8b\n"
- "smlal2 v20.4s, v25.8h, v3.8h\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "add x28, x28, x10\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v10.4s, v31.4h, v2.4h\n"
- "smlal2 v9.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v31.4h, v3.4h\n"
- "smlal2 v8.4s, v31.8h, v3.8h\n"
- "tbz x4, #2, 17f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "tbz x4, #1, 16f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "add x15, x15, x24\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "smlal v10.4s, v31.4h, v3.4h\n"
+ "smlal2 v7.4s, v31.8h, v3.8h\n"
+ "tbz x0, #2, 17f\n"
+ "ld1 { v30.s }[0], [x15], #0x4\n"
+ "tbz x0, #1, 16f\n"
+ "ld1 { v30.h }[2], [x15], #0x2\n"
+ "tbz x0, #0, 19f\n"
+ "ld1 { v30.b }[6], [x15]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "tbz x0, #0, 19f\n"
+ "ld1 { v30.b }[4], [x15]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x4, #1, 18f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "tbz x0, #1, 18f\n"
+ "ld1 { v30.h }[0], [x15], #0x2\n"
+ "tbz x0, #0, 19f\n"
+ "ld1 { v30.b }[2], [x15]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "tbz x0, #0, 19f\n"
+ "ld1 { v30.b }[0], [x15]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "ldr x19, [x20, #0x60]\n"
+ "smlal v6.4s, v30.4h, v3.4h\n"
+ "smlal2 v5.4s, v30.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "ldr x0, [x25, #0x60]\n"
- "usubl v30.8h, v30.8b, v7.8b\n"
- "smlal2 v20.4s, v24.8h, v4.8h\n"
- "add x0, x0, x10\n"
- "smlal v10.4s, v30.4h, v3.4h\n"
- "smlal2 v9.4s, v30.8h, v3.8h\n"
- "tbz x4, #2, 21f\n"
- "ld1 { v27.s }[0], [x0], #0x4\n"
- "tbz x4, #1, 20f\n"
- "ld1 { v27.h }[2], [x0], #0x2\n"
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[6], [x0]\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "add x19, x19, x24\n"
+ "tbz x0, #2, 21f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x0, #1, 20f\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x0, #0, 23f\n"
+ "ld1 { v27.b }[6], [x19]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[4], [x0]\n"
+ "tbz x0, #0, 23f\n"
+ "ld1 { v27.b }[4], [x19]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
- "tbz x4, #1, 22f\n"
- "ld1 { v27.h }[0], [x0], #0x2\n"
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[2], [x0]\n"
+ "tbz x0, #1, 22f\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x0, #0, 23f\n"
+ "ld1 { v27.b }[2], [x19]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[0], [x0]\n"
+ "tbz x0, #0, 23f\n"
+ "ld1 { v27.b }[0], [x19]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
- "smlal v11.4s, v30.4h, v4.4h\n"
- "ldr d0, [x3, #0x28]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal2 v8.4s, v30.8h, v4.8h\n"
- "ldr x7, [x25, #0x68]\n"
- "smlal v10.4s, v26.4h, v4.4h\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "smlal2 v9.4s, v26.8h, v4.8h\n"
- "add x7, x7, x10\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "ldr d0, [x23, #0x28]\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v27.8h, v4.8h\n"
+ "smlal v10.4s, v30.4h, v4.4h\n"
+ "smlal2 v7.4s, v30.8h, v4.8h\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "ldr x27, [x20, #0x68]\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "add x27, x27, x24\n"
"smlal v15.4s, v29.4h, v0.4h\n"
- "smlal2 v20.4s, v29.8h, v0.8h\n"
- "smlal v18.4s, v28.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "smlal v11.4s, v22.4h, v0.4h\n"
- "smlal2 v8.4s, v22.8h, v0.8h\n"
- "tbz x4, #2, 25f\n"
- "ld1 { v25.s }[0], [x7], #0x4\n"
- "tbz x4, #1, 24f\n"
- "ld1 { v25.h }[2], [x7], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[6], [x7]\n"
+ "smlal2 v16.4s, v29.8h, v0.8h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal2 v8.4s, v28.8h, v0.8h\n"
+ "smlal v10.4s, v22.4h, v0.4h\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "tbz x0, #2, 25f\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "tbz x0, #1, 24f\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "tbz x0, #0, 27f\n"
+ "ld1 { v25.b }[6], [x27]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[4], [x7]\n"
+ "tbz x0, #0, 27f\n"
+ "ld1 { v25.b }[4], [x27]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x4, #1, 26f\n"
- "ld1 { v25.h }[0], [x7], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[2], [x7]\n"
+ "tbz x0, #1, 26f\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "tbz x0, #0, 27f\n"
+ "ld1 { v25.b }[2], [x27]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[0], [x7]\n"
+ "tbz x0, #0, 27f\n"
+ "ld1 { v25.b }[0], [x27]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
- "ldr d1, [x3, #0x30]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal v10.4s, v25.4h, v0.4h\n"
- "ldr x26, [x25, #0x70]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "smlal2 v9.4s, v25.8h, v0.8h\n"
- "add x26, x26, x10\n"
+ "ldr d1, [x23, #0x30]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "ldr x5, [x20, #0x70]\n"
+ "smlal v6.4s, v25.4h, v0.4h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "add x5, x5, x24\n"
"smlal v15.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v18.4s, v23.4h, v1.4h\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "smlal v11.4s, v25.4h, v1.4h\n"
- "smlal2 v8.4s, v25.8h, v1.8h\n"
- "tbz x4, #2, 29f\n"
- "ld1 { v24.s }[0], [x26], #0x4\n"
- "tbz x4, #1, 28f\n"
- "ld1 { v24.h }[2], [x26], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[6], [x26]\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "smlal v10.4s, v25.4h, v1.4h\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "tbz x0, #2, 29f\n"
+ "ld1 { v24.s }[0], [x5], #0x4\n"
+ "tbz x0, #1, 28f\n"
+ "ld1 { v24.h }[2], [x5], #0x2\n"
+ "tbz x0, #0, 31f\n"
+ "ld1 { v24.b }[6], [x5]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[4], [x26]\n"
+ "tbz x0, #0, 31f\n"
+ "ld1 { v24.b }[4], [x5]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x4, #1, 30f\n"
- "ld1 { v24.h }[0], [x26], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[2], [x26]\n"
+ "tbz x0, #1, 30f\n"
+ "ld1 { v24.h }[0], [x5], #0x2\n"
+ "tbz x0, #0, 31f\n"
+ "ld1 { v24.b }[2], [x5]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[0], [x26]\n"
+ "tbz x0, #0, 31f\n"
+ "ld1 { v24.b }[0], [x5]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "ldr d2, [x3, #0x38]\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v24.4h, v1.4h\n"
- "ldr x23, [x25, #0x78]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "smlal2 v9.4s, v24.8h, v1.8h\n"
- "add x23, x23, x10\n"
+ "ldr d2, [x23, #0x38]\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "ldr x11, [x20, #0x78]\n"
+ "smlal v6.4s, v24.4h, v1.4h\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "add x11, x11, x24\n"
"smlal v15.4s, v23.4h, v2.4h\n"
- "smlal2 v20.4s, v23.8h, v2.8h\n"
- "smlal v18.4s, v31.4h, v2.4h\n"
- "smlal2 v5.4s, v31.8h, v2.8h\n"
- "smlal v11.4s, v24.4h, v2.4h\n"
- "smlal2 v8.4s, v24.8h, v2.8h\n"
- "tbz x4, #2, 33f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 32f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "smlal2 v16.4s, v23.8h, v2.8h\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "smlal v10.4s, v24.4h, v2.4h\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "tbz x0, #2, 33f\n"
+ "ld1 { v27.s }[0], [x11], #0x4\n"
+ "tbz x0, #1, 32f\n"
+ "ld1 { v27.h }[2], [x11], #0x2\n"
+ "tbz x0, #0, 35f\n"
+ "ld1 { v27.b }[6], [x11]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "tbz x0, #0, 35f\n"
+ "ld1 { v27.b }[4], [x11]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x4, #1, 34f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "tbz x0, #1, 34f\n"
+ "ld1 { v27.h }[0], [x11], #0x2\n"
+ "tbz x0, #0, 35f\n"
+ "ld1 { v27.b }[2], [x11]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "tbz x0, #0, 35f\n"
+ "ld1 { v27.b }[0], [x11]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
- "ldr d3, [x3, #0x40]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal v10.4s, v27.4h, v2.4h\n"
- "ldr x20, [x25, #0x80]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v9.4s, v27.8h, v2.8h\n"
- "add x20, x20, x10\n"
+ "ldr d3, [x23, #0x40]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "ldr x12, [x20, #0x80]\n"
+ "smlal v6.4s, v27.4h, v2.4h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "add x12, x12, x24\n"
"smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v20.4s, v31.8h, v3.8h\n"
- "smlal v18.4s, v30.4h, v3.4h\n"
- "smlal2 v5.4s, v30.8h, v3.8h\n"
- "smlal v11.4s, v27.4h, v3.4h\n"
- "smlal2 v8.4s, v27.8h, v3.8h\n"
- "tbz x4, #2, 37f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 36f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "smlal2 v16.4s, v31.8h, v3.8h\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "tbz x0, #2, 37f\n"
+ "ld1 { v23.s }[0], [x12], #0x4\n"
+ "tbz x0, #1, 36f\n"
+ "ld1 { v23.h }[2], [x12], #0x2\n"
+ "tbz x0, #0, 39f\n"
+ "ld1 { v23.b }[6], [x12]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x0, #0, 39f\n"
+ "ld1 { v23.b }[4], [x12]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x4, #1, 38f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x0, #1, 38f\n"
+ "ld1 { v23.h }[0], [x12], #0x2\n"
+ "tbz x0, #0, 39f\n"
+ "ld1 { v23.b }[2], [x12]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x0, #0, 39f\n"
+ "ld1 { v23.b }[0], [x12]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
- "ldr d4, [x3, #0x48]\n"
- "usubl v23.8h, v23.8b, v7.8b\n"
- "smlal v10.4s, v23.4h, v3.4h\n"
- "ldr x22, [x25, #0x88]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "smlal2 v9.4s, v23.8h, v3.8h\n"
- "add x22, x22, x10\n"
+ "ldr d4, [x23, #0x48]\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x20, #0x88]\n"
+ "smlal v6.4s, v23.4h, v3.4h\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "add x26, x26, x24\n"
"smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v20.4s, v30.8h, v4.8h\n"
- "smlal v18.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "smlal v11.4s, v23.4h, v4.4h\n"
- "smlal2 v8.4s, v23.8h, v4.8h\n"
- "tbz x4, #2, 41f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 40f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "smlal2 v16.4s, v30.8h, v4.8h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "smlal v10.4s, v23.4h, v4.4h\n"
+ "smlal2 v7.4s, v23.8h, v4.8h\n"
+ "tbz x0, #2, 41f\n"
+ "ld1 { v28.s }[0], [x26], #0x4\n"
+ "tbz x0, #1, 40f\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "tbz x0, #0, 43f\n"
+ "ld1 { v28.b }[6], [x26]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "tbz x0, #0, 43f\n"
+ "ld1 { v28.b }[4], [x26]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
- "tbz x4, #1, 42f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "tbz x0, #1, 42f\n"
+ "ld1 { v28.h }[0], [x26], #0x2\n"
+ "tbz x0, #0, 43f\n"
+ "ld1 { v28.b }[2], [x26]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "tbz x0, #0, 43f\n"
+ "ld1 { v28.b }[0], [x26]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
- "ldr d0, [x3, #0x50]\n"
- "usubl v28.8h, v28.8b, v7.8b\n"
- "smlal v10.4s, v28.4h, v4.4h\n"
- "ldr x13, [x25, #0x90]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "add x13, x13, x10\n"
+ "ldr d0, [x23, #0x50]\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "ldr x14, [x20, #0x90]\n"
+ "smlal v6.4s, v28.4h, v4.4h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "add x14, x14, x24\n"
"smlal v15.4s, v22.4h, v0.4h\n"
- "smlal2 v20.4s, v22.8h, v0.8h\n"
- "smlal v18.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "tbz x4, #2, 45f\n"
- "ld1 { v31.s }[0], [x13], #0x4\n"
- "tbz x4, #1, 44f\n"
- "ld1 { v31.h }[2], [x13], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[6], [x13]\n"
+ "smlal2 v16.4s, v22.8h, v0.8h\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "tbz x0, #2, 45f\n"
+ "ld1 { v31.s }[0], [x14], #0x4\n"
+ "tbz x0, #1, 44f\n"
+ "ld1 { v31.h }[2], [x14], #0x2\n"
+ "tbz x0, #0, 47f\n"
+ "ld1 { v31.b }[6], [x14]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[4], [x13]\n"
+ "tbz x0, #0, 47f\n"
+ "ld1 { v31.b }[4], [x14]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x4, #1, 46f\n"
- "ld1 { v31.h }[0], [x13], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[2], [x13]\n"
+ "tbz x0, #1, 46f\n"
+ "ld1 { v31.h }[0], [x14], #0x2\n"
+ "tbz x0, #0, 47f\n"
+ "ld1 { v31.b }[2], [x14]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[0], [x13]\n"
+ "tbz x0, #0, 47f\n"
+ "ld1 { v31.b }[0], [x14]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
- "ldr x21, [x25, #0x98]\n"
- "usubl v31.8h, v31.8b, v7.8b\n"
- "smlal v11.4s, v31.4h, v0.4h\n"
- "smlal2 v8.4s, v31.8h, v0.8h\n"
- "add x21, x21, x10\n"
- "tbz x4, #2, 49f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 48f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "ldr x15, [x20, #0x98]\n"
+ "smlal v10.4s, v31.4h, v0.4h\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
+ "add x15, x15, x24\n"
+ "tbz x0, #2, 49f\n"
+ "ld1 { v30.s }[0], [x15], #0x4\n"
+ "tbz x0, #1, 48f\n"
+ "ld1 { v30.h }[2], [x15], #0x2\n"
+ "tbz x0, #0, 51f\n"
+ "ld1 { v30.b }[6], [x15]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "tbz x0, #0, 51f\n"
+ "ld1 { v30.b }[4], [x15]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x4, #1, 50f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "tbz x0, #1, 50f\n"
+ "ld1 { v30.h }[0], [x15], #0x2\n"
+ "tbz x0, #0, 51f\n"
+ "ld1 { v30.b }[2], [x15]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "tbz x0, #0, 51f\n"
+ "ld1 { v30.b }[0], [x15]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "ldr d1, [x3, #0x58]\n"
- "usubl v30.8h, v30.8b, v7.8b\n"
- "smlal v10.4s, v30.4h, v0.4h\n"
- "ldr x14, [x25, #0xa0]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "smlal2 v9.4s, v30.8h, v0.8h\n"
- "add x14, x14, x10\n"
+ "ldr d1, [x23, #0x58]\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "ldr x21, [x20, #0xa0]\n"
+ "smlal v6.4s, v30.4h, v0.4h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "add x21, x21, x24\n"
"smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v20.4s, v25.8h, v1.8h\n"
- "smlal v18.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "smlal v11.4s, v30.4h, v1.4h\n"
- "smlal2 v8.4s, v30.8h, v1.8h\n"
- "tbz x4, #2, 53f\n"
- "ld1 { v26.s }[0], [x14], #0x4\n"
- "tbz x4, #1, 52f\n"
- "ld1 { v26.h }[2], [x14], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[6], [x14]\n"
+ "smlal2 v16.4s, v25.8h, v1.8h\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "smlal v10.4s, v30.4h, v1.4h\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "tbz x0, #2, 53f\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "tbz x0, #1, 52f\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "tbz x0, #0, 55f\n"
+ "ld1 { v26.b }[6], [x21]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[4], [x14]\n"
+ "tbz x0, #0, 55f\n"
+ "ld1 { v26.b }[4], [x21]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x4, #1, 54f\n"
- "ld1 { v26.h }[0], [x14], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[2], [x14]\n"
+ "tbz x0, #1, 54f\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "tbz x0, #0, 55f\n"
+ "ld1 { v26.b }[2], [x21]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[0], [x14]\n"
+ "tbz x0, #0, 55f\n"
+ "ld1 { v26.b }[0], [x21]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "ldr d2, [x3, #0x60]\n"
- "usubl v26.8h, v26.8b, v7.8b\n"
- "smlal v10.4s, v26.4h, v1.4h\n"
- "ldr x11, [x25, #0xa8]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "smlal2 v9.4s, v26.8h, v1.8h\n"
- "add x11, x11, x10\n"
+ "ldr d2, [x23, #0x60]\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "ldr x2, [x20, #0xa8]\n"
+ "smlal v6.4s, v26.4h, v1.4h\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "add x2, x2, x24\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v20.4s, v24.8h, v2.8h\n"
- "smlal v18.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "smlal v11.4s, v26.4h, v2.4h\n"
- "smlal2 v8.4s, v26.8h, v2.8h\n"
- "tbz x4, #2, 57f\n"
- "ld1 { v25.s }[0], [x11], #0x4\n"
- "tbz x4, #1, 56f\n"
- "ld1 { v25.h }[2], [x11], #0x2\n"
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[6], [x11]\n"
+ "smlal2 v16.4s, v24.8h, v2.8h\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "smlal v10.4s, v26.4h, v2.4h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "tbz x0, #2, 57f\n"
+ "ld1 { v25.s }[0], [x2], #0x4\n"
+ "tbz x0, #1, 56f\n"
+ "ld1 { v25.h }[2], [x2], #0x2\n"
+ "tbz x0, #0, 59f\n"
+ "ld1 { v25.b }[6], [x2]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[4], [x11]\n"
+ "tbz x0, #0, 59f\n"
+ "ld1 { v25.b }[4], [x2]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x4, #1, 58f\n"
- "ld1 { v25.h }[0], [x11], #0x2\n"
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[2], [x11]\n"
+ "tbz x0, #1, 58f\n"
+ "ld1 { v25.h }[0], [x2], #0x2\n"
+ "tbz x0, #0, 59f\n"
+ "ld1 { v25.b }[2], [x2]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[0], [x11]\n"
+ "tbz x0, #0, 59f\n"
+ "ld1 { v25.b }[0], [x2]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
- "ldr d3, [x3, #0x68]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal v10.4s, v25.4h, v2.4h\n"
- "ldr x24, [x25, #0xb0]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v9.4s, v25.8h, v2.8h\n"
- "add x24, x24, x10\n"
+ "ldr d3, [x23, #0x68]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "ldr x13, [x20, #0xb0]\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "add x13, x13, x24\n"
"smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "smlal v18.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "tbz x4, #2, 61f\n"
- "ld1 { v24.s }[0], [x24], #0x4\n"
- "tbz x4, #1, 60f\n"
- "ld1 { v24.h }[2], [x24], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[6], [x24]\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal2 v8.4s, v23.8h, v3.8h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "tbz x0, #2, 61f\n"
+ "ld1 { v24.s }[0], [x13], #0x4\n"
+ "tbz x0, #1, 60f\n"
+ "ld1 { v24.h }[2], [x13], #0x2\n"
+ "tbz x0, #0, 63f\n"
+ "ld1 { v24.b }[6], [x13]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[4], [x24]\n"
+ "tbz x0, #0, 63f\n"
+ "ld1 { v24.b }[4], [x13]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x4, #1, 62f\n"
- "ld1 { v24.h }[0], [x24], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[2], [x24]\n"
+ "tbz x0, #1, 62f\n"
+ "ld1 { v24.h }[0], [x13], #0x2\n"
+ "tbz x0, #0, 63f\n"
+ "ld1 { v24.b }[2], [x13]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[0], [x24]\n"
+ "tbz x0, #0, 63f\n"
+ "ld1 { v24.b }[0], [x13]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
- "ldr d4, [x3, #0x70]\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v24.4h, v3.4h\n"
- "ldr x0, [x25, #0xb8]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
- "add x0, x0, x10\n"
+ "ldr d4, [x23, #0x70]\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x9, [x20, #0xb8]\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "add x9, x9, x24\n"
"smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v20.4s, v23.8h, v4.8h\n"
- "smlal v18.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "tbz x4, #2, 65f\n"
- "ld1 { v22.s }[0], [x0], #0x4\n"
- "tbz x4, #1, 64f\n"
- "ld1 { v22.h }[2], [x0], #0x2\n"
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[6], [x0]\n"
+ "smlal2 v16.4s, v23.8h, v4.8h\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "tbz x0, #2, 65f\n"
+ "ld1 { v22.s }[0], [x9], #0x4\n"
+ "tbz x0, #1, 64f\n"
+ "ld1 { v22.h }[2], [x9], #0x2\n"
+ "tbz x0, #0, 67f\n"
+ "ld1 { v22.b }[6], [x9]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[4], [x0]\n"
+ "tbz x0, #0, 67f\n"
+ "ld1 { v22.b }[4], [x9]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
- "tbz x4, #1, 66f\n"
- "ld1 { v22.h }[0], [x0], #0x2\n"
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[2], [x0]\n"
+ "tbz x0, #1, 66f\n"
+ "ld1 { v22.h }[0], [x9], #0x2\n"
+ "tbz x0, #0, 67f\n"
+ "ld1 { v22.b }[2], [x9]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[0], [x0]\n"
+ "tbz x0, #0, 67f\n"
+ "ld1 { v22.b }[0], [x9]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
- "ldr d0, [x3, #0x78]\n"
- "usubl v22.8h, v22.8b, v7.8b\n"
- "smlal v10.4s, v22.4h, v4.4h\n"
- "ldr x15, [x25, #0xc0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "smlal2 v9.4s, v22.8h, v4.8h\n"
- "add x15, x15, x10\n"
+ "ldr d0, [x23, #0x78]\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "ldr x19, [x20, #0xc0]\n"
+ "smlal v6.4s, v22.4h, v4.4h\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "add x19, x19, x24\n"
"smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v20.4s, v31.8h, v0.8h\n"
- "smlal v18.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "tbz x4, #2, 69f\n"
- "ld1 { v27.s }[0], [x15], #0x4\n"
- "tbz x4, #1, 68f\n"
- "ld1 { v27.h }[2], [x15], #0x2\n"
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[6], [x15]\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "tbz x0, #2, 69f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x0, #1, 68f\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x0, #0, 71f\n"
+ "ld1 { v27.b }[6], [x19]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[4], [x15]\n"
+ "tbz x0, #0, 71f\n"
+ "ld1 { v27.b }[4], [x19]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x4, #1, 70f\n"
- "ld1 { v27.h }[0], [x15], #0x2\n"
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[2], [x15]\n"
+ "tbz x0, #1, 70f\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x0, #0, 71f\n"
+ "ld1 { v27.b }[2], [x19]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[0], [x15]\n"
+ "tbz x0, #0, 71f\n"
+ "ld1 { v27.b }[0], [x19]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
- "ldr x9, [x25, #0xc8]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal v11.4s, v27.4h, v0.4h\n"
- "smlal2 v8.4s, v27.8h, v0.8h\n"
- "add x9, x9, x10\n"
- "tbz x4, #2, 73f\n"
- "ld1 { v23.s }[0], [x9], #0x4\n"
- "tbz x4, #1, 72f\n"
- "ld1 { v23.h }[2], [x9], #0x2\n"
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[6], [x9]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "ldr x28, [x20, #0xc8]\n"
+ "smlal v10.4s, v27.4h, v0.4h\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
+ "add x28, x28, x24\n"
+ "tbz x0, #2, 73f\n"
+ "ld1 { v23.s }[0], [x28], #0x4\n"
+ "tbz x0, #1, 72f\n"
+ "ld1 { v23.h }[2], [x28], #0x2\n"
+ "tbz x0, #0, 75f\n"
+ "ld1 { v23.b }[6], [x28]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[4], [x9]\n"
+ "tbz x0, #0, 75f\n"
+ "ld1 { v23.b }[4], [x28]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x4, #1, 74f\n"
- "ld1 { v23.h }[0], [x9], #0x2\n"
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[2], [x9]\n"
+ "tbz x0, #1, 74f\n"
+ "ld1 { v23.h }[0], [x28], #0x2\n"
+ "tbz x0, #0, 75f\n"
+ "ld1 { v23.b }[2], [x28]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[0], [x9]\n"
+ "tbz x0, #0, 75f\n"
+ "ld1 { v23.b }[0], [x28]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
- "ldr d1, [x3, #0x80]\n"
- "usubl v23.8h, v23.8b, v7.8b\n"
- "smlal v10.4s, v23.4h, v0.4h\n"
- "ldr x27, [x25, #0xd0]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "smlal2 v9.4s, v23.8h, v0.8h\n"
- "add x27, x27, x10\n"
+ "ldr d1, [x23, #0x80]\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "ldr x6, [x20, #0xd0]\n"
+ "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "add x6, x6, x24\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v20.4s, v30.8h, v1.8h\n"
- "smlal v18.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "smlal v11.4s, v23.4h, v1.4h\n"
- "smlal2 v8.4s, v23.8h, v1.8h\n"
- "tbz x4, #2, 77f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "tbz x4, #1, 76f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[6], [x27]\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal2 v8.4s, v26.8h, v1.8h\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "tbz x0, #2, 77f\n"
+ "ld1 { v31.s }[0], [x6], #0x4\n"
+ "tbz x0, #1, 76f\n"
+ "ld1 { v31.h }[2], [x6], #0x2\n"
+ "tbz x0, #0, 79f\n"
+ "ld1 { v31.b }[6], [x6]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[4], [x27]\n"
+ "tbz x0, #0, 79f\n"
+ "ld1 { v31.b }[4], [x6]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x4, #1, 78f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[2], [x27]\n"
+ "tbz x0, #1, 78f\n"
+ "ld1 { v31.h }[0], [x6], #0x2\n"
+ "tbz x0, #0, 79f\n"
+ "ld1 { v31.b }[2], [x6]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[0], [x27]\n"
+ "tbz x0, #0, 79f\n"
+ "ld1 { v31.b }[0], [x6]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
- "ldr d2, [x3, #0x88]\n"
- "usubl v31.8h, v31.8b, v7.8b\n"
- "smlal v10.4s, v31.4h, v1.4h\n"
- "ldr x28, [x25, #0xd8]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "add x28, x28, x10\n"
+ "ldr d2, [x23, #0x88]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "ldr x27, [x20, #0xd8]\n"
+ "smlal v6.4s, v31.4h, v1.4h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "add x27, x27, x24\n"
"smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v20.4s, v26.8h, v2.8h\n"
- "smlal v18.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v8.4s, v31.8h, v2.8h\n"
- "tbz x4, #2, 81f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "tbz x4, #1, 80f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "smlal2 v16.4s, v26.8h, v2.8h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "tbz x0, #2, 81f\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "tbz x0, #1, 80f\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "tbz x0, #0, 83f\n"
+ "ld1 { v30.b }[6], [x27]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "tbz x0, #0, 83f\n"
+ "ld1 { v30.b }[4], [x27]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x4, #1, 82f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "tbz x0, #1, 82f\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "tbz x0, #0, 83f\n"
+ "ld1 { v30.b }[2], [x27]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "tbz x0, #0, 83f\n"
+ "ld1 { v30.b }[0], [x27]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
- "ldr d3, [x3, #0x90]\n"
- "usubl v30.8h, v30.8b, v7.8b\n"
- "smlal v10.4s, v30.4h, v2.4h\n"
- "ldr x12, [x25, #0xe0]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v9.4s, v30.8h, v2.8h\n"
- "add x12, x12, x10\n"
+ "ldr d3, [x23, #0x90]\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "ldr x11, [x20, #0xe0]\n"
+ "smlal v6.4s, v30.4h, v2.4h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "add x11, x11, x24\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v20.4s, v25.8h, v3.8h\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "smlal v11.4s, v30.4h, v3.4h\n"
- "smlal2 v8.4s, v30.8h, v3.8h\n"
- "tbz x4, #2, 85f\n"
- "ld1 { v28.s }[0], [x12], #0x4\n"
- "tbz x4, #1, 84f\n"
- "ld1 { v28.h }[2], [x12], #0x2\n"
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[6], [x12]\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "tbz x0, #2, 85f\n"
+ "ld1 { v28.s }[0], [x11], #0x4\n"
+ "tbz x0, #1, 84f\n"
+ "ld1 { v28.h }[2], [x11], #0x2\n"
+ "tbz x0, #0, 87f\n"
+ "ld1 { v28.b }[6], [x11]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[4], [x12]\n"
+ "tbz x0, #0, 87f\n"
+ "ld1 { v28.b }[4], [x11]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x4, #1, 86f\n"
- "ld1 { v28.h }[0], [x12], #0x2\n"
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[2], [x12]\n"
+ "tbz x0, #1, 86f\n"
+ "ld1 { v28.h }[0], [x11], #0x2\n"
+ "tbz x0, #0, 87f\n"
+ "ld1 { v28.b }[2], [x11]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[0], [x12]\n"
+ "tbz x0, #0, 87f\n"
+ "ld1 { v28.b }[0], [x11]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
- "ldr d4, [x3, #0x98]\n"
- "usubl v28.8h, v28.8b, v7.8b\n"
- "smlal v10.4s, v28.4h, v3.4h\n"
- "ldr x7, [x25, #0xe8]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "smlal2 v9.4s, v28.8h, v3.8h\n"
- "add x7, x7, x10\n"
+ "ldr d4, [x23, #0x98]\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x17, [x20, #0xe8]\n"
+ "smlal v6.4s, v28.4h, v3.4h\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "add x17, x17, x24\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v20.4s, v24.8h, v4.8h\n"
- "smlal v18.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "smlal v11.4s, v28.4h, v4.4h\n"
- "smlal2 v8.4s, v28.8h, v4.8h\n"
- "tbz x4, #2, 89f\n"
- "ld1 { v26.s }[0], [x7], #0x4\n"
- "tbz x4, #1, 88f\n"
- "ld1 { v26.h }[2], [x7], #0x2\n"
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[6], [x7]\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal2 v8.4s, v22.8h, v4.8h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "smlal2 v7.4s, v28.8h, v4.8h\n"
+ "tbz x0, #2, 89f\n"
+ "ld1 { v26.s }[0], [x17], #0x4\n"
+ "tbz x0, #1, 88f\n"
+ "ld1 { v26.h }[2], [x17], #0x2\n"
+ "tbz x0, #0, 91f\n"
+ "ld1 { v26.b }[6], [x17]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[4], [x7]\n"
+ "tbz x0, #0, 91f\n"
+ "ld1 { v26.b }[4], [x17]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
- "tbz x4, #1, 90f\n"
- "ld1 { v26.h }[0], [x7], #0x2\n"
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[2], [x7]\n"
+ "tbz x0, #1, 90f\n"
+ "ld1 { v26.h }[0], [x17], #0x2\n"
+ "tbz x0, #0, 91f\n"
+ "ld1 { v26.b }[2], [x17]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[0], [x7]\n"
+ "tbz x0, #0, 91f\n"
+ "ld1 { v26.b }[0], [x17]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
- "ldr d0, [x3, #0xa0]\n"
- "usubl v26.8h, v26.8b, v7.8b\n"
- "smlal v10.4s, v26.4h, v4.4h\n"
- "ldr x26, [x25, #0xf0]\n"
- "usubl v0.8h, v0.8b, v13.8b\n"
- "smlal2 v9.4s, v26.8h, v4.8h\n"
- "add x26, x26, x10\n"
+ "ldr d0, [x23, #0xa0]\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "usubl v0.8h, v0.8b, v14.8b\n"
+ "ldr x5, [x20, #0xf0]\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "add x5, x5, x24\n"
"smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v20.4s, v27.8h, v0.8h\n"
- "smlal v18.4s, v23.4h, v0.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "tbz x4, #2, 93f\n"
- "ld1 { v25.s }[0], [x26], #0x4\n"
- "tbz x4, #1, 92f\n"
- "ld1 { v25.h }[2], [x26], #0x2\n"
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[6], [x26]\n"
+ "smlal2 v16.4s, v27.8h, v0.8h\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "smlal2 v8.4s, v23.8h, v0.8h\n"
+ "tbz x0, #2, 93f\n"
+ "ld1 { v25.s }[0], [x5], #0x4\n"
+ "tbz x0, #1, 92f\n"
+ "ld1 { v25.h }[2], [x5], #0x2\n"
+ "tbz x0, #0, 95f\n"
+ "ld1 { v25.b }[6], [x5]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[4], [x26]\n"
+ "tbz x0, #0, 95f\n"
+ "ld1 { v25.b }[4], [x5]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
- "tbz x4, #1, 94f\n"
- "ld1 { v25.h }[0], [x26], #0x2\n"
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[2], [x26]\n"
+ "tbz x0, #1, 94f\n"
+ "ld1 { v25.h }[0], [x5], #0x2\n"
+ "tbz x0, #0, 95f\n"
+ "ld1 { v25.b }[2], [x5]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[0], [x26]\n"
+ "tbz x0, #0, 95f\n"
+ "ld1 { v25.b }[0], [x5]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
- "ldr x23, [x25, #0xf8]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
- "smlal2 v8.4s, v25.8h, v0.8h\n"
- "add x23, x23, x10\n"
- "tbz x4, #2, 97f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 96f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "ldr x25, [x20, #0xf8]\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "smlal2 v7.4s, v25.8h, v0.8h\n"
+ "add x25, x25, x24\n"
+ "tbz x0, #2, 97f\n"
+ "ld1 { v24.s }[0], [x25], #0x4\n"
+ "tbz x0, #1, 96f\n"
+ "ld1 { v24.h }[2], [x25], #0x2\n"
+ "tbz x0, #0, 99f\n"
+ "ld1 { v24.b }[6], [x25]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "tbz x0, #0, 99f\n"
+ "ld1 { v24.b }[4], [x25]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
- "tbz x4, #1, 98f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "tbz x0, #1, 98f\n"
+ "ld1 { v24.h }[0], [x25], #0x2\n"
+ "tbz x0, #0, 99f\n"
+ "ld1 { v24.b }[2], [x25]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "tbz x0, #0, 99f\n"
+ "ld1 { v24.b }[0], [x25]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
- "ldr d1, [x3, #0xa8]\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v24.4h, v0.4h\n"
- "ldr x22, [x25, #0x100]\n"
- "usubl v1.8h, v1.8b, v13.8b\n"
- "smlal2 v9.4s, v24.8h, v0.8h\n"
- "add x22, x22, x10\n"
+ "ldr d1, [x23, #0xa8]\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "usubl v1.8h, v1.8b, v14.8b\n"
+ "ldr x26, [x20, #0x100]\n"
+ "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
+ "add x26, x26, x24\n"
"smlal v15.4s, v23.4h, v1.4h\n"
- "smlal2 v20.4s, v23.8h, v1.8h\n"
- "smlal v18.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "smlal2 v8.4s, v24.8h, v1.8h\n"
- "tbz x4, #2, 101f\n"
- "ld1 { v27.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 100f\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[6], [x22]\n"
+ "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v8.4s, v31.8h, v1.8h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "tbz x0, #2, 101f\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "tbz x0, #1, 100f\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "tbz x0, #0, 103f\n"
+ "ld1 { v27.b }[6], [x26]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[4], [x22]\n"
+ "tbz x0, #0, 103f\n"
+ "ld1 { v27.b }[4], [x26]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
- "tbz x4, #1, 102f\n"
- "ld1 { v27.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[2], [x22]\n"
+ "tbz x0, #1, 102f\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "tbz x0, #0, 103f\n"
+ "ld1 { v27.b }[2], [x26]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[0], [x22]\n"
+ "tbz x0, #0, 103f\n"
+ "ld1 { v27.b }[0], [x26]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
- "ldr d2, [x3, #0xb0]\n"
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal v10.4s, v27.4h, v1.4h\n"
- "ldr x20, [x25, #0x108]\n"
- "usubl v2.8h, v2.8b, v13.8b\n"
- "smlal2 v9.4s, v27.8h, v1.8h\n"
- "add x20, x20, x10\n"
+ "ldr d2, [x23, #0xb0]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "usubl v2.8h, v2.8b, v14.8b\n"
+ "ldr x12, [x20, #0x108]\n"
+ "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "add x12, x12, x24\n"
"smlal v15.4s, v31.4h, v2.4h\n"
- "smlal2 v20.4s, v31.8h, v2.8h\n"
- "smlal v18.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "smlal v11.4s, v27.4h, v2.4h\n"
- "smlal2 v8.4s, v27.8h, v2.8h\n"
- "tbz x4, #2, 105f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 104f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal2 v8.4s, v30.8h, v2.8h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "tbz x0, #2, 105f\n"
+ "ld1 { v25.s }[0], [x12], #0x4\n"
+ "tbz x0, #1, 104f\n"
+ "ld1 { v25.h }[2], [x12], #0x2\n"
+ "tbz x0, #0, 107f\n"
+ "ld1 { v25.b }[6], [x12]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "tbz x0, #0, 107f\n"
+ "ld1 { v25.b }[4], [x12]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
- "tbz x4, #1, 106f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "tbz x0, #1, 106f\n"
+ "ld1 { v25.h }[0], [x12], #0x2\n"
+ "tbz x0, #0, 107f\n"
+ "ld1 { v25.b }[2], [x12]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "tbz x0, #0, 107f\n"
+ "ld1 { v25.b }[0], [x12]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
- "ldr d3, [x3, #0xb8]\n"
- "usubl v25.8h, v25.8b, v7.8b\n"
- "smlal v10.4s, v25.4h, v2.4h\n"
- "ldr x13, [x25, #0x110]\n"
- "usubl v3.8h, v3.8b, v13.8b\n"
- "smlal2 v9.4s, v25.8h, v2.8h\n"
- "add x13, x13, x10\n"
+ "ldr d3, [x23, #0xb8]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "usubl v3.8h, v3.8b, v14.8b\n"
+ "ldr x14, [x20, #0x110]\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "add x14, x14, x24\n"
"smlal v15.4s, v30.4h, v3.4h\n"
- "smlal2 v20.4s, v30.8h, v3.8h\n"
- "smlal v18.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "smlal v11.4s, v25.4h, v3.4h\n"
- "smlal2 v8.4s, v25.8h, v3.8h\n"
- "tbz x4, #2, 109f\n"
- "ld1 { v24.s }[0], [x13], #0x4\n"
- "tbz x4, #1, 108f\n"
- "ld1 { v24.h }[2], [x13], #0x2\n"
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[6], [x13]\n"
+ "smlal2 v16.4s, v30.8h, v3.8h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "tbz x0, #2, 109f\n"
+ "ld1 { v24.s }[0], [x14], #0x4\n"
+ "tbz x0, #1, 108f\n"
+ "ld1 { v24.h }[2], [x14], #0x2\n"
+ "tbz x0, #0, 111f\n"
+ "ld1 { v24.b }[6], [x14]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[4], [x13]\n"
+ "tbz x0, #0, 111f\n"
+ "ld1 { v24.b }[4], [x14]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
- "tbz x4, #1, 110f\n"
- "ld1 { v24.h }[0], [x13], #0x2\n"
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[2], [x13]\n"
+ "tbz x0, #1, 110f\n"
+ "ld1 { v24.h }[0], [x14], #0x2\n"
+ "tbz x0, #0, 111f\n"
+ "ld1 { v24.b }[2], [x14]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[0], [x13]\n"
+ "tbz x0, #0, 111f\n"
+ "ld1 { v24.b }[0], [x14]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
- "ldr d4, [x3, #0xc0]\n"
- "usubl v24.8h, v24.8b, v7.8b\n"
- "smlal v10.4s, v24.4h, v3.4h\n"
- "ldr x21, [x25, #0x118]\n"
- "usubl v4.8h, v4.8b, v13.8b\n"
- "smlal2 v9.4s, v24.8h, v3.8h\n"
- "add x21, x21, x10\n"
+ "ldr d4, [x23, #0xc0]\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "usubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x21, [x20, #0x118]\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "add x21, x21, x24\n"
"smlal v15.4s, v28.4h, v4.4h\n"
- "smlal2 v20.4s, v28.8h, v4.8h\n"
- "smlal v18.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "smlal v11.4s, v24.4h, v4.4h\n"
- "smlal2 v8.4s, v24.8h, v4.8h\n"
- "tbz x4, #2, 113f\n"
+ "smlal2 v16.4s, v28.8h, v4.8h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "tbz x0, #2, 113f\n"
"ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 112f\n"
+ "tbz x0, #1, 112f\n"
"ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[6], [x21]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[4], [x21]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
- "tbz x4, #1, 114f\n"
+ "tbz x0, #1, 114f\n"
"ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[2], [x21]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[0], [x21]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
- "usubl v27.8h, v27.8b, v7.8b\n"
- "smlal v10.4s, v27.4h, v4.4h\n"
- "smlal2 v9.4s, v27.8h, v4.8h\n"
- "tbz x4, #2, 117f\n"
- "ld1 { v6.4s }, [x2], #0x10\n"
- "ld1 { v21.4s }, [x5], #0x10\n"
- "tbz x4, #1, 116f\n"
- "ld1 { v17.d }[0], [x2], #0x8\n"
- "ld1 { v14.d }[0], [x5], #0x8\n"
- "tbz x4, #0, 119f\n"
- "ld1 { v17.s }[2], [x2]\n"
- "ld1 { v14.s }[2], [x5]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
+ "smlal2 v5.4s, v27.8h, v4.8h\n"
+ "tbz x0, #2, 117f\n"
+ "ld1 { v12.4s }, [x10], #0x10\n"
+ "ld1 { v19.4s }, [x1], #0x10\n"
+ "tbz x0, #1, 116f\n"
+ "ld1 { v20.d }[0], [x10], #0x8\n"
+ "ld1 { v29.d }[0], [x1], #0x8\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v20.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x1]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x4, #0, 119f\n"
- "ld1 { v17.s }[0], [x2]\n"
- "ld1 { v14.s }[0], [x5]\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v20.s }[0], [x10]\n"
+ "ld1 { v29.s }[0], [x1]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x4, #1, 118f\n"
- "ld1 { v6.d }[0], [x2], #0x8\n"
- "ld1 { v21.d }[0], [x5], #0x8\n"
- "tbz x4, #0, 119f\n"
- "ld1 { v6.s }[2], [x2]\n"
- "ld1 { v21.s }[2], [x5]\n"
+ "tbz x0, #1, 118f\n"
+ "ld1 { v12.d }[0], [x10], #0x8\n"
+ "ld1 { v19.d }[0], [x1], #0x8\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v12.s }[2], [x10]\n"
+ "ld1 { v19.s }[2], [x1]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 119f\n"
- "ld1 { v6.s }[0], [x2]\n"
- "ld1 { v21.s }[0], [x5]\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v12.s }[0], [x10]\n"
+ "ld1 { v19.s }[0], [x1]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v6.4s\n"
- "add x17, x17, x1\n"
- "sqrdmulh v20.4s, v20.4s, v17.4s\n"
- "add x16, x16, x1\n"
- "sqrdmulh v18.4s, v18.4s, v6.4s\n"
- "add x6, x6, x1\n"
- "sqrdmulh v5.4s, v5.4s, v17.4s\n"
- "add x8, x8, x1\n"
- "sqrdmulh v11.4s, v11.4s, v6.4s\n"
- "and v1.16b, v15.16b, v21.16b\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "and v29.16b, v20.16b, v14.16b\n"
- "and v3.16b, v18.16b, v21.16b\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "and v2.16b, v5.16b, v14.16b\n"
- "and v0.16b, v11.16b, v21.16b\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "sqrdmulh v8.4s, v8.4s, v17.4s\n"
+ "sqdmulh v15.4s, v15.4s, v12.4s\n"
+ "sqdmulh v17.4s, v17.4s, v12.4s\n"
+ "add x16, x16, x22\n"
+ "add x8, x8, x22\n"
+ "sqdmulh v10.4s, v10.4s, v12.4s\n"
+ "sqdmulh v6.4s, v6.4s, v12.4s\n"
+ "add x4, x4, x22\n"
+ "add x7, x7, x22\n"
+ "and v23.16b, v15.16b, v19.16b\n"
+ "sqdmulh v16.4s, v16.4s, v20.4s\n"
+ "and v22.16b, v17.16b, v19.16b\n"
+ "sqdmulh v8.4s, v8.4s, v20.4s\n"
+ "and v21.16b, v10.16b, v19.16b\n"
+ "sqdmulh v7.4s, v7.4s, v20.4s\n"
+ "and v26.16b, v6.16b, v19.16b\n"
+ "sqdmulh v5.4s, v5.4s, v20.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v2.16b, v8.16b, v29.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v7.16b, v29.16b\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "and v25.16b, v5.16b, v29.16b\n"
+ "sqadd v15.4s, v15.4s, v23.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
"sshr v2.4s, v2.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v1.4s\n"
- "sqrdmulh v10.4s, v10.4s, v6.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sqrdmulh v9.4s, v9.4s, v17.4s\n"
- "sqadd v20.4s, v20.4s, v29.4s\n"
- "sqadd v18.4s, v18.4s, v3.4s\n"
- "srshl v15.4s, v15.4s, v21.4s\n"
- "sqadd v5.4s, v5.4s, v2.4s\n"
- "srshl v20.4s, v20.4s, v14.4s\n"
- "srshl v18.4s, v18.4s, v21.4s\n"
- "add v15.4s, v15.4s, v19.4s\n"
- "srshl v5.4s, v5.4s, v14.4s\n"
- "add v20.4s, v20.4s, v19.4s\n"
- "smin v15.4s, v15.4s, v12.4s\n"
- "add v18.4s, v18.4s, v19.4s\n"
- "smin v20.4s, v20.4s, v12.4s\n"
- "smax v15.4s, v15.4s, v16.4s\n"
- "smin v18.4s, v18.4s, v12.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "add v5.4s, v5.4s, v19.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "uzp1 v15.16b, v15.16b, v20.16b\n"
- "smin v5.4s, v5.4s, v12.4s\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v17.4s, v17.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v2.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v7.4s, v7.4s, v3.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "sqadd v5.4s, v5.4s, v25.4s\n"
+ "srshl v16.4s, v16.4s, v29.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v7.4s, v7.4s, v29.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v15.8h, v16.4s\n"
+ "sqxtn2 v17.8h, v8.4s\n"
+ "sqxtn2 v10.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v15.8h, v15.8h, v18.8h\n"
+ "sqadd v17.8h, v17.8h, v18.8h\n"
+ "sqadd v10.8h, v10.8h, v18.8h\n"
+ "sqadd v6.8h, v6.8h, v18.8h\n"
+ "smax v15.8h, v15.8h, v11.8h\n"
+ "smax v17.8h, v17.8h, v11.8h\n"
+ "smax v10.8h, v10.8h, v11.8h\n"
+ "smax v6.8h, v6.8h, v11.8h\n"
+ "smin v15.8h, v15.8h, v13.8h\n"
+ "smin v17.8h, v17.8h, v13.8h\n"
+ "smin v10.8h, v10.8h, v13.8h\n"
+ "smin v6.8h, v6.8h, v13.8h\n"
"uzp1 v15.16b, v15.16b, v15.16b\n"
- "sqadd v11.4s, v11.4s, v0.4s\n"
- "smax v5.4s, v5.4s, v16.4s\n"
- "and v27.16b, v8.16b, v14.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "uzp1 v18.16b, v18.16b, v5.16b\n"
- "srshl v11.4s, v11.4s, v21.4s\n"
- "and v30.16b, v10.16b, v21.16b\n"
- "sshr v30.4s, v30.4s, #0x1f\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
- "add v11.4s, v11.4s, v19.4s\n"
- "sqadd v8.4s, v8.4s, v27.4s\n"
- "and v6.16b, v9.16b, v14.16b\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "smin v11.4s, v11.4s, v12.4s\n"
- "srshl v8.4s, v8.4s, v14.4s\n"
- "sqadd v10.4s, v10.4s, v30.4s\n"
- "smax v11.4s, v11.4s, v16.4s\n"
- "add v8.4s, v8.4s, v19.4s\n"
- "srshl v10.4s, v10.4s, v21.4s\n"
- "sqadd v9.4s, v9.4s, v6.4s\n"
- "smin v8.4s, v8.4s, v12.4s\n"
- "add v10.4s, v10.4s, v19.4s\n"
- "srshl v9.4s, v9.4s, v14.4s\n"
- "smax v8.4s, v8.4s, v16.4s\n"
- "smin v10.4s, v10.4s, v12.4s\n"
- "uzp1 v11.16b, v11.16b, v8.16b\n"
- "add v9.4s, v9.4s, v19.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "smax v10.4s, v10.4s, v16.4s\n"
- "smin v9.4s, v9.4s, v12.4s\n"
- "smax v9.4s, v9.4s, v16.4s\n"
- "uzp1 v10.16b, v10.16b, v9.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
"uzp1 v10.16b, v10.16b, v10.16b\n"
- "tbz x4, #2, 121f\n"
- "st1 { v15.s }[0], [x17], #0x4\n"
- "st1 { v18.s }[0], [x16], #0x4\n"
- "st1 { v11.s }[0], [x6], #0x4\n"
- "st1 { v10.s }[0], [x8], #0x4\n"
- "tbz x4, #1, 120f\n"
- "st1 { v15.h }[2], [x17], #0x2\n"
- "st1 { v18.h }[2], [x16], #0x2\n"
- "st1 { v11.h }[2], [x6], #0x2\n"
- "st1 { v10.h }[2], [x8], #0x2\n"
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[6], [x17], #0x1\n"
- "st1 { v18.b }[6], [x16], #0x1\n"
- "st1 { v11.b }[6], [x6], #0x1\n"
- "st1 { v10.b }[6], [x8], #0x1\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "tbz x0, #2, 121f\n"
+ "st1 { v15.s }[0], [x16], #0x4\n"
+ "st1 { v17.s }[0], [x8], #0x4\n"
+ "st1 { v10.s }[0], [x4], #0x4\n"
+ "st1 { v6.s }[0], [x7], #0x4\n"
+ "tbz x0, #1, 120f\n"
+ "st1 { v15.h }[2], [x16], #0x2\n"
+ "st1 { v17.h }[2], [x8], #0x2\n"
+ "st1 { v10.h }[2], [x4], #0x2\n"
+ "st1 { v6.h }[2], [x7], #0x2\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[6], [x16], #0x1\n"
+ "st1 { v17.b }[6], [x8], #0x1\n"
+ "st1 { v10.b }[6], [x4], #0x1\n"
+ "st1 { v6.b }[6], [x7], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[4], [x17], #0x1\n"
- "st1 { v18.b }[4], [x16], #0x1\n"
- "st1 { v11.b }[4], [x6], #0x1\n"
- "st1 { v10.b }[4], [x8], #0x1\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[4], [x16], #0x1\n"
+ "st1 { v17.b }[4], [x8], #0x1\n"
+ "st1 { v10.b }[4], [x4], #0x1\n"
+ "st1 { v6.b }[4], [x7], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
- "tbz x4, #1, 122f\n"
- "st1 { v15.h }[0], [x17], #0x2\n"
- "st1 { v18.h }[0], [x16], #0x2\n"
- "st1 { v11.h }[0], [x6], #0x2\n"
- "st1 { v10.h }[0], [x8], #0x2\n"
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[2], [x17], #0x1\n"
- "st1 { v18.b }[2], [x16], #0x1\n"
- "st1 { v11.b }[2], [x6], #0x1\n"
- "st1 { v10.b }[2], [x8], #0x1\n"
+ "tbz x0, #1, 122f\n"
+ "st1 { v15.h }[0], [x16], #0x2\n"
+ "st1 { v17.h }[0], [x8], #0x2\n"
+ "st1 { v10.h }[0], [x4], #0x2\n"
+ "st1 { v6.h }[0], [x7], #0x2\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[2], [x16], #0x1\n"
+ "st1 { v17.b }[2], [x8], #0x1\n"
+ "st1 { v10.b }[2], [x4], #0x1\n"
+ "st1 { v6.b }[2], [x7], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[0], [x17], #0x1\n"
- "st1 { v18.b }[0], [x16], #0x1\n"
- "st1 { v11.b }[0], [x6], #0x1\n"
- "st1 { v10.b }[0], [x8], #0x1\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[0], [x16], #0x1\n"
+ "st1 { v17.b }[0], [x8], #0x1\n"
+ "st1 { v10.b }[0], [x4], #0x1\n"
+ "st1 { v6.b }[0], [x7], #0x1\n"
"123:" // Oddments: Bit 2: End
-
"124:" // End
-
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
index f5459c2ac1..b859978b1e 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,28 +28,23 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
-struct a64_u8q_nhwc_generic_output9_mla_depthfirst
+class a64_u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- constexpr static unsigned int n_output_points = 9;
+ KernelType kernel = a64_u8q_nhwc_generic_output9_mla_depthfirst_impl;
- kern_type kernel = a64_u8q_nhwc_generic_output9_mla_depthfirst_impl;
+ public:
+ a64_u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>(9, arm_gemm::VLType::None) {}
- a64_u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+ KernelType get_kernel() const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+#endif // defined(__aarch64__)
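
This header shows the pattern the patch applies across the kernel headers: plain structs carrying typedefs and constexpr statics become classes derived from a small strategy hierarchy, so the depthwise driver can select and drive kernels through virtual accessors instead of duplicating the same plumbing in every header. A sketch of the base-class shape implied by the usage above; this is inferred from the constructor arguments and overrides, not copied from the patch, so the real GenericDepthfirstKernelStrategy may differ in detail (the stub arm_gemm declarations exist only to keep the sketch self-contained):

#include <cstdint>

namespace arm_gemm {
enum class VLType { None, SVE };  // stub standing in for the real arm_gemm enum
struct Requantize32;              // opaque here; defined in arm_gemm
}

template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
class GenericDepthfirstKernelStrategy
{
    unsigned int m_n_output_points;
    arm_gemm::VLType m_vl_type;

public:
    GenericDepthfirstKernelStrategy(unsigned int n_output_points, arm_gemm::VLType vl_type)
        : m_n_output_points(n_output_points), m_vl_type(vl_type)
    {
    }
    virtual ~GenericDepthfirstKernelStrategy() = default;

    // The signature previously spelt out per-kernel as `kern_type` (shown for
    // the quantised u8q case; TAccum plays the old bias_type role).
    using KernelType = void (*)(const TInput *const *const, TOutput *const *const,
                                const void *, const arm_gemm::Requantize32 &,
                                const unsigned int, const unsigned int);

    unsigned int get_n_output_points() const { return m_n_output_points; }
    arm_gemm::VLType get_vl_type() const { return m_vl_type; }
    virtual KernelType get_kernel() const = 0;
};

Keeping `kernel` as a plain data member with get_kernel() as the virtual hook reduces each per-kernel header to one constructor call plus one override, which is the duplication reduction the commit message describes.
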
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
index e8ac603928..134f657fb8 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,39 +28,33 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-struct a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+struct a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef uint32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 4;
-
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 9;
- constexpr static unsigned int input_col_quads = 1;
+ a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
- kern_type kernel = a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
- a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+#endif // defined(__aarch64__)
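
Note where the deleted constants went: output_rows/output_cols become the `Parent(2, 4, ...)` constructor arguments, while input_rows = 5 and input_cols = 9 vanish entirely, because the input extent of a depthfirst tile follows from the other three values as (output - 1) * stride + kernel. A sketch of that identity (the helper name input_extent is illustrative; the parent class presumably derives the equivalent internally):

// One-dimensional input extent needed to produce `output` points with the
// given stride and kernel size.
constexpr unsigned int input_extent(unsigned int output, unsigned int stride, unsigned int kernel)
{
    return (output - 1) * stride + kernel;
}

// The values this 3x3/s2 kernel used to hard-code:
static_assert(input_extent(2, 2, 3) == 5, "matches the deleted input_rows");
static_assert(input_extent(4, 2, 3) == 9, "matches the deleted input_cols");

The same identity accounts for the input_rows = 8 and input_cols = 6 constants removed from the 5x5/s1 kernel in the next file.
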
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
index c5e0417c20..b575a5d169 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,39 +28,33 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-struct a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+struct a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef uint32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 4;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 8;
- constexpr static unsigned int input_cols = 6;
- constexpr static unsigned int input_col_quads = 1;
+ a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
- kern_type kernel = a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }
- a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 6b52017ce1..13f903b95d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,31 +28,25 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
-struct a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+struct a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- constexpr static unsigned int output_rows(void) { return 2; };
- constexpr static unsigned int output_cols(void) { return 8; };
-
- constexpr static unsigned int output_col_regs(void) { return 2; };
-
- kern_type kernel = a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
-
- a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..2d2b452630
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
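
The constructor arguments Parent(2, 2, 3, 3, 1, 1) follow the same order as in the multiplier strategies above (output rows/cols, kernel rows/cols, stride rows/cols), so by the (output - 1) * stride + kernel identity this kernel consumes a (2 - 1) * 1 + 3 = 4 by 4 input patch; that is why the Params structure in the implementation file below carries exactly 16 input pointers.
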
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..2410d38512
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1164 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[16];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
+ inptrs[0] = inptrs_raw[5];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[3];
+ inptrs[3] = inptrs_raw[6];
+ inptrs[4] = inptrs_raw[9];
+ inptrs[5] = inptrs_raw[12];
+ inptrs[6] = inptrs_raw[15];
+ inptrs[7] = inptrs_raw[1];
+ inptrs[8] = inptrs_raw[2];
+ inptrs[9] = inptrs_raw[10];
+ inptrs[10] = inptrs_raw[4];
+ inptrs[11] = inptrs_raw[7];
+ inptrs[12] = inptrs_raw[8];
+ inptrs[13] = inptrs_raw[11];
+ inptrs[14] = inptrs_raw[13];
+ inptrs[15] = inptrs_raw[14];
+
+ }
+ };
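+ // Note: this permutation appears to pair the first loaded input with the
+ // centre tap of the 3x3 kernel (inptrs_raw[5] is row 1, column 1 of the
+ // 4x4 input patch), matching the "smlal v12.4s, v31.4h, v4.4h" that opens
+ // the main loop below.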
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x19, %[offsetof_Requantize32_c_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x19, %[offsetof_Requantize32_minval]\n"
+ "add x19, x19, %[offsetof_Requantize32_maxval]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "ld1r { v15.16b }, [x23]\n"
+ "ld1r { v13.8h }, [x22]\n"
+ "lsr x16, x8, #0x3\n"
+ "mov x15, #0x0\n"
+ "ld1r { v11.8h }, [x20]\n"
+ "ld1r { v25.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "add x13, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x10, x9, [x21, #0x0]\n"
+ "ldp x28, x27, [x21, #0x10]\n"
+ "cbz x16, 3f\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q12, [x19, #0x0]\n"
+ "subs x16, x16, #0x1\n"
+ "mov v14.16b, v12.16b\n"
+ "ldr q17, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v9.16b, v17.16b\n"
+ "mov v16.16b, v12.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v10.16b, v17.16b\n"
+ "mov v18.16b, v12.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v26.16b, v17.16b\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldp x23, x22, [x13, #0x0]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x19, [x13, #0x20]\n"
+ "ldr d31, [x23, x15]\n"
+ "usubl v5.8h, v5.8b, v15.8b\n"
+ "usubl v6.8h, v6.8b, v15.8b\n"
+ "ldr d30, [x22, x15]\n"
+ "ldr d29, [x21, x15]\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v8.8h, v8.8b, v15.8b\n"
+ "ldr d28, [x20, x15]\n"
+ "ldr d27, [x19, x15]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "smlal v12.4s, v31.4h, v4.4h\n"
+ "smlal2 v17.4s, v31.8h, v4.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x26, [x13, #0x38]\n"
+ "smlal v14.4s, v31.4h, v3.4h\n"
+ "smlal2 v9.4s, v31.8h, v3.8h\n"
+ "ldr x20, [x13, #0x30]\n"
+ "ldr x25, [x13, #0x40]\n"
+ "smlal v12.4s, v30.4h, v0.4h\n"
+ "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x13, #0x48]\n"
+ "ldr d30, [x19, x15]\n"
+ "smlal v14.4s, v29.4h, v2.4h\n"
+ "smlal2 v9.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x13, #0x50]\n"
+ "ldr x23, [x13, #0x58]\n"
+ "smlal v18.4s, v31.4h, v0.4h\n"
+ "smlal2 v26.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x15]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v12.4s, v28.4h, v5.4h\n"
+ "smlal2 v17.4s, v28.8h, v5.8h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x22, [x13, #0x60]\n"
+ "smlal v14.4s, v28.4h, v4.4h\n"
+ "smlal2 v9.4s, v28.8h, v4.8h\n"
+ "ldr x21, [x13, #0x68]\n"
+ "ldr x20, [x13, #0x70]\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "ldr x19, [x13, #0x78]\n"
+ "ldr q21, [x12, #0x0]\n"
+ "smlal v18.4s, v28.4h, v1.4h\n"
+ "smlal2 v26.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x15]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v12.4s, v27.4h, v7.4h\n"
+ "smlal2 v17.4s, v27.8h, v7.8h\n"
+ "ldr q24, [x11, #0x0]\n"
+ "ldr q19, [x12, #0x10]\n"
+ "smlal v14.4s, v27.4h, v6.4h\n"
+ "smlal2 v9.4s, v27.8h, v6.8h\n"
+ "ldr q23, [x11, #0x10]\n"
+ "add x17, x17, #0x48\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v10.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x15]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v18.4s, v27.4h, v3.4h\n"
+ "smlal2 v26.4s, v27.8h, v3.8h\n"
+ "subs x16, x16, #0x1\n"
+ "add x12, x12, #0x20\n"
+ "smlal v12.4s, v28.4h, v1.4h\n"
+ "smlal2 v17.4s, v28.8h, v1.8h\n"
+ "add x11, x11, #0x20\n"
+ "smlal v14.4s, v28.4h, v0.4h\n"
+ "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x15]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal v18.4s, v29.4h, v8.4h\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "smlal2 v26.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v12.4s, v31.4h, v2.4h\n"
+ "smlal2 v17.4s, v31.8h, v2.8h\n"
+ "smlal v14.4s, v31.4h, v1.4h\n"
+ "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x15]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal v18.4s, v30.4h, v4.4h\n"
+ "smlal v12.4s, v30.4h, v8.4h\n"
+ "smlal2 v17.4s, v30.8h, v8.8h\n"
+ "smlal v14.4s, v30.4h, v7.4h\n"
+ "smlal2 v9.4s, v30.8h, v7.8h\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "smlal2 v26.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x15]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal v18.4s, v28.4h, v2.4h\n"
+ "smlal v12.4s, v29.4h, v3.4h\n"
+ "smlal2 v17.4s, v29.8h, v3.8h\n"
+ "smlal2 v10.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal v18.4s, v30.4h, v5.4h\n"
+ "smlal v14.4s, v28.4h, v5.4h\n"
+ "smlal2 v9.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x19, x15]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v10.4s, v31.8h, v3.8h\n"
+ "smlal2 v26.4s, v30.8h, v5.8h\n"
+ "add x15, x15, #0x8\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "smlal v18.4s, v29.4h, v6.4h\n"
+ "smlal2 v10.4s, v29.8h, v7.8h\n"
+ "smlal2 v26.4s, v29.8h, v6.8h\n"
+ "smlal v12.4s, v31.4h, v6.4h\n"
+ "smlal v14.4s, v30.4h, v8.4h\n"
+ "sqdmulh v12.4s, v12.4s, v21.4s\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "smlal v18.4s, v28.4h, v7.4h\n"
+ "sqdmulh v14.4s, v14.4s, v21.4s\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "smlal2 v9.4s, v30.8h, v8.8h\n"
+ "sqdmulh v16.4s, v16.4s, v21.4s\n"
+ "smlal2 v10.4s, v28.8h, v8.8h\n"
+ "smlal2 v26.4s, v28.8h, v7.8h\n"
+ "sqdmulh v18.4s, v18.4s, v21.4s\n"
+ "and v29.16b, v12.16b, v24.16b\n"
+ "sqdmulh v17.4s, v17.4s, v19.4s\n"
+ "and v22.16b, v14.16b, v24.16b\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "and v21.16b, v16.16b, v24.16b\n"
+ "sqdmulh v10.4s, v10.4s, v19.4s\n"
+ "and v20.16b, v18.16b, v24.16b\n"
+ "sqdmulh v26.4s, v26.4s, v19.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v19.16b, v17.16b, v23.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v30.16b, v9.16b, v23.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v10.16b, v23.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v28.16b, v26.16b, v23.16b\n"
+ "sqadd v12.4s, v12.4s, v29.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v22.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v20.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v12.4s, v12.4s, v24.4s\n"
+ "sqadd v17.4s, v17.4s, v19.4s\n"
+ "srshl v14.4s, v14.4s, v24.4s\n"
+ "sqadd v9.4s, v9.4s, v30.4s\n"
+ "srshl v16.4s, v16.4s, v24.4s\n"
+ "sqadd v10.4s, v10.4s, v3.4s\n"
+ "srshl v18.4s, v18.4s, v24.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "srshl v17.4s, v17.4s, v23.4s\n"
+ "sqxtn v12.4h, v12.4s\n"
+ "srshl v9.4s, v9.4s, v23.4s\n"
+ "sqxtn v14.4h, v14.4s\n"
+ "srshl v10.4s, v10.4s, v23.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "srshl v26.4s, v26.4s, v23.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "sqxtn2 v12.8h, v17.4s\n"
+ "sqxtn2 v14.8h, v9.4s\n"
+ "sqxtn2 v16.8h, v10.4s\n"
+ "sqxtn2 v18.8h, v26.4s\n"
+ "sqadd v12.8h, v12.8h, v13.8h\n"
+ "sqadd v14.8h, v14.8h, v13.8h\n"
+ "sqadd v16.8h, v16.8h, v13.8h\n"
+ "sqadd v18.8h, v18.8h, v13.8h\n"
+ "smax v12.8h, v12.8h, v11.8h\n"
+ "smax v14.8h, v14.8h, v11.8h\n"
+ "smax v16.8h, v16.8h, v11.8h\n"
+ "smax v18.8h, v18.8h, v11.8h\n"
+ "smin v12.8h, v12.8h, v25.8h\n"
+ "smin v14.8h, v14.8h, v25.8h\n"
+ "smin v16.8h, v16.8h, v25.8h\n"
+ "smin v18.8h, v18.8h, v25.8h\n"
+ "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "uzp1 v14.16b, v14.16b, v14.16b\n"
+ "str d12, [x10, x14]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d14, [x9, x14]\n"
+ "str d16, [x28, x14]\n"
+ "str d18, [x27, x14]\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q12, [x19, #0x0]\n"
+ "add x14, x14, #0x8\n"
+ "ldr q17, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v14.16b, v12.16b\n"
+ "mov v9.16b, v17.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v16.16b, v12.16b\n"
+ "mov v10.16b, v17.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v26.16b, v17.16b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldp x23, x22, [x13, #0x0]\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x19, [x13, #0x20]\n"
+ "ldr d31, [x23, x15]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v5.8h, v5.8b, v15.8b\n"
+ "ldr d30, [x22, x15]\n"
+ "ldr d29, [x21, x15]\n"
+ "usubl v6.8h, v6.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "ldr d28, [x20, x15]\n"
+ "ldr d27, [x19, x15]\n"
+ "usubl v8.8h, v8.8b, v15.8b\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "smlal v12.4s, v31.4h, v4.4h\n"
+ "smlal2 v17.4s, v31.8h, v4.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "ldr x26, [x13, #0x38]\n"
+ "smlal v14.4s, v31.4h, v3.4h\n"
+ "smlal2 v9.4s, v31.8h, v3.8h\n"
+ "ldr x20, [x13, #0x30]\n"
+ "ldr x25, [x13, #0x40]\n"
+ "smlal v12.4s, v30.4h, v0.4h\n"
+ "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x13, #0x48]\n"
+ "ldr d30, [x19, x15]\n"
+ "smlal v14.4s, v29.4h, v2.4h\n"
+ "smlal2 v9.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x13, #0x50]\n"
+ "ldr x23, [x13, #0x58]\n"
+ "smlal v18.4s, v31.4h, v0.4h\n"
+ "smlal2 v26.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x15]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v12.4s, v28.4h, v5.4h\n"
+ "smlal2 v17.4s, v28.8h, v5.8h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x22, [x13, #0x60]\n"
+ "smlal v14.4s, v28.4h, v4.4h\n"
+ "smlal2 v9.4s, v28.8h, v4.8h\n"
+ "ldr x21, [x13, #0x68]\n"
+ "ldr x20, [x13, #0x70]\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "ldr x19, [x13, #0x78]\n"
+ "ldr q21, [x12, #0x0]\n"
+ "smlal v18.4s, v28.4h, v1.4h\n"
+ "smlal2 v26.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x15]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v12.4s, v27.4h, v7.4h\n"
+ "smlal2 v17.4s, v27.8h, v7.8h\n"
+ "ldr q24, [x11, #0x0]\n"
+ "ldr q19, [x12, #0x10]\n"
+ "smlal v14.4s, v27.4h, v6.4h\n"
+ "smlal2 v9.4s, v27.8h, v6.8h\n"
+ "ldr q23, [x11, #0x10]\n"
+ "tst x8, #0x7\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v10.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x15]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v18.4s, v27.4h, v3.4h\n"
+ "smlal2 v26.4s, v27.8h, v3.8h\n"
+ "add x12, x12, #0x20\n"
+ "add x11, x11, #0x20\n"
+ "smlal v12.4s, v28.4h, v1.4h\n"
+ "smlal2 v17.4s, v28.8h, v1.8h\n"
+ "smlal v14.4s, v28.4h, v0.4h\n"
+ "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x15]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal v18.4s, v29.4h, v8.4h\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "smlal2 v26.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v12.4s, v31.4h, v2.4h\n"
+ "smlal2 v17.4s, v31.8h, v2.8h\n"
+ "smlal v14.4s, v31.4h, v1.4h\n"
+ "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x15]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal v18.4s, v30.4h, v4.4h\n"
+ "smlal v12.4s, v30.4h, v8.4h\n"
+ "smlal2 v17.4s, v30.8h, v8.8h\n"
+ "smlal v14.4s, v30.4h, v7.4h\n"
+ "smlal2 v9.4s, v30.8h, v7.8h\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "smlal2 v26.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x15]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal v18.4s, v28.4h, v2.4h\n"
+ "smlal v12.4s, v29.4h, v3.4h\n"
+ "smlal2 v17.4s, v29.8h, v3.8h\n"
+ "smlal2 v10.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal v18.4s, v30.4h, v5.4h\n"
+ "smlal v14.4s, v28.4h, v5.4h\n"
+ "smlal2 v9.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x19, x15]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v10.4s, v31.8h, v3.8h\n"
+ "smlal2 v26.4s, v30.8h, v5.8h\n"
+ "add x15, x15, #0x8\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "smlal v18.4s, v29.4h, v6.4h\n"
+ "smlal2 v10.4s, v29.8h, v7.8h\n"
+ "smlal2 v26.4s, v29.8h, v6.8h\n"
+ "smlal v12.4s, v31.4h, v6.4h\n"
+ "smlal v14.4s, v30.4h, v8.4h\n"
+ "sqdmulh v12.4s, v12.4s, v21.4s\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "smlal v18.4s, v28.4h, v7.4h\n"
+ "sqdmulh v14.4s, v14.4s, v21.4s\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "smlal2 v9.4s, v30.8h, v8.8h\n"
+ "sqdmulh v16.4s, v16.4s, v21.4s\n"
+ "smlal2 v10.4s, v28.8h, v8.8h\n"
+ "smlal2 v26.4s, v28.8h, v7.8h\n"
+ "sqdmulh v18.4s, v18.4s, v21.4s\n"
+ "and v29.16b, v12.16b, v24.16b\n"
+ "sqdmulh v17.4s, v17.4s, v19.4s\n"
+ "and v22.16b, v14.16b, v24.16b\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "and v21.16b, v16.16b, v24.16b\n"
+ "sqdmulh v10.4s, v10.4s, v19.4s\n"
+ "and v20.16b, v18.16b, v24.16b\n"
+ "sqdmulh v26.4s, v26.4s, v19.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v19.16b, v17.16b, v23.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v30.16b, v9.16b, v23.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v10.16b, v23.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v28.16b, v26.16b, v23.16b\n"
+ "sqadd v12.4s, v12.4s, v29.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v22.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v20.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v12.4s, v12.4s, v24.4s\n"
+ "sqadd v17.4s, v17.4s, v19.4s\n"
+ "srshl v14.4s, v14.4s, v24.4s\n"
+ "sqadd v9.4s, v9.4s, v30.4s\n"
+ "srshl v16.4s, v16.4s, v24.4s\n"
+ "sqadd v10.4s, v10.4s, v3.4s\n"
+ "srshl v18.4s, v18.4s, v24.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "srshl v17.4s, v17.4s, v23.4s\n"
+ "sqxtn v12.4h, v12.4s\n"
+ "srshl v9.4s, v9.4s, v23.4s\n"
+ "sqxtn v14.4h, v14.4s\n"
+ "srshl v10.4s, v10.4s, v23.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "srshl v26.4s, v26.4s, v23.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "sqxtn2 v12.8h, v17.4s\n"
+ "sqxtn2 v14.8h, v9.4s\n"
+ "sqxtn2 v16.8h, v10.4s\n"
+ "sqxtn2 v18.8h, v26.4s\n"
+ "sqadd v12.8h, v12.8h, v13.8h\n"
+ "sqadd v14.8h, v14.8h, v13.8h\n"
+ "sqadd v16.8h, v16.8h, v13.8h\n"
+ "sqadd v18.8h, v18.8h, v13.8h\n"
+ "smax v12.8h, v12.8h, v11.8h\n"
+ "smax v14.8h, v14.8h, v11.8h\n"
+ "smax v16.8h, v16.8h, v11.8h\n"
+ "smax v18.8h, v18.8h, v11.8h\n"
+ "smin v12.8h, v12.8h, v25.8h\n"
+ "smin v14.8h, v14.8h, v25.8h\n"
+ "smin v16.8h, v16.8h, v25.8h\n"
+ "smin v18.8h, v18.8h, v25.8h\n"
+ "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "uzp1 v14.16b, v14.16b, v14.16b\n"
+ "str d12, [x10, x14]\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "str d14, [x9, x14]\n"
+ "str d16, [x28, x14]\n"
+ "str d18, [x27, x14]\n"
+ "add x14, x14, #0x8\n"
+ "beq 64f\n"
+ "add x17, x17, #0x48\n"
+ "3:" // Oddments
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x8, #2, 5f\n"
+ "ld1 { v12.4s }, [x19], #0x10\n"
+ "tbz x8, #1, 4f\n"
+ "ld1 { v17.d }[0], [x19], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v17.s }[2], [x19]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 7f\n"
+ "ld1 { v17.s }[0], [x19]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x8, #1, 6f\n"
+ "ld1 { v12.d }[0], [x19], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v12.s }[2], [x19]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 7f\n"
+ "ld1 { v12.s }[0], [x19]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "mov v14.16b, v12.16b\n"
+ "mov v9.16b, v17.16b\n"
+ "ldr d2, [x17, #0x10]\n"
+ "ldr d3, [x17, #0x18]\n"
+ "mov v16.16b, v12.16b\n"
+ "mov v10.16b, v17.16b\n"
+ "ldr d4, [x17, #0x20]\n"
+ "ldr d5, [x17, #0x28]\n"
+ "mov v18.16b, v12.16b\n"
+ "mov v26.16b, v17.16b\n"
+ "ldr d6, [x17, #0x30]\n"
+ "ldr d7, [x17, #0x38]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ldp x23, x22, [x13, #0x0]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldp x21, x20, [x13, #0x10]\n"
+ "ldr x19, [x13, #0x20]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "usubl v5.8h, v5.8b, v15.8b\n"
+ "usubl v6.8h, v6.8b, v15.8b\n"
+ "usubl v7.8h, v7.8b, v15.8b\n"
+ "usubl v8.8h, v8.8b, v15.8b\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 9f\n"
+ "ld1 { v31.s }[0], [x23], #0x4\n"
+ "ld1 { v30.s }[0], [x22], #0x4\n"
+ "ld1 { v29.s }[0], [x21], #0x4\n"
+ "ld1 { v28.s }[0], [x20], #0x4\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 8f\n"
+ "ld1 { v31.h }[2], [x23], #0x2\n"
+ "ld1 { v30.h }[2], [x22], #0x2\n"
+ "ld1 { v29.h }[2], [x21], #0x2\n"
+ "ld1 { v28.h }[2], [x20], #0x2\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[6], [x23]\n"
+ "ld1 { v30.b }[6], [x22]\n"
+ "ld1 { v29.b }[6], [x21]\n"
+ "ld1 { v28.b }[6], [x20]\n"
+ "ld1 { v27.b }[6], [x19]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[4], [x23]\n"
+ "ld1 { v30.b }[4], [x22]\n"
+ "ld1 { v29.b }[4], [x21]\n"
+ "ld1 { v28.b }[4], [x20]\n"
+ "ld1 { v27.b }[4], [x19]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x8, #1, 10f\n"
+ "ld1 { v31.h }[0], [x23], #0x2\n"
+ "ld1 { v30.h }[0], [x22], #0x2\n"
+ "ld1 { v29.h }[0], [x21], #0x2\n"
+ "ld1 { v28.h }[0], [x20], #0x2\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[2], [x23]\n"
+ "ld1 { v30.b }[2], [x22]\n"
+ "ld1 { v29.b }[2], [x21]\n"
+ "ld1 { v28.b }[2], [x20]\n"
+ "ld1 { v27.b }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[0], [x23]\n"
+ "ld1 { v30.b }[0], [x22]\n"
+ "ld1 { v29.b }[0], [x21]\n"
+ "ld1 { v28.b }[0], [x20]\n"
+ "ld1 { v27.b }[0], [x19]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v12.4s, v31.4h, v4.4h\n"
+ "smlal2 v17.4s, v31.8h, v4.8h\n"
+ "ldr x21, [x13, #0x28]\n"
+ "smlal v14.4s, v31.4h, v3.4h\n"
+ "smlal2 v9.4s, v31.8h, v3.8h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "add x21, x21, x15\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v16.4s, v31.4h, v1.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal v18.4s, v31.4h, v0.4h\n"
+ "smlal2 v26.4s, v31.8h, v0.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v12.4s, v30.4h, v0.4h\n"
+ "smlal2 v17.4s, v30.8h, v0.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v14.4s, v29.4h, v2.4h\n"
+ "smlal2 v9.4s, v29.8h, v2.8h\n"
+ "smlal v12.4s, v28.4h, v5.4h\n"
+ "smlal2 v17.4s, v28.8h, v5.8h\n"
+ "smlal v14.4s, v28.4h, v4.4h\n"
+ "smlal2 v9.4s, v28.8h, v4.8h\n"
+ "smlal v16.4s, v28.4h, v2.4h\n"
+ "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "smlal v18.4s, v28.4h, v1.4h\n"
+ "smlal2 v26.4s, v28.8h, v1.8h\n"
+ "tbz x8, #2, 13f\n"
+ "ld1 { v31.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 12f\n"
+ "ld1 { v31.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[6], [x21]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[4], [x21]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x8, #1, 14f\n"
+ "ld1 { v31.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[2], [x21]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 15f\n"
+ "ld1 { v31.b }[0], [x21]\n"
+ "15:" // Oddments: Load (3, 0): Bit 2: End
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v16.4s, v31.4h, v6.4h\n"
+ "smlal2 v10.4s, v31.8h, v6.8h\n"
+ "ldr x20, [x13, #0x30]\n"
+ "smlal v12.4s, v27.4h, v7.4h\n"
+ "smlal2 v17.4s, v27.8h, v7.8h\n"
+ "add x20, x20, x15\n"
+ "smlal v14.4s, v27.4h, v6.4h\n"
+ "smlal2 v9.4s, v27.8h, v6.8h\n"
+ "smlal v16.4s, v27.4h, v4.4h\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "smlal v18.4s, v27.4h, v3.4h\n"
+ "smlal2 v26.4s, v27.8h, v3.8h\n"
+ "tbz x8, #2, 17f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 16f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x8, #1, 18f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 19f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "19:" // Oddments: Load (3, 3): Bit 2: End
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x26, [x13, #0x38]\n"
+ "smlal v18.4s, v29.4h, v8.4h\n"
+ "smlal2 v26.4s, v29.8h, v8.8h\n"
+ "add x26, x26, x15\n"
+ "tbz x8, #2, 21f\n"
+ "ld1 { v28.s }[0], [x26], #0x4\n"
+ "tbz x8, #1, 20f\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 1): Bit 2: Unset
+ "tbz x8, #1, 22f\n"
+ "ld1 { v28.h }[0], [x26], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 23f\n"
+ "ld1 { v28.b }[0], [x26]\n"
+ "23:" // Oddments: Load (0, 1): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x25, [x13, #0x40]\n"
+ "smlal v12.4s, v28.4h, v1.4h\n"
+ "smlal2 v17.4s, v28.8h, v1.8h\n"
+ "smlal v14.4s, v28.4h, v0.4h\n"
+ "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "add x25, x25, x15\n"
+ "tbz x8, #2, 25f\n"
+ "ld1 { v31.s }[0], [x25], #0x4\n"
+ "tbz x8, #1, 24f\n"
+ "ld1 { v31.h }[2], [x25], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[6], [x25]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[4], [x25]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (0, 2): Bit 2: Unset
+ "tbz x8, #1, 26f\n"
+ "ld1 { v31.h }[0], [x25], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[2], [x25]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 27f\n"
+ "ld1 { v31.b }[0], [x25]\n"
+ "27:" // Oddments: Load (0, 2): Bit 2: End
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr x19, [x13, #0x48]\n"
+ "smlal v12.4s, v31.4h, v2.4h\n"
+ "smlal2 v17.4s, v31.8h, v2.8h\n"
+ "smlal v14.4s, v31.4h, v1.4h\n"
+ "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 29f\n"
+ "ld1 { v30.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 28f\n"
+ "ld1 { v30.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[6], [x19]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[4], [x19]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x8, #1, 30f\n"
+ "ld1 { v30.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[2], [x19]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 31f\n"
+ "ld1 { v30.b }[0], [x19]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x24, [x13, #0x50]\n"
+ "smlal v12.4s, v30.4h, v8.4h\n"
+ "smlal2 v17.4s, v30.8h, v8.8h\n"
+ "smlal v14.4s, v30.4h, v7.4h\n"
+ "smlal2 v9.4s, v30.8h, v7.8h\n"
+ "add x24, x24, x15\n"
+ "smlal v16.4s, v30.4h, v5.4h\n"
+ "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "smlal v18.4s, v30.4h, v4.4h\n"
+ "smlal2 v26.4s, v30.8h, v4.8h\n"
+ "tbz x8, #2, 33f\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "tbz x8, #1, 32f\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (1, 0): Bit 2: Unset
+ "tbz x8, #1, 34f\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "35:" // Oddments: Load (1, 0): Bit 2: End
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x23, [x13, #0x58]\n"
+ "smlal v12.4s, v29.4h, v3.4h\n"
+ "smlal2 v17.4s, v29.8h, v3.8h\n"
+ "smlal v16.4s, v29.4h, v0.4h\n"
+ "smlal2 v10.4s, v29.8h, v0.8h\n"
+ "add x23, x23, x15\n"
+ "tbz x8, #2, 37f\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "tbz x8, #1, 36f\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x8, #1, 38f\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 39f\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "39:" // Oddments: Load (1, 3): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x22, [x13, #0x60]\n"
+ "smlal v14.4s, v28.4h, v5.4h\n"
+ "smlal2 v9.4s, v28.8h, v5.8h\n"
+ "smlal v18.4s, v28.4h, v2.4h\n"
+ "smlal2 v26.4s, v28.8h, v2.8h\n"
+ "add x22, x22, x15\n"
+ "tbz x8, #2, 41f\n"
+ "ld1 { v31.s }[0], [x22], #0x4\n"
+ "tbz x8, #1, 40f\n"
+ "ld1 { v31.h }[2], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[6], [x22]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[4], [x22]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x8, #1, 42f\n"
+ "ld1 { v31.h }[0], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[2], [x22]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 43f\n"
+ "ld1 { v31.b }[0], [x22]\n"
+ "43:" // Oddments: Load (2, 0): Bit 2: End
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr x21, [x13, #0x68]\n"
+ "smlal v12.4s, v31.4h, v6.4h\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "smlal v16.4s, v31.4h, v3.4h\n"
+ "smlal2 v10.4s, v31.8h, v3.8h\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 45f\n"
+ "ld1 { v30.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 44f\n"
+ "ld1 { v30.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[6], [x21]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[4], [x21]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x8, #1, 46f\n"
+ "ld1 { v30.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[2], [x21]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 47f\n"
+ "ld1 { v30.b }[0], [x21]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x20, [x13, #0x70]\n"
+ "smlal v14.4s, v30.4h, v8.4h\n"
+ "smlal2 v9.4s, v30.8h, v8.8h\n"
+ "smlal v18.4s, v30.4h, v5.4h\n"
+ "smlal2 v26.4s, v30.8h, v5.8h\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 49f\n"
+ "ld1 { v29.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 48f\n"
+ "ld1 { v29.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x8, #1, 50f\n"
+ "ld1 { v29.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 51f\n"
+ "ld1 { v29.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x19, [x13, #0x78]\n"
+ "smlal v16.4s, v29.4h, v7.4h\n"
+ "smlal2 v10.4s, v29.8h, v7.8h\n"
+ "smlal v18.4s, v29.4h, v6.4h\n"
+ "smlal2 v26.4s, v29.8h, v6.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 53f\n"
+ "ld1 { v28.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 52f\n"
+ "ld1 { v28.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[6], [x19]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[4], [x19]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x8, #1, 54f\n"
+ "ld1 { v28.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[2], [x19]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 55f\n"
+ "ld1 { v28.b }[0], [x19]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v16.4s, v28.4h, v8.4h\n"
+ "smlal2 v10.4s, v28.8h, v8.8h\n"
+ "smlal v18.4s, v28.4h, v7.4h\n"
+ "smlal2 v26.4s, v28.8h, v7.8h\n"
+ "tbz x8, #2, 57f\n"
+ "ld1 { v21.4s }, [x12], #0x10\n"
+ "ld1 { v24.4s }, [x11], #0x10\n"
+ "tbz x8, #1, 56f\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "ld1 { v23.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v19.s }[2], [x12]\n"
+ "ld1 { v23.s }[2], [x11]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 59f\n"
+ "ld1 { v19.s }[0], [x12]\n"
+ "ld1 { v23.s }[0], [x11]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x8, #1, 58f\n"
+ "ld1 { v21.d }[0], [x12], #0x8\n"
+ "ld1 { v24.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v21.s }[2], [x12]\n"
+ "ld1 { v24.s }[2], [x11]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 59f\n"
+ "ld1 { v21.s }[0], [x12]\n"
+ "ld1 { v24.s }[0], [x11]\n"
+ "59:" // Oddments: Load requant params: Bit 2: End
+ "sqdmulh v12.4s, v12.4s, v21.4s\n"
+ "sqdmulh v14.4s, v14.4s, v21.4s\n"
+ "add x10, x10, x14\n"
+ "add x9, x9, x14\n"
+ "sqdmulh v16.4s, v16.4s, v21.4s\n"
+ "sqdmulh v18.4s, v18.4s, v21.4s\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "and v29.16b, v12.16b, v24.16b\n"
+ "sqdmulh v17.4s, v17.4s, v19.4s\n"
+ "and v22.16b, v14.16b, v24.16b\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "and v21.16b, v16.16b, v24.16b\n"
+ "sqdmulh v10.4s, v10.4s, v19.4s\n"
+ "and v20.16b, v18.16b, v24.16b\n"
+ "sqdmulh v26.4s, v26.4s, v19.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v19.16b, v17.16b, v23.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v30.16b, v9.16b, v23.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v10.16b, v23.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v28.16b, v26.16b, v23.16b\n"
+ "sqadd v12.4s, v12.4s, v29.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v22.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v20.4s\n"
+ "sshr v28.4s, v28.4s, #0x1f\n"
+ "srshl v12.4s, v12.4s, v24.4s\n"
+ "sqadd v17.4s, v17.4s, v19.4s\n"
+ "srshl v14.4s, v14.4s, v24.4s\n"
+ "sqadd v9.4s, v9.4s, v30.4s\n"
+ "srshl v16.4s, v16.4s, v24.4s\n"
+ "sqadd v10.4s, v10.4s, v3.4s\n"
+ "srshl v18.4s, v18.4s, v24.4s\n"
+ "sqadd v26.4s, v26.4s, v28.4s\n"
+ "srshl v17.4s, v17.4s, v23.4s\n"
+ "sqxtn v12.4h, v12.4s\n"
+ "srshl v9.4s, v9.4s, v23.4s\n"
+ "sqxtn v14.4h, v14.4s\n"
+ "srshl v10.4s, v10.4s, v23.4s\n"
+ "sqxtn v16.4h, v16.4s\n"
+ "srshl v26.4s, v26.4s, v23.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "sqxtn2 v12.8h, v17.4s\n"
+ "sqxtn2 v14.8h, v9.4s\n"
+ "sqxtn2 v16.8h, v10.4s\n"
+ "sqxtn2 v18.8h, v26.4s\n"
+ "sqadd v12.8h, v12.8h, v13.8h\n"
+ "sqadd v14.8h, v14.8h, v13.8h\n"
+ "sqadd v16.8h, v16.8h, v13.8h\n"
+ "sqadd v18.8h, v18.8h, v13.8h\n"
+ "smax v12.8h, v12.8h, v11.8h\n"
+ "smax v14.8h, v14.8h, v11.8h\n"
+ "smax v16.8h, v16.8h, v11.8h\n"
+ "smax v18.8h, v18.8h, v11.8h\n"
+ "smin v12.8h, v12.8h, v25.8h\n"
+ "smin v14.8h, v14.8h, v25.8h\n"
+ "smin v16.8h, v16.8h, v25.8h\n"
+ "smin v18.8h, v18.8h, v25.8h\n"
+ "uzp1 v12.16b, v12.16b, v12.16b\n"
+ "uzp1 v14.16b, v14.16b, v14.16b\n"
+ "uzp1 v16.16b, v16.16b, v16.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "tbz x8, #2, 61f\n"
+ "st1 { v12.s }[0], [x10], #0x4\n"
+ "st1 { v14.s }[0], [x9], #0x4\n"
+ "st1 { v16.s }[0], [x28], #0x4\n"
+ "st1 { v18.s }[0], [x27], #0x4\n"
+ "tbz x8, #1, 60f\n"
+ "st1 { v12.h }[2], [x10], #0x2\n"
+ "st1 { v14.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x28], #0x2\n"
+ "st1 { v18.h }[2], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v12.b }[6], [x10], #0x1\n"
+ "st1 { v14.b }[6], [x9], #0x1\n"
+ "st1 { v16.b }[6], [x28], #0x1\n"
+ "st1 { v18.b }[6], [x27], #0x1\n"
+ "b 63f\n"
+ "60:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 63f\n"
+ "st1 { v12.b }[4], [x10], #0x1\n"
+ "st1 { v14.b }[4], [x9], #0x1\n"
+ "st1 { v16.b }[4], [x28], #0x1\n"
+ "st1 { v18.b }[4], [x27], #0x1\n"
+ "b 63f\n"
+ "61:" // Oddments: Bit 2: Unset
+ "tbz x8, #1, 62f\n"
+ "st1 { v12.h }[0], [x10], #0x2\n"
+ "st1 { v14.h }[0], [x9], #0x2\n"
+ "st1 { v16.h }[0], [x28], #0x2\n"
+ "st1 { v18.h }[0], [x27], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "st1 { v12.b }[2], [x10], #0x1\n"
+ "st1 { v14.b }[2], [x9], #0x1\n"
+ "st1 { v16.b }[2], [x28], #0x1\n"
+ "st1 { v18.b }[2], [x27], #0x1\n"
+ "b 63f\n"
+ "62:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 63f\n"
+ "st1 { v12.b }[0], [x10], #0x1\n"
+ "st1 { v14.b }[0], [x9], #0x1\n"
+ "st1 { v16.b }[0], [x28], #0x1\n"
+ "st1 { v18.b }[0], [x27], #0x1\n"
+ "63:" // Oddments: Bit 2: End
+ "64:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
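The quantized kernels in this patch share the epilogue seen above: sqdmulh by the per-channel multiplier, a sign fixup, a rounding right shift, narrowing, then an offset add and clamp. The following is a minimal scalar sketch of one lane of that epilogue, assuming arithmetic right shift on signed values (true for AArch64 compilers); the function name and signature are illustrative, not library API, and saturation corner cases are omitted for brevity.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar model of the vector epilogue (sqdmulh / and+sshr+sqadd fixup /
// srshl / sqxtn / sqadd c_offset / smax / smin).  Illustrative only.
static inline uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                                      int32_t c_offset, int32_t minval, int32_t maxval)
{
    // sqdmulh: high half of the doubling product, i.e. (2*acc*mul) >> 32,
    // computed here as (acc*mul) >> 31 to stay within int64_t.
    int32_t high = (int32_t) (((int64_t) acc * (int64_t) mul) >> 31);

    // Fixup: for a non-zero right shift, subtract one from negative values
    // so the rounding shift below rounds to nearest rather than upwards.
    int n = -shift;                 // per-channel shifts are stored negated
    if (n > 0 && high < 0) high -= 1;

    // srshl by a negative amount: rounding arithmetic right shift.
    int32_t scaled = n > 0 ? (int32_t) (((int64_t) high + (1ll << (n - 1))) >> n) : high;

    // sqxtn narrows to 16 bits; then add the output offset and clamp.
    int32_t out = scaled + c_offset;
    return (uint8_t) std::min(std::max(out, minval), maxval);
}

int main()
{
    // 1234 scaled by 0.5 (mul = 1 << 30), shifted right by 4, offset 128:
    // 617 / 16 = 38.5625 -> 39, plus 128 -> prints 167.
    std::printf("%u\n", requantize_lane(1234, 1 << 30, -4, 128, 0, 255));
    return 0;
}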
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..b479dbf7a4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 3;
+ constexpr static unsigned int kernel_cols = 3;
+
+ constexpr static unsigned int stride_rows = 2;
+ constexpr static unsigned int stride_cols = 2;
+
+ a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
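The tile geometry in the strategy above also fixes the input patch size: a 2x2 output tile from a 3x3 kernel at stride 2 consumes a 5x5 input patch, which is why the Params block in the kernel body that follows carries 25 input pointers. (The stride-1 3x3 kernel earlier, whose oddment labels run up to Load (3, 3), correspondingly reads a 4x4 patch.) A small sketch of that arithmetic, with an illustrative helper name that is not part of the library:

#include <cstdio>

// Illustrative helper (not library API): input extent consumed by one
// dense depthwise output tile along one axis.
constexpr unsigned int input_extent(unsigned int out, unsigned int kernel,
                                    unsigned int stride)
{
    return (out - 1) * stride + kernel;
}

int main()
{
    // 2x2 output, 3x3 kernel, stride 2 -> 5x5 patch = 25 input pointers,
    // matching "const uint8_t *inptrs[25]" in the Params struct below.
    const unsigned int n = input_extent(2, 3, 2);
    std::printf("%u x %u patch -> %u input pointers\n", n, n, n * n);
    return 0;
}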
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..49f69c4204
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,1395 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[25];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
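+ // The assembly walks the input patch in a kernel-specific order, so the
+ // constructor permutes the caller's pointer array rather than copying
+ // it verbatim.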
+ inptrs[0] = inptrs_raw[12];
+ inptrs[1] = inptrs_raw[0];
+ inptrs[2] = inptrs_raw[1];
+ inptrs[3] = inptrs_raw[3];
+ inptrs[4] = inptrs_raw[4];
+ inptrs[5] = inptrs_raw[5];
+ inptrs[6] = inptrs_raw[6];
+ inptrs[7] = inptrs_raw[2];
+ inptrs[8] = inptrs_raw[8];
+ inptrs[9] = inptrs_raw[9];
+ inptrs[10] = inptrs_raw[7];
+ inptrs[11] = inptrs_raw[15];
+ inptrs[12] = inptrs_raw[10];
+ inptrs[13] = inptrs_raw[16];
+ inptrs[14] = inptrs_raw[11];
+ inptrs[15] = inptrs_raw[18];
+ inptrs[16] = inptrs_raw[13];
+ inptrs[17] = inptrs_raw[19];
+ inptrs[18] = inptrs_raw[20];
+ inptrs[19] = inptrs_raw[14];
+ inptrs[20] = inptrs_raw[21];
+ inptrs[21] = inptrs_raw[17];
+ inptrs[22] = inptrs_raw[23];
+ inptrs[23] = inptrs_raw[22];
+ inptrs[24] = inptrs_raw[24];
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
+ __asm__ __volatile__(
+ "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "add x22, x19, %[offsetof_Requantize32_c_offset]\n"
+ "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x20, x19, %[offsetof_Requantize32_minval]\n"
+ "add x19, x19, %[offsetof_Requantize32_maxval]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "ld1r { v16.16b }, [x23]\n"
+ "ld1r { v12.8h }, [x22]\n"
+ "lsr x16, x8, #0x3\n"
+ "mov x15, #0x0\n"
+ "ld1r { v14.8h }, [x20]\n"
+ "ld1r { v21.8h }, [x19]\n"
+ "mov x14, #0x0\n"
+ "add x13, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x12, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x10, x9, [x21, #0x0]\n"
+ "ldp x28, x27, [x21, #0x10]\n"
+ "cbz x16, 3f\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x19, #0x0]\n"
+ "subs x16, x16, #0x1\n"
+ "mov v13.16b, v15.16b\n"
+ "ldr q18, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v17.16b, v18.16b\n"
+ "mov v11.16b, v15.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v10.16b, v18.16b\n"
+ "mov v23.16b, v15.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v9.16b, v18.16b\n"
+ "usubl v0.8h, v0.8b, v16.8b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "usubl v1.8h, v1.8b, v16.8b\n"
+ "usubl v2.8h, v2.8b, v16.8b\n"
+ "ldp x26, x25, [x13, #0x0]\n"
+ "ldp x24, x23, [x13, #0x10]\n"
+ "usubl v3.8h, v3.8b, v16.8b\n"
+ "usubl v4.8h, v4.8b, v16.8b\n"
+ "ldp x22, x21, [x13, #0x20]\n"
+ "ldp x20, x19, [x13, #0x30]\n"
+ "usubl v5.8h, v5.8b, v16.8b\n"
+ "usubl v6.8h, v6.8b, v16.8b\n"
+ "ldr d31, [x26, x15]\n"
+ "ldr d30, [x25, x15]\n"
+ "usubl v7.8h, v7.8b, v16.8b\n"
+ "usubl v8.8h, v8.8b, v16.8b\n"
+ "ldr d29, [x24, x15]\n"
+ "ldr d28, [x23, x15]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr d27, [x22, x15]\n"
+ "ldr d26, [x21, x15]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr d25, [x20, x15]\n"
+ "ldr d24, [x19, x15]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "smlal v15.4s, v31.4h, v8.4h\n"
+ "smlal2 v18.4s, v31.8h, v8.8h\n"
+ "ldr x24, [x13, #0x40]\n"
+ "ldr x23, [x13, #0x48]\n"
+ "smlal v13.4s, v31.4h, v6.4h\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x13, #0x50]\n"
+ "ldr x19, [x13, #0x58]\n"
+ "smlal v15.4s, v30.4h, v0.4h\n"
+ "smlal2 v18.4s, v30.8h, v0.8h\n"
+ "ldr x22, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "smlal2 v17.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x23, x15]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v15.4s, v29.4h, v1.4h\n"
+ "smlal2 v18.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v13.4s, v27.4h, v2.4h\n"
+ "smlal2 v17.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x15]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v15.4s, v26.4h, v3.4h\n"
+ "smlal2 v18.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x19, x15]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v13.4s, v24.4h, v0.4h\n"
+ "smlal2 v17.4s, v24.8h, v0.8h\n"
+ "ldr x21, [x13, #0x80]\n"
+ "ldr x19, [x13, #0x68]\n"
+ "smlal v15.4s, v25.4h, v4.4h\n"
+ "smlal2 v18.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x20, x15]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v13.4s, v29.4h, v4.4h\n"
+ "smlal2 v17.4s, v29.8h, v4.8h\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr d29, [x19, x15]\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v18.4s, v24.8h, v2.8h\n"
+ "ldr x19, [x13, #0x70]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v13.4s, v28.4h, v5.4h\n"
+ "smlal2 v17.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x15]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "ldr x24, [x13, #0x98]\n"
+ "ldr d24, [x19, x15]\n"
+ "smlal v15.4s, v27.4h, v5.4h\n"
+ "smlal2 v18.4s, v27.8h, v5.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ldr x23, [x13, #0x90]\n"
+ "smlal v13.4s, v27.4h, v3.4h\n"
+ "smlal2 v17.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x22, x15]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal v11.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x13, #0xa8]\n"
+ "ldr x19, [x13, #0xa0]\n"
+ "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal2 v9.4s, v31.8h, v0.8h\n"
+ "ldr d26, [x20, x15]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v23.4s, v27.4h, v4.4h\n"
+ "smlal v11.4s, v25.4h, v0.4h\n"
+ "ldr x21, [x13, #0xb0]\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "smlal2 v9.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x19, x15]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal v15.4s, v25.4h, v6.4h\n"
+ "ldr x19, [x13, #0xc0]\n"
+ "ldr q22, [x12, #0x0]\n"
+ "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "smlal v11.4s, v29.4h, v4.4h\n"
+ "ldr d25, [x23, x15]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal2 v10.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "smlal2 v9.4s, v28.8h, v1.8h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v23.4s, v26.4h, v5.4h\n"
+ "smlal v15.4s, v24.4h, v7.4h\n"
+ "ldr q31, [x11, #0x0]\n"
+ "ldr q19, [x12, #0x10]\n"
+ "smlal2 v18.4s, v24.8h, v7.8h\n"
+ "smlal v11.4s, v24.4h, v1.4h\n"
+ "sqdmulh v15.4s, v15.4s, v22.4s\n"
+ "ldr q30, [x11, #0x10]\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x15]\n"
+ "smlal2 v9.4s, v26.8h, v5.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "ldr d26, [x21, x15]\n"
+ "smlal2 v9.4s, v29.8h, v2.8h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v11.4s, v25.4h, v6.4h\n"
+ "smlal v23.4s, v24.4h, v3.4h\n"
+ "and v4.16b, v15.16b, v31.16b\n"
+ "add x17, x17, #0x48\n"
+ "smlal v13.4s, v28.4h, v7.4h\n"
+ "smlal2 v17.4s, v28.8h, v7.8h\n"
+ "sqdmulh v18.4s, v18.4s, v19.4s\n"
+ "subs x16, x16, #0x1\n"
+ "smlal2 v10.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x20, x15]\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v11.4s, v27.4h, v7.4h\n"
+ "smlal v23.4s, v26.4h, v7.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "add x12, x12, #0x20\n"
+ "smlal v13.4s, v29.4h, v8.4h\n"
+ "smlal2 v17.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x19, x15]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal2 v10.4s, v27.8h, v7.8h\n"
+ "smlal2 v9.4s, v26.8h, v7.8h\n"
+ "sqdmulh v13.4s, v13.4s, v22.4s\n"
+ "add x15, x15, #0x8\n"
+ "smlal v11.4s, v24.4h, v5.4h\n"
+ "smlal v23.4s, v25.4h, v6.4h\n"
+ "and v1.16b, v13.16b, v31.16b\n"
+ "add x11, x11, #0x20\n"
+ "smlal2 v10.4s, v24.8h, v5.8h\n"
+ "smlal2 v9.4s, v25.8h, v6.8h\n"
+ "sqdmulh v17.4s, v17.4s, v19.4s\n"
+ "smlal v11.4s, v25.4h, v8.4h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "sqdmulh v11.4s, v11.4s, v22.4s\n"
+ "smlal2 v10.4s, v25.8h, v8.8h\n"
+ "smlal2 v9.4s, v29.8h, v8.8h\n"
+ "sqdmulh v23.4s, v23.4s, v22.4s\n"
+ "and v22.16b, v11.16b, v31.16b\n"
+ "sqdmulh v10.4s, v10.4s, v19.4s\n"
+ "and v20.16b, v23.16b, v31.16b\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "and v19.16b, v18.16b, v30.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "and v27.16b, v17.16b, v30.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v25.16b, v10.16b, v30.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v0.16b, v9.16b, v30.16b\n"
+ "sqadd v15.4s, v15.4s, v4.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v1.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v22.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v20.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v31.4s\n"
+ "sqadd v18.4s, v18.4s, v19.4s\n"
+ "srshl v13.4s, v13.4s, v31.4s\n"
+ "sqadd v17.4s, v17.4s, v27.4s\n"
+ "srshl v11.4s, v11.4s, v31.4s\n"
+ "sqadd v10.4s, v10.4s, v25.4s\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
+ "sqadd v9.4s, v9.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v30.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v17.4s, v17.4s, v30.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v10.4s, v10.4s, v30.4s\n"
+ "sqxtn v11.4h, v11.4s\n"
+ "srshl v9.4s, v9.4s, v30.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v15.8h, v18.4s\n"
+ "sqxtn2 v13.8h, v17.4s\n"
+ "sqxtn2 v11.8h, v10.4s\n"
+ "sqxtn2 v23.8h, v9.4s\n"
+ "sqadd v15.8h, v15.8h, v12.8h\n"
+ "sqadd v13.8h, v13.8h, v12.8h\n"
+ "sqadd v11.8h, v11.8h, v12.8h\n"
+ "sqadd v23.8h, v23.8h, v12.8h\n"
+ "smax v15.8h, v15.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v14.8h\n"
+ "smax v11.8h, v11.8h, v14.8h\n"
+ "smax v23.8h, v23.8h, v14.8h\n"
+ "smin v15.8h, v15.8h, v21.8h\n"
+ "smin v13.8h, v13.8h, v21.8h\n"
+ "smin v11.8h, v11.8h, v21.8h\n"
+ "smin v23.8h, v23.8h, v21.8h\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x10, x14]\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "str d13, [x9, x14]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d11, [x28, x14]\n"
+ "str d23, [x27, x14]\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x19, #0x0]\n"
+ "add x14, x14, #0x8\n"
+ "ldr q18, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v13.16b, v15.16b\n"
+ "mov v17.16b, v18.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v11.16b, v15.16b\n"
+ "mov v10.16b, v18.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v9.16b, v18.16b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "usubl v0.8h, v0.8b, v16.8b\n"
+ "usubl v1.8h, v1.8b, v16.8b\n"
+ "ldp x26, x25, [x13, #0x0]\n"
+ "ldp x24, x23, [x13, #0x10]\n"
+ "usubl v2.8h, v2.8b, v16.8b\n"
+ "usubl v3.8h, v3.8b, v16.8b\n"
+ "ldp x22, x21, [x13, #0x20]\n"
+ "ldp x20, x19, [x13, #0x30]\n"
+ "usubl v4.8h, v4.8b, v16.8b\n"
+ "usubl v5.8h, v5.8b, v16.8b\n"
+ "ldr d31, [x26, x15]\n"
+ "ldr d30, [x25, x15]\n"
+ "usubl v6.8h, v6.8b, v16.8b\n"
+ "usubl v7.8h, v7.8b, v16.8b\n"
+ "ldr d29, [x24, x15]\n"
+ "ldr d28, [x23, x15]\n"
+ "usubl v8.8h, v8.8b, v16.8b\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr d27, [x22, x15]\n"
+ "ldr d26, [x21, x15]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr d25, [x20, x15]\n"
+ "ldr d24, [x19, x15]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "smlal v15.4s, v31.4h, v8.4h\n"
+ "smlal2 v18.4s, v31.8h, v8.8h\n"
+ "ldr x24, [x13, #0x40]\n"
+ "ldr x23, [x13, #0x48]\n"
+ "smlal v13.4s, v31.4h, v6.4h\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x13, #0x50]\n"
+ "ldr x19, [x13, #0x58]\n"
+ "smlal v15.4s, v30.4h, v0.4h\n"
+ "smlal2 v18.4s, v30.8h, v0.8h\n"
+ "ldr x22, [x13, #0x78]\n"
+ "ldr x20, [x13, #0x60]\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "smlal2 v17.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x23, x15]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v15.4s, v29.4h, v1.4h\n"
+ "smlal2 v18.4s, v29.8h, v1.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v13.4s, v27.4h, v2.4h\n"
+ "smlal2 v17.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x15]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v15.4s, v26.4h, v3.4h\n"
+ "smlal2 v18.4s, v26.8h, v3.8h\n"
+ "ldr d26, [x19, x15]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v13.4s, v24.4h, v0.4h\n"
+ "smlal2 v17.4s, v24.8h, v0.8h\n"
+ "ldr x21, [x13, #0x80]\n"
+ "ldr x19, [x13, #0x68]\n"
+ "smlal v15.4s, v25.4h, v4.4h\n"
+ "smlal2 v18.4s, v25.8h, v4.8h\n"
+ "ldr d25, [x20, x15]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v13.4s, v29.4h, v4.4h\n"
+ "smlal2 v17.4s, v29.8h, v4.8h\n"
+ "ldr x20, [x13, #0x88]\n"
+ "ldr d29, [x19, x15]\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v18.4s, v24.8h, v2.8h\n"
+ "ldr x19, [x13, #0x70]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v13.4s, v28.4h, v5.4h\n"
+ "smlal2 v17.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x15]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "ldr x24, [x13, #0x98]\n"
+ "ldr d24, [x19, x15]\n"
+ "smlal v15.4s, v27.4h, v5.4h\n"
+ "smlal2 v18.4s, v27.8h, v5.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ldr x23, [x13, #0x90]\n"
+ "smlal v13.4s, v27.4h, v3.4h\n"
+ "smlal2 v17.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x22, x15]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal v11.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x13, #0xa8]\n"
+ "ldr x19, [x13, #0xa0]\n"
+ "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "smlal2 v9.4s, v31.8h, v0.8h\n"
+ "ldr d26, [x20, x15]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v23.4s, v27.4h, v4.4h\n"
+ "smlal v11.4s, v25.4h, v0.4h\n"
+ "ldr x21, [x13, #0xb0]\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "smlal2 v9.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x19, x15]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal v15.4s, v25.4h, v6.4h\n"
+ "ldr x19, [x13, #0xc0]\n"
+ "ldr q22, [x12, #0x0]\n"
+ "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "smlal v11.4s, v29.4h, v4.4h\n"
+ "ldr d25, [x23, x15]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal2 v10.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "smlal2 v9.4s, v28.8h, v1.8h\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v23.4s, v26.4h, v5.4h\n"
+ "smlal v15.4s, v24.4h, v7.4h\n"
+ "ldr q31, [x11, #0x0]\n"
+ "ldr q19, [x12, #0x10]\n"
+ "smlal2 v18.4s, v24.8h, v7.8h\n"
+ "smlal v11.4s, v24.4h, v1.4h\n"
+ "sqdmulh v15.4s, v15.4s, v22.4s\n"
+ "ldr q30, [x11, #0x10]\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x15]\n"
+ "smlal2 v9.4s, v26.8h, v5.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "ldr d26, [x21, x15]\n"
+ "smlal2 v9.4s, v29.8h, v2.8h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v11.4s, v25.4h, v6.4h\n"
+ "smlal v23.4s, v24.4h, v3.4h\n"
+ "and v4.16b, v15.16b, v31.16b\n"
+ "tst x8, #0x7\n"
+ "smlal v13.4s, v28.4h, v7.4h\n"
+ "smlal2 v17.4s, v28.8h, v7.8h\n"
+ "sqdmulh v18.4s, v18.4s, v19.4s\n"
+ "add x12, x12, #0x20\n"
+ "smlal2 v10.4s, v25.8h, v6.8h\n"
+ "ldr d25, [x20, x15]\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v11.4s, v27.4h, v7.4h\n"
+ "smlal v23.4s, v26.4h, v7.4h\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "add x11, x11, #0x20\n"
+ "smlal v13.4s, v29.4h, v8.4h\n"
+ "smlal2 v17.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x19, x15]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal2 v10.4s, v27.8h, v7.8h\n"
+ "smlal2 v9.4s, v26.8h, v7.8h\n"
+ "sqdmulh v13.4s, v13.4s, v22.4s\n"
+ "add x15, x15, #0x8\n"
+ "smlal v11.4s, v24.4h, v5.4h\n"
+ "smlal v23.4s, v25.4h, v6.4h\n"
+ "and v1.16b, v13.16b, v31.16b\n"
+ "smlal2 v10.4s, v24.8h, v5.8h\n"
+ "smlal2 v9.4s, v25.8h, v6.8h\n"
+ "sqdmulh v17.4s, v17.4s, v19.4s\n"
+ "smlal v11.4s, v25.4h, v8.4h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "sqdmulh v11.4s, v11.4s, v22.4s\n"
+ "smlal2 v10.4s, v25.8h, v8.8h\n"
+ "smlal2 v9.4s, v29.8h, v8.8h\n"
+ "sqdmulh v23.4s, v23.4s, v22.4s\n"
+ "and v22.16b, v11.16b, v31.16b\n"
+ "sqdmulh v10.4s, v10.4s, v19.4s\n"
+ "and v20.16b, v23.16b, v31.16b\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "and v19.16b, v18.16b, v30.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "and v27.16b, v17.16b, v30.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v25.16b, v10.16b, v30.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v0.16b, v9.16b, v30.16b\n"
+ "sqadd v15.4s, v15.4s, v4.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v1.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v22.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v20.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v31.4s\n"
+ "sqadd v18.4s, v18.4s, v19.4s\n"
+ "srshl v13.4s, v13.4s, v31.4s\n"
+ "sqadd v17.4s, v17.4s, v27.4s\n"
+ "srshl v11.4s, v11.4s, v31.4s\n"
+ "sqadd v10.4s, v10.4s, v25.4s\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
+ "sqadd v9.4s, v9.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v30.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v17.4s, v17.4s, v30.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v10.4s, v10.4s, v30.4s\n"
+ "sqxtn v11.4h, v11.4s\n"
+ "srshl v9.4s, v9.4s, v30.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v15.8h, v18.4s\n"
+ "sqxtn2 v13.8h, v17.4s\n"
+ "sqxtn2 v11.8h, v10.4s\n"
+ "sqxtn2 v23.8h, v9.4s\n"
+ "sqadd v15.8h, v15.8h, v12.8h\n"
+ "sqadd v13.8h, v13.8h, v12.8h\n"
+ "sqadd v11.8h, v11.8h, v12.8h\n"
+ "sqadd v23.8h, v23.8h, v12.8h\n"
+ "smax v15.8h, v15.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v14.8h\n"
+ "smax v11.8h, v11.8h, v14.8h\n"
+ "smax v23.8h, v23.8h, v14.8h\n"
+ "smin v15.8h, v15.8h, v21.8h\n"
+ "smin v13.8h, v13.8h, v21.8h\n"
+ "smin v11.8h, v11.8h, v21.8h\n"
+ "smin v23.8h, v23.8h, v21.8h\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x10, x14]\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "str d13, [x9, x14]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d11, [x28, x14]\n"
+ "str d23, [x27, x14]\n"
+ "add x14, x14, #0x8\n"
+ "beq 88f\n"
+ "add x17, x17, #0x48\n"
+ "3:" // Oddments
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x8, #2, 5f\n"
+ "ld1 { v15.4s }, [x19], #0x10\n"
+ "tbz x8, #1, 4f\n"
+ "ld1 { v18.d }[0], [x19], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v18.s }[2], [x19]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 7f\n"
+ "ld1 { v18.s }[0], [x19]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x8, #1, 6f\n"
+ "ld1 { v15.d }[0], [x19], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v15.s }[2], [x19]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 7f\n"
+ "ld1 { v15.s }[0], [x19]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "mov v13.16b, v15.16b\n"
+ "mov v17.16b, v18.16b\n"
+ "ldr d2, [x17, #0x10]\n"
+ "ldr d3, [x17, #0x18]\n"
+ "mov v11.16b, v15.16b\n"
+ "mov v10.16b, v18.16b\n"
+ "ldr d4, [x17, #0x20]\n"
+ "ldr d5, [x17, #0x28]\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v9.16b, v18.16b\n"
+ "ldr d6, [x17, #0x30]\n"
+ "ldr d7, [x17, #0x38]\n"
+ "usubl v0.8h, v0.8b, v16.8b\n"
+ "usubl v1.8h, v1.8b, v16.8b\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ldp x26, x25, [x13, #0x0]\n"
+ "usubl v2.8h, v2.8b, v16.8b\n"
+ "usubl v3.8h, v3.8b, v16.8b\n"
+ "ldp x24, x23, [x13, #0x10]\n"
+ "ldp x22, x21, [x13, #0x20]\n"
+ "usubl v4.8h, v4.8b, v16.8b\n"
+ "usubl v5.8h, v5.8b, v16.8b\n"
+ "ldp x20, x19, [x13, #0x30]\n"
+ "usubl v6.8h, v6.8b, v16.8b\n"
+ "usubl v7.8h, v7.8b, v16.8b\n"
+ "usubl v8.8h, v8.8b, v16.8b\n"
+ "add x26, x26, x15\n"
+ "add x25, x25, x15\n"
+ "add x24, x24, x15\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 9f\n"
+ "ld1 { v31.s }[0], [x26], #0x4\n"
+ "ld1 { v30.s }[0], [x25], #0x4\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "ld1 { v24.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 8f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v30.h }[2], [x25], #0x2\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "ld1 { v24.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v30.b }[6], [x25]\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "ld1 { v27.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "ld1 { v24.b }[6], [x19]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v30.b }[4], [x25]\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "ld1 { v27.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "ld1 { v24.b }[4], [x19]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x8, #1, 10f\n"
+ "ld1 { v31.h }[0], [x26], #0x2\n"
+ "ld1 { v30.h }[0], [x25], #0x2\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "ld1 { v24.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v30.b }[2], [x25]\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "ld1 { v27.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "ld1 { v24.b }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 11f\n"
+ "ld1 { v31.b }[0], [x26]\n"
+ "ld1 { v30.b }[0], [x25]\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "ld1 { v27.b }[0], [x22]\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "ld1 { v24.b }[0], [x19]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v15.4s, v31.4h, v8.4h\n"
+ "smlal2 v18.4s, v31.8h, v8.8h\n"
+ "ldr x24, [x13, #0x40]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v15.4s, v30.4h, v0.4h\n"
+ "smlal2 v18.4s, v30.8h, v0.8h\n"
+ "add x24, x24, x15\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v13.4s, v31.4h, v6.4h\n"
+ "smlal2 v17.4s, v31.8h, v6.8h\n"
+ "smlal v15.4s, v29.4h, v1.4h\n"
+ "smlal2 v18.4s, v29.8h, v1.8h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
+ "smlal2 v17.4s, v28.8h, v1.8h\n"
+ "smlal v15.4s, v26.4h, v3.4h\n"
+ "smlal2 v18.4s, v26.8h, v3.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v13.4s, v27.4h, v2.4h\n"
+ "smlal2 v17.4s, v27.8h, v2.8h\n"
+ "smlal v15.4s, v25.4h, v4.4h\n"
+ "smlal2 v18.4s, v25.8h, v4.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal2 v9.4s, v31.8h, v0.8h\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
+ "smlal2 v18.4s, v24.8h, v2.8h\n"
+ "smlal v13.4s, v24.4h, v0.4h\n"
+ "smlal2 v17.4s, v24.8h, v0.8h\n"
+ "tbz x8, #2, 13f\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "tbz x8, #1, 12f\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x8, #1, 14f\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x23, [x13, #0x48]\n"
+ "smlal v13.4s, v29.4h, v4.4h\n"
+ "smlal2 v17.4s, v29.8h, v4.8h\n"
+ "add x23, x23, x15\n"
+ "tbz x8, #2, 17f\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "tbz x8, #1, 16f\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[6], [x23]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[4], [x23]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x8, #1, 18f\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[2], [x23]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[0], [x23]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x21, [x13, #0x50]\n"
+ "smlal v13.4s, v28.4h, v5.4h\n"
+ "smlal2 v17.4s, v28.8h, v5.8h\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 21f\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 20f\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v27.b }[6], [x21]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 23f\n"
+ "ld1 { v27.b }[4], [x21]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (1, 2): Bit 2: Unset
+ "tbz x8, #1, 22f\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v27.b }[2], [x21]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 23f\n"
+ "ld1 { v27.b }[0], [x21]\n"
+ "23:" // Oddments: Load (1, 2): Bit 2: End
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr x19, [x13, #0x58]\n"
+ "smlal v15.4s, v27.4h, v5.4h\n"
+ "smlal2 v18.4s, v27.8h, v5.8h\n"
+ "smlal v13.4s, v27.4h, v3.4h\n"
+ "smlal2 v17.4s, v27.8h, v3.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 25f\n"
+ "ld1 { v26.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 24f\n"
+ "ld1 { v26.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v26.b }[6], [x19]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 27f\n"
+ "ld1 { v26.b }[4], [x19]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x8, #1, 26f\n"
+ "ld1 { v26.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 27f\n"
+ "ld1 { v26.b }[2], [x19]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 27f\n"
+ "ld1 { v26.b }[0], [x19]\n"
+ "27:" // Oddments: Load (3, 0): Bit 2: End
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr x20, [x13, #0x60]\n"
+ "smlal v11.4s, v26.4h, v3.4h\n"
+ "smlal2 v10.4s, v26.8h, v3.8h\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 29f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 28f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 0): Bit 2: Unset
+ "tbz x8, #1, 30f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "31:" // Oddments: Load (2, 0): Bit 2: End
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr x19, [x13, #0x68]\n"
+ "smlal v15.4s, v25.4h, v6.4h\n"
+ "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "smlal v11.4s, v25.4h, v0.4h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 33f\n"
+ "ld1 { v29.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 32f\n"
+ "ld1 { v29.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[6], [x19]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[4], [x19]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x8, #1, 34f\n"
+ "ld1 { v29.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[2], [x19]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[0], [x19]\n"
+ "35:" // Oddments: Load (3, 1): Bit 2: End
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x19, [x13, #0x70]\n"
+ "smlal v11.4s, v29.4h, v4.4h\n"
+ "smlal2 v10.4s, v29.8h, v4.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 37f\n"
+ "ld1 { v24.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 36f\n"
+ "ld1 { v24.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[6], [x19]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[4], [x19]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x8, #1, 38f\n"
+ "ld1 { v24.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[2], [x19]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[0], [x19]\n"
+ "39:" // Oddments: Load (2, 1): Bit 2: End
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ldr x22, [x13, #0x78]\n"
+ "smlal v15.4s, v24.4h, v7.4h\n"
+ "smlal2 v18.4s, v24.8h, v7.8h\n"
+ "smlal v11.4s, v24.4h, v1.4h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "add x22, x22, x15\n"
+ "tbz x8, #2, 41f\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "tbz x8, #1, 40f\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[6], [x22]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[4], [x22]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x8, #1, 42f\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[2], [x22]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[0], [x22]\n"
+ "43:" // Oddments: Load (3, 3): Bit 2: End
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr x21, [x13, #0x80]\n"
+ "smlal v23.4s, v27.4h, v4.4h\n"
+ "smlal2 v9.4s, v27.8h, v4.8h\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 45f\n"
+ "ld1 { v28.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 44f\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[6], [x21]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[4], [x21]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x8, #1, 46f\n"
+ "ld1 { v28.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[2], [x21]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[0], [x21]\n"
+ "47:" // Oddments: Load (2, 3): Bit 2: End
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x20, [x13, #0x88]\n"
+ "smlal v13.4s, v28.4h, v7.4h\n"
+ "smlal2 v17.4s, v28.8h, v7.8h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v9.4s, v28.8h, v1.8h\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 49f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 48f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[6], [x20]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[4], [x20]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x8, #1, 50f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[2], [x20]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[0], [x20]\n"
+ "51:" // Oddments: Load (3, 4): Bit 2: End
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr x23, [x13, #0x90]\n"
+ "smlal v23.4s, v26.4h, v5.4h\n"
+ "smlal2 v9.4s, v26.8h, v5.8h\n"
+ "add x23, x23, x15\n"
+ "tbz x8, #2, 53f\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "tbz x8, #1, 52f\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x8, #1, 54f\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[0], [x23]\n"
+ "55:" // Oddments: Load (4, 0): Bit 2: End
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr x24, [x13, #0x98]\n"
+ "smlal v11.4s, v25.4h, v6.4h\n"
+ "smlal2 v10.4s, v25.8h, v6.8h\n"
+ "add x24, x24, x15\n"
+ "tbz x8, #2, 57f\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "tbz x8, #1, 56f\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v29.b }[6], [x24]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 59f\n"
+ "ld1 { v29.b }[4], [x24]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x8, #1, 58f\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "tbz x8, #0, 59f\n"
+ "ld1 { v29.b }[2], [x24]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 59f\n"
+ "ld1 { v29.b }[0], [x24]\n"
+ "59:" // Oddments: Load (2, 4): Bit 2: End
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr x19, [x13, #0xa0]\n"
+ "smlal v13.4s, v29.4h, v8.4h\n"
+ "smlal2 v17.4s, v29.8h, v8.8h\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "smlal2 v9.4s, v29.8h, v2.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 61f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 60f\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[6], [x19]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[4], [x19]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x8, #1, 62f\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[2], [x19]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[0], [x19]\n"
+ "63:" // Oddments: Load (4, 1): Bit 2: End
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr x22, [x13, #0xa8]\n"
+ "smlal v11.4s, v27.4h, v7.4h\n"
+ "smlal2 v10.4s, v27.8h, v7.8h\n"
+ "add x22, x22, x15\n"
+ "tbz x8, #2, 65f\n"
+ "ld1 { v24.s }[0], [x22], #0x4\n"
+ "tbz x8, #1, 64f\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "tbz x8, #0, 67f\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 67f\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x8, #1, 66f\n"
+ "ld1 { v24.h }[0], [x22], #0x2\n"
+ "tbz x8, #0, 67f\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 67f\n"
+ "ld1 { v24.b }[0], [x22]\n"
+ "67:" // Oddments: Load (3, 2): Bit 2: End
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ldr x21, [x13, #0xb0]\n"
+ "smlal v11.4s, v24.4h, v5.4h\n"
+ "smlal2 v10.4s, v24.8h, v5.8h\n"
+ "smlal v23.4s, v24.4h, v3.4h\n"
+ "smlal2 v9.4s, v24.8h, v3.8h\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 69f\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 68f\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 71f\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 71f\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x8, #1, 70f\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 71f\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 71f\n"
+ "ld1 { v26.b }[0], [x21]\n"
+ "71:" // Oddments: Load (4, 3): Bit 2: End
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ldr x20, [x13, #0xb8]\n"
+ "smlal v23.4s, v26.4h, v7.4h\n"
+ "smlal2 v9.4s, v26.8h, v7.8h\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 73f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 72f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 75f\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 75f\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x8, #1, 74f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 75f\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 75f\n"
+ "ld1 { v25.b }[0], [x20]\n"
+ "75:" // Oddments: Load (4, 2): Bit 2: End
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr x19, [x13, #0xc0]\n"
+ "smlal v11.4s, v25.4h, v8.4h\n"
+ "smlal2 v10.4s, v25.8h, v8.8h\n"
+ "smlal v23.4s, v25.4h, v6.4h\n"
+ "smlal2 v9.4s, v25.8h, v6.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 77f\n"
+ "ld1 { v29.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 76f\n"
+ "ld1 { v29.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 79f\n"
+ "ld1 { v29.b }[6], [x19]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x8, #0, 79f\n"
+ "ld1 { v29.b }[4], [x19]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x8, #1, 78f\n"
+ "ld1 { v29.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 79f\n"
+ "ld1 { v29.b }[2], [x19]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 79f\n"
+ "ld1 { v29.b }[0], [x19]\n"
+ "79:" // Oddments: Load (4, 4): Bit 2: End
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "smlal2 v9.4s, v29.8h, v8.8h\n"
+ "tbz x8, #2, 81f\n"
+ "ld1 { v22.4s }, [x12], #0x10\n"
+ "ld1 { v31.4s }, [x11], #0x10\n"
+ "tbz x8, #1, 80f\n"
+ "ld1 { v19.d }[0], [x12], #0x8\n"
+ "ld1 { v30.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v19.s }[2], [x12]\n"
+ "ld1 { v30.s }[2], [x11]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 83f\n"
+ "ld1 { v19.s }[0], [x12]\n"
+ "ld1 { v30.s }[0], [x11]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x8, #1, 82f\n"
+ "ld1 { v22.d }[0], [x12], #0x8\n"
+ "ld1 { v31.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v22.s }[2], [x12]\n"
+ "ld1 { v31.s }[2], [x11]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 83f\n"
+ "ld1 { v22.s }[0], [x12]\n"
+ "ld1 { v31.s }[0], [x11]\n"
+ "83:" // Oddments: Load requant params: Bit 2: End
+ "sqdmulh v15.4s, v15.4s, v22.4s\n"
+ "sqdmulh v13.4s, v13.4s, v22.4s\n"
+ "add x10, x10, x14\n"
+ "add x9, x9, x14\n"
+ "sqdmulh v11.4s, v11.4s, v22.4s\n"
+ "sqdmulh v23.4s, v23.4s, v22.4s\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "and v4.16b, v15.16b, v31.16b\n"
+ "sqdmulh v18.4s, v18.4s, v19.4s\n"
+ "and v1.16b, v13.16b, v31.16b\n"
+ "sqdmulh v17.4s, v17.4s, v19.4s\n"
+ "and v22.16b, v11.16b, v31.16b\n"
+ "sqdmulh v10.4s, v10.4s, v19.4s\n"
+ "and v20.16b, v23.16b, v31.16b\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v19.16b, v18.16b, v30.16b\n"
+ "sshr v1.4s, v1.4s, #0x1f\n"
+ "and v27.16b, v17.16b, v30.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v25.16b, v10.16b, v30.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v0.16b, v9.16b, v30.16b\n"
+ "sqadd v15.4s, v15.4s, v4.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v1.4s\n"
+ "sshr v27.4s, v27.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v22.4s\n"
+ "sshr v25.4s, v25.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v20.4s\n"
+ "sshr v0.4s, v0.4s, #0x1f\n"
+ "srshl v15.4s, v15.4s, v31.4s\n"
+ "sqadd v18.4s, v18.4s, v19.4s\n"
+ "srshl v13.4s, v13.4s, v31.4s\n"
+ "sqadd v17.4s, v17.4s, v27.4s\n"
+ "srshl v11.4s, v11.4s, v31.4s\n"
+ "sqadd v10.4s, v10.4s, v25.4s\n"
+ "srshl v23.4s, v23.4s, v31.4s\n"
+ "sqadd v9.4s, v9.4s, v0.4s\n"
+ "srshl v18.4s, v18.4s, v30.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v17.4s, v17.4s, v30.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v10.4s, v10.4s, v30.4s\n"
+ "sqxtn v11.4h, v11.4s\n"
+ "srshl v9.4s, v9.4s, v30.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v15.8h, v18.4s\n"
+ "sqxtn2 v13.8h, v17.4s\n"
+ "sqxtn2 v11.8h, v10.4s\n"
+ "sqxtn2 v23.8h, v9.4s\n"
+ "sqadd v15.8h, v15.8h, v12.8h\n"
+ "sqadd v13.8h, v13.8h, v12.8h\n"
+ "sqadd v11.8h, v11.8h, v12.8h\n"
+ "sqadd v23.8h, v23.8h, v12.8h\n"
+ "smax v15.8h, v15.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v14.8h\n"
+ "smax v11.8h, v11.8h, v14.8h\n"
+ "smax v23.8h, v23.8h, v14.8h\n"
+ "smin v15.8h, v15.8h, v21.8h\n"
+ "smin v13.8h, v13.8h, v21.8h\n"
+ "smin v11.8h, v11.8h, v21.8h\n"
+ "smin v23.8h, v23.8h, v21.8h\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x8, #2, 85f\n"
+ "st1 { v15.s }[0], [x10], #0x4\n"
+ "st1 { v13.s }[0], [x9], #0x4\n"
+ "st1 { v11.s }[0], [x28], #0x4\n"
+ "st1 { v23.s }[0], [x27], #0x4\n"
+ "tbz x8, #1, 84f\n"
+ "st1 { v15.h }[2], [x10], #0x2\n"
+ "st1 { v13.h }[2], [x9], #0x2\n"
+ "st1 { v11.h }[2], [x28], #0x2\n"
+ "st1 { v23.h }[2], [x27], #0x2\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[6], [x10], #0x1\n"
+ "st1 { v13.b }[6], [x9], #0x1\n"
+ "st1 { v11.b }[6], [x28], #0x1\n"
+ "st1 { v23.b }[6], [x27], #0x1\n"
+ "b 87f\n"
+ "84:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[4], [x10], #0x1\n"
+ "st1 { v13.b }[4], [x9], #0x1\n"
+ "st1 { v11.b }[4], [x28], #0x1\n"
+ "st1 { v23.b }[4], [x27], #0x1\n"
+ "b 87f\n"
+ "85:" // Oddments: Bit 2: Unset
+ "tbz x8, #1, 86f\n"
+ "st1 { v15.h }[0], [x10], #0x2\n"
+ "st1 { v13.h }[0], [x9], #0x2\n"
+ "st1 { v11.h }[0], [x28], #0x2\n"
+ "st1 { v23.h }[0], [x27], #0x2\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[2], [x10], #0x1\n"
+ "st1 { v13.b }[2], [x9], #0x1\n"
+ "st1 { v11.b }[2], [x28], #0x1\n"
+ "st1 { v23.b }[2], [x27], #0x1\n"
+ "b 87f\n"
+ "86:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[0], [x10], #0x1\n"
+ "st1 { v13.b }[0], [x9], #0x1\n"
+ "st1 { v11.b }[0], [x28], #0x1\n"
+ "st1 { v23.b }[0], [x27], #0x1\n"
+ "87:" // Oddments: Bit 2: End
+ "88:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
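
The requantisation tail above — sqdmulh against the per-channel multipliers, a sign-dependent rounding nudge (the and/sshr/sqadd triples), srshl by the per-channel shifts, then sqxtn/sqadd/smax/smin/uzp1 to apply the output offset, clamp and narrow — is the standard fixed-point rescale used throughout these quantized kernels. The "Oddments" tbz ladders around it handle the leftover channels by decomposing the remainder into its bits, moving four, two and then one element at a time. A minimal scalar sketch of what one output lane computes, assuming non-positive per-channel shifts (as loaded from requant_shifts) and approximating the nudge-plus-srshl pair with a single round-to-nearest right shift:

#include <algorithm>
#include <cstdint>

// Scalar model of SQDMULH: saturating doubling multiply, high 32 bits.
static int32_t sqdmulh(int32_t a, int32_t b)
{
    if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX; // the only saturating case
    return static_cast<int32_t>((static_cast<int64_t>(a) * b * 2) >> 32);
}

// Rounding arithmetic right shift by -shift; these kernels only pass shift <= 0.
static int32_t rounding_rshift(int32_t x, int32_t shift)
{
    if (shift >= 0) return x; // only shift == 0 occurs here, a no-op
    const int s = -shift;
    return static_cast<int32_t>((static_cast<int64_t>(x) + (int64_t{1} << (s - 1))) >> s);
}

// One output lane: rescale, add the output offset, clamp, narrow to a byte.
static uint8_t requantise(int32_t acc, int32_t mul, int32_t shift,
                          int32_t c_offset, int32_t minval, int32_t maxval)
{
    int32_t v = rounding_rshift(sqdmulh(acc, mul), shift); // sqdmulh + srshl
    v += c_offset;                                         // sqadd of the c_offset splat
    v = std::max(minval, std::min(maxval, v));             // smax/smin activation clamp
    return static_cast<uint8_t>(v);                        // sqxtn/uzp1 narrowing
}

The vector code narrows to 16 bits (sqxtn/sqxtn2) before adding the offset and clamping; for values inside the representable range the 32-bit model above gives the same result.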
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..482d1af80c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+ public:
+ constexpr static unsigned int kernel_rows = 5;
+ constexpr static unsigned int kernel_cols = 5;
+
+ constexpr static unsigned int stride_rows = 1;
+ constexpr static unsigned int stride_cols = 1;
+
+ a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+ Parent::KernelType kernel = a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
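
The tile geometry follows directly from the class constants above: a 2x2 output tile computed with a 5x5 kernel at stride 1 covers an input patch of (2 - 1)*1 + 5 = 6 rows by 6 columns, i.e. 36 input points — which is why the implementation file below carries 36 entries in Params::inptrs. The name encodes the rest: AArch64 (a64), quantized uint8 data (u8qa), NHWC layout, multiply-accumulate (mla) and a depth-first traversal.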
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..7f1fd1d568
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2185 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *const outptrs
+)
+{
+ struct Params
+ {
+ long unsigned int n_channels;
+ const void *weights;
+ const int32_t *bias;
+ const arm_gemm::Requantize32 *requant;
+ const int32_t *const requant_muls;
+ const int32_t *const requant_shifts;
+ uint8_t *const *const outptrs;
+ const uint8_t *inptrs[36];
+
+ Params(
+ long unsigned int n_channels,
+ const uint8_t *const *inptrs_raw,
+ const void *const weights,
+ const int32_t *const bias,
+ const arm_gemm::Requantize32 &qp,
+ const int32_t *const requant_muls,
+ const int32_t *const requant_shifts,
+ uint8_t *const *outptrs
+ ) : n_channels(n_channels), weights(weights), bias(bias),
+ requant(&qp), requant_muls(requant_muls),
+ requant_shifts(requant_shifts), outptrs(outptrs)
+ {
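+      // Note: the first fourteen input pointers are stored in a
+      // kernel-specific order; entries 14..35 are copied through unchanged.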
+ inptrs[0] = inptrs_raw[0];
+ inptrs[1] = inptrs_raw[1];
+ inptrs[2] = inptrs_raw[6];
+ inptrs[3] = inptrs_raw[7];
+ inptrs[4] = inptrs_raw[2];
+ inptrs[5] = inptrs_raw[8];
+ inptrs[6] = inptrs_raw[3];
+ inptrs[7] = inptrs_raw[4];
+ inptrs[8] = inptrs_raw[11];
+ inptrs[9] = inptrs_raw[12];
+ inptrs[10] = inptrs_raw[9];
+ inptrs[11] = inptrs_raw[10];
+ inptrs[12] = inptrs_raw[5];
+ inptrs[13] = inptrs_raw[13];
+ inptrs[14] = inptrs_raw[14];
+ inptrs[15] = inptrs_raw[15];
+ inptrs[16] = inptrs_raw[16];
+ inptrs[17] = inptrs_raw[17];
+ inptrs[18] = inptrs_raw[18];
+ inptrs[19] = inptrs_raw[19];
+ inptrs[20] = inptrs_raw[20];
+ inptrs[21] = inptrs_raw[21];
+ inptrs[22] = inptrs_raw[22];
+ inptrs[23] = inptrs_raw[23];
+ inptrs[24] = inptrs_raw[24];
+ inptrs[25] = inptrs_raw[25];
+ inptrs[26] = inptrs_raw[26];
+ inptrs[27] = inptrs_raw[27];
+ inptrs[28] = inptrs_raw[28];
+ inptrs[29] = inptrs_raw[29];
+ inptrs[30] = inptrs_raw[30];
+ inptrs[31] = inptrs_raw[31];
+ inptrs[32] = inptrs_raw[32];
+ inptrs[33] = inptrs_raw[33];
+ inptrs[34] = inptrs_raw[34];
+ inptrs[35] = inptrs_raw[35];
+ }
+ };
+
+ const Params params(n_channels, inptrs, weights, bias, qp,
+ requant_muls, requant_shifts, outptrs);
+
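+  // The asm block receives a single pointer to 'params' and locates each
+  // field via the offsetof_* immediates in the constraint list at the end
+  // of the block. Channels are processed eight at a time (x3 holds
+  // n_channels >> 3), with a Tail for the final full group and an Oddments
+  // path for the remaining n_channels & 7.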
+ __asm__ __volatile__(
+ "ldr x16, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "add x9, x16, %[offsetof_Requantize32_b_offset]\n"
+ "add x19, x16, %[offsetof_Requantize32_c_offset]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x24, x16, %[offsetof_Requantize32_minval]\n"
+ "add x2, x16, %[offsetof_Requantize32_maxval]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_weights]]\n"
+ "ld1r { v15.16b }, [x9]\n"
+ "ld1r { v16.8h }, [x19]\n"
+ "lsr x3, x4, #0x3\n"
+ "mov x1, #0x0\n"
+ "ld1r { v12.8h }, [x24]\n"
+ "ld1r { v13.8h }, [x2]\n"
+ "mov x2, #0x0\n"
+ "add x0, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x5, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x21, x15, [x10, #0x0]\n"
+ "ldp x17, x16, [x10, #0x10]\n"
+ "cbz x3, 3f\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q11, [x19, #0x0]\n"
+ "subs x3, x3, #0x1\n"
+ "mov v14.16b, v11.16b\n"
+ "ldr q21, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d0, [x8, #0x0]\n"
+ "ldr d1, [x8, #0x8]\n"
+ "ldr d2, [x8, #0x10]\n"
+ "mov v10.16b, v21.16b\n"
+ "mov v9.16b, v11.16b\n"
+ "ldr d3, [x8, #0x18]\n"
+ "ldr d4, [x8, #0x20]\n"
+ "mov v8.16b, v21.16b\n"
+ "mov v7.16b, v11.16b\n"
+ "ldp x28, x27, [x0, #0x0]\n"
+ "ldp x10, x26, [x0, #0x10]\n"
+ "mov v6.16b, v21.16b\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "ldp x24, x23, [x0, #0x20]\n"
+ "ldp x22, x25, [x0, #0x30]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldp x20, x19, [x0, #0x40]\n"
+ "ldr d31, [x28, x1]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr d30, [x27, x1]\n"
+ "ldr d29, [x10, x1]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr d28, [x26, x1]\n"
+ "ldr d27, [x24, x1]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr d23, [x23, x1]\n"
+ "ldr d25, [x22, x1]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ldr d24, [x25, x1]\n"
+ "ldr d26, [x20, x1]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ldr d22, [x19, x1]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "beq 2f\n"
+ "1:" // Loop
+ "smlal v11.4s, v31.4h, v0.4h\n"
+ "smlal2 v21.4s, v31.8h, v0.8h\n"
+ "ldr x19, [x0, #0x50]\n"
+ "ldr d31, [x19, x1]\n"
+ "smlal v14.4s, v30.4h, v0.4h\n"
+ "smlal v9.4s, v29.4h, v0.4h\n"
+ "ldr x20, [x0, #0x58]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v7.4s, v28.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x0, #0x60]\n"
+ "ldr x24, [x0, #0x68]\n"
+ "smlal2 v8.4s, v29.8h, v0.8h\n"
+ "smlal v11.4s, v30.4h, v1.4h\n"
+ "ldr x23, [x0, #0x70]\n"
+ "ldr x26, [x0, #0x78]\n"
+ "smlal2 v21.4s, v30.8h, v1.8h\n"
+ "smlal2 v6.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x20, x1]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v14.4s, v27.4h, v1.4h\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "ldr d0, [x8, #0x28]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v7.4s, v23.4h, v1.4h\n"
+ "smlal2 v10.4s, v27.8h, v1.8h\n"
+ "ldr x7, [x0, #0x80]\n"
+ "ldr x22, [x0, #0x88]\n"
+ "smlal2 v8.4s, v28.8h, v1.8h\n"
+ "smlal v11.4s, v27.4h, v2.4h\n"
+ "ldr x20, [x0, #0x90]\n"
+ "ldr x14, [x0, #0x98]\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "smlal2 v6.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x19, x1]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v14.4s, v25.4h, v2.4h\n"
+ "smlal v9.4s, v23.4h, v2.4h\n"
+ "ldr d1, [x8, #0x30]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v7.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr x19, [x0, #0xa0]\n"
+ "ldr x13, [x0, #0xa8]\n"
+ "smlal2 v8.4s, v23.8h, v2.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "ldr x12, [x0, #0xb0]\n"
+ "ldr x11, [x0, #0xb8]\n"
+ "smlal2 v21.4s, v25.8h, v3.8h\n"
+ "smlal2 v6.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x24, x1]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v14.4s, v24.4h, v3.4h\n"
+ "smlal v9.4s, v31.4h, v3.4h\n"
+ "ldr d2, [x8, #0x38]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v7.4s, v30.4h, v3.4h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr x10, [x0, #0xc0]\n"
+ "ldr x9, [x0, #0xc8]\n"
+ "smlal2 v8.4s, v31.8h, v3.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "ldr x28, [x0, #0xd0]\n"
+ "ldr x27, [x0, #0xd8]\n"
+ "smlal2 v21.4s, v24.8h, v4.8h\n"
+ "smlal2 v6.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x23, x1]\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal v14.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v30.4h, v4.4h\n"
+ "ldr d3, [x8, #0x40]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v7.4s, v26.4h, v4.4h\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x26, x1]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v8.4s, v30.8h, v4.8h\n"
+ "smlal v11.4s, v29.4h, v0.4h\n"
+ "ldr x26, [x0, #0xe0]\n"
+ "ldr x25, [x0, #0xe8]\n"
+ "smlal2 v21.4s, v29.8h, v0.8h\n"
+ "smlal2 v6.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x8, #0x48]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v14.4s, v28.4h, v0.4h\n"
+ "smlal v9.4s, v22.4h, v0.4h\n"
+ "ldr x24, [x0, #0xf0]\n"
+ "ldr q17, [x5, #0x0]\n"
+ "smlal v7.4s, v25.4h, v0.4h\n"
+ "smlal2 v10.4s, v28.8h, v0.8h\n"
+ "ldr q5, [x6, #0x0]\n"
+ "ldr q18, [x5, #0x10]\n"
+ "smlal2 v8.4s, v22.8h, v0.8h\n"
+ "smlal v11.4s, v28.4h, v1.4h\n"
+ "ldr q29, [x6, #0x10]\n"
+ "subs x3, x3, #0x1\n"
+ "smlal2 v21.4s, v28.8h, v1.8h\n"
+ "smlal2 v6.4s, v25.8h, v0.8h\n"
+ "ldr d28, [x22, x1]\n"
+ "ldr d0, [x8, #0x50]\n"
+ "smlal v14.4s, v23.4h, v1.4h\n"
+ "smlal v9.4s, v25.4h, v1.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x23, [x0, #0xf8]\n"
+ "smlal v7.4s, v24.4h, v1.4h\n"
+ "smlal2 v10.4s, v23.8h, v1.8h\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "add x5, x5, #0x20\n"
+ "smlal2 v8.4s, v25.8h, v1.8h\n"
+ "smlal v11.4s, v23.4h, v2.4h\n"
+ "add x6, x6, #0x20\n"
+ "smlal2 v21.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x7, x1]\n"
+ "smlal2 v6.4s, v24.8h, v1.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v14.4s, v31.4h, v2.4h\n"
+ "smlal v9.4s, v24.4h, v2.4h\n"
+ "ldr d1, [x8, #0x58]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v7.4s, v27.4h, v2.4h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "ldr x22, [x0, #0x100]\n"
+ "smlal2 v8.4s, v24.8h, v2.8h\n"
+ "smlal v11.4s, v31.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v3.8h\n"
+ "smlal2 v6.4s, v27.8h, v2.8h\n"
+ "ldr d31, [x20, x1]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v14.4s, v30.4h, v3.4h\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "ldr d2, [x8, #0x60]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal2 v10.4s, v30.8h, v3.8h\n"
+ "ldr x7, [x0, #0x108]\n"
+ "smlal2 v8.4s, v27.8h, v3.8h\n"
+ "smlal v11.4s, v30.4h, v4.4h\n"
+ "smlal2 v21.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x14, x1]\n"
+ "smlal2 v6.4s, v23.8h, v3.8h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v14.4s, v26.4h, v4.4h\n"
+ "smlal v9.4s, v23.4h, v4.4h\n"
+ "ldr d3, [x8, #0x68]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v7.4s, v28.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x19, x1]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "smlal v11.4s, v22.4h, v0.4h\n"
+ "ldr x20, [x0, #0x110]\n"
+ "ldr x19, [x0, #0x118]\n"
+ "smlal2 v21.4s, v22.8h, v0.8h\n"
+ "smlal2 v6.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x8, #0x70]\n"
+ "ldr d22, [x11, x1]\n"
+ "smlal v14.4s, v25.4h, v0.4h\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "smlal2 v8.4s, v31.8h, v0.8h\n"
+ "smlal v11.4s, v25.4h, v1.4h\n"
+ "smlal2 v21.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x13, x1]\n"
+ "smlal2 v6.4s, v30.8h, v0.8h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v14.4s, v24.4h, v1.4h\n"
+ "smlal v9.4s, v30.4h, v1.4h\n"
+ "ldr d0, [x8, #0x78]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v7.4s, v26.4h, v1.4h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "smlal2 v8.4s, v30.8h, v1.8h\n"
+ "smlal v11.4s, v24.4h, v2.4h\n"
+ "smlal2 v21.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x12, x1]\n"
+ "smlal2 v6.4s, v26.8h, v1.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal v14.4s, v27.4h, v2.4h\n"
+ "smlal v9.4s, v26.4h, v2.4h\n"
+ "ldr d1, [x8, #0x80]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v10.4s, v27.8h, v2.8h\n"
+ "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "smlal v11.4s, v27.4h, v3.4h\n"
+ "smlal2 v21.4s, v27.8h, v3.8h\n"
+ "smlal2 v6.4s, v25.8h, v2.8h\n"
+ "ldr d27, [x10, x1]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v14.4s, v23.4h, v3.4h\n"
+ "smlal v9.4s, v25.4h, v3.4h\n"
+ "ldr d2, [x8, #0x88]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v7.4s, v24.4h, v3.4h\n"
+ "smlal2 v10.4s, v23.8h, v3.8h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal v11.4s, v23.4h, v4.4h\n"
+ "smlal2 v21.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x9, x1]\n"
+ "smlal2 v6.4s, v24.8h, v3.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v14.4s, v28.4h, v4.4h\n"
+ "smlal v9.4s, v24.4h, v4.4h\n"
+ "ldr d3, [x8, #0x90]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v7.4s, v22.4h, v4.4h\n"
+ "smlal2 v10.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x26, x1]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal v11.4s, v31.4h, v0.4h\n"
+ "smlal2 v21.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x28, x1]\n"
+ "smlal2 v6.4s, v22.8h, v4.8h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v14.4s, v30.4h, v0.4h\n"
+ "smlal v9.4s, v27.4h, v0.4h\n"
+ "ldr d4, [x8, #0x98]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v7.4s, v23.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "smlal2 v8.4s, v27.8h, v0.8h\n"
+ "smlal v11.4s, v30.4h, v1.4h\n"
+ "smlal2 v21.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x27, x1]\n"
+ "smlal2 v6.4s, v23.8h, v0.8h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v14.4s, v26.4h, v1.4h\n"
+ "smlal v9.4s, v23.4h, v1.4h\n"
+ "ldr d0, [x8, #0xa0]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v7.4s, v31.4h, v1.4h\n"
+ "smlal2 v10.4s, v26.8h, v1.8h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "smlal v11.4s, v26.4h, v2.4h\n"
+ "smlal2 v21.4s, v26.8h, v2.8h\n"
+ "smlal2 v6.4s, v31.8h, v1.8h\n"
+ "ldr d26, [x25, x1]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v14.4s, v25.4h, v2.4h\n"
+ "smlal v9.4s, v31.4h, v2.4h\n"
+ "ldr d1, [x8, #0xa8]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v21.4s, v25.8h, v3.8h\n"
+ "smlal2 v6.4s, v30.8h, v2.8h\n"
+ "ldr d25, [x24, x1]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v14.4s, v24.4h, v3.4h\n"
+ "smlal v9.4s, v30.4h, v3.4h\n"
+ "ldr d2, [x8, #0xb0]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v7.4s, v28.4h, v3.4h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v21.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x23, x1]\n"
+ "smlal2 v6.4s, v28.8h, v3.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal v14.4s, v22.4h, v4.4h\n"
+ "smlal v9.4s, v28.4h, v4.4h\n"
+ "ldr d3, [x8, #0xb8]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v7.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "smlal v11.4s, v27.4h, v0.4h\n"
+ "smlal2 v21.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x22, x1]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v10.4s, v22.8h, v4.8h\n"
+ "smlal2 v6.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x8, #0xc0]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v14.4s, v23.4h, v0.4h\n"
+ "smlal v9.4s, v25.4h, v0.4h\n"
+ "add x8, x8, #0xc8\n"
+ "smlal v7.4s, v24.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x7, x1]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "smlal2 v6.4s, v24.8h, v0.8h\n"
+ "smlal v11.4s, v23.4h, v1.4h\n"
+ "smlal v14.4s, v31.4h, v1.4h\n"
+ "smlal v9.4s, v24.4h, v1.4h\n"
+ "smlal v7.4s, v27.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x20, x1]\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal2 v6.4s, v27.8h, v1.8h\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal v14.4s, v30.4h, v2.4h\n"
+ "smlal v9.4s, v27.4h, v2.4h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x19, x1]\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v10.4s, v30.8h, v2.8h\n"
+ "smlal2 v6.4s, v25.8h, v2.8h\n"
+ "add x1, x1, #0x8\n"
+ "smlal v11.4s, v30.4h, v3.4h\n"
+ "smlal v14.4s, v28.4h, v3.4h\n"
+ "smlal v9.4s, v25.4h, v3.4h\n"
+ "smlal v7.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "smlal2 v10.4s, v28.8h, v3.8h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal2 v6.4s, v24.8h, v3.8h\n"
+ "smlal v11.4s, v28.4h, v4.4h\n"
+ "smlal v14.4s, v26.4h, v4.4h\n"
+ "sqdmulh v11.4s, v11.4s, v17.4s\n"
+ "smlal v9.4s, v24.4h, v4.4h\n"
+ "smlal v7.4s, v27.4h, v4.4h\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "sqdmulh v9.4s, v9.4s, v17.4s\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal2 v6.4s, v27.8h, v4.8h\n"
+ "sqdmulh v7.4s, v7.4s, v17.4s\n"
+ "and v23.16b, v11.16b, v5.16b\n"
+ "sqdmulh v21.4s, v21.4s, v18.4s\n"
+ "and v22.16b, v14.16b, v5.16b\n"
+ "sqdmulh v10.4s, v10.4s, v18.4s\n"
+ "and v17.16b, v9.16b, v5.16b\n"
+ "sqdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v20.16b, v7.16b, v5.16b\n"
+ "sqdmulh v6.4s, v6.4s, v18.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v19.16b, v21.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v10.16b, v29.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v26.16b, v8.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v4.16b, v6.16b, v29.16b\n"
+ "sqadd v11.4s, v11.4s, v23.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v7.4s, v7.4s, v20.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "srshl v11.4s, v11.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "srshl v14.4s, v14.4s, v5.4s\n"
+ "sqadd v10.4s, v10.4s, v18.4s\n"
+ "srshl v9.4s, v9.4s, v5.4s\n"
+ "sqadd v8.4s, v8.4s, v26.4s\n"
+ "srshl v7.4s, v7.4s, v5.4s\n"
+ "sqadd v6.4s, v6.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqxtn v11.4h, v11.4s\n"
+ "srshl v10.4s, v10.4s, v29.4s\n"
+ "sqxtn v14.4h, v14.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v6.4s, v6.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "sqxtn2 v11.8h, v21.4s\n"
+ "sqxtn2 v14.8h, v10.4s\n"
+ "sqxtn2 v9.8h, v8.4s\n"
+ "sqxtn2 v7.8h, v6.4s\n"
+ "sqadd v11.8h, v11.8h, v16.8h\n"
+ "sqadd v14.8h, v14.8h, v16.8h\n"
+ "sqadd v9.8h, v9.8h, v16.8h\n"
+ "sqadd v7.8h, v7.8h, v16.8h\n"
+ "smax v11.8h, v11.8h, v12.8h\n"
+ "smax v14.8h, v14.8h, v12.8h\n"
+ "smax v9.8h, v9.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smin v11.8h, v11.8h, v13.8h\n"
+ "smin v14.8h, v14.8h, v13.8h\n"
+ "smin v9.8h, v9.8h, v13.8h\n"
+ "smin v7.8h, v7.8h, v13.8h\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "uzp1 v14.16b, v14.16b, v14.16b\n"
+ "str d11, [x21, x2]\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d14, [x15, x2]\n"
+ "str d9, [x17, x2]\n"
+ "str d7, [x16, x2]\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q11, [x19, #0x0]\n"
+ "add x2, x2, #0x8\n"
+ "ldr q21, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d0, [x8, #0x0]\n"
+ "ldr d1, [x8, #0x8]\n"
+ "ldr d2, [x8, #0x10]\n"
+ "mov v14.16b, v11.16b\n"
+ "mov v10.16b, v21.16b\n"
+ "ldr d3, [x8, #0x18]\n"
+ "ldr d4, [x8, #0x20]\n"
+ "mov v9.16b, v11.16b\n"
+ "mov v8.16b, v21.16b\n"
+ "ldp x28, x27, [x0, #0x0]\n"
+ "ldp x10, x26, [x0, #0x10]\n"
+ "mov v7.16b, v11.16b\n"
+ "mov v6.16b, v21.16b\n"
+ "ldp x24, x23, [x0, #0x20]\n"
+ "ldp x22, x25, [x0, #0x30]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldp x20, x19, [x0, #0x40]\n"
+ "ldr d31, [x28, x1]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr d30, [x27, x1]\n"
+ "ldr d29, [x10, x1]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr d28, [x26, x1]\n"
+ "ldr d27, [x24, x1]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ldr d23, [x23, x1]\n"
+ "ldr d25, [x22, x1]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d24, [x25, x1]\n"
+ "ldr d26, [x20, x1]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr d22, [x19, x1]\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "bgt 1b\n"
+ "2:" // Tail
+ "smlal v11.4s, v31.4h, v0.4h\n"
+ "smlal2 v21.4s, v31.8h, v0.8h\n"
+ "ldr x19, [x0, #0x50]\n"
+ "ldr d31, [x19, x1]\n"
+ "smlal v14.4s, v30.4h, v0.4h\n"
+ "smlal v9.4s, v29.4h, v0.4h\n"
+ "ldr x20, [x0, #0x58]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v7.4s, v28.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x0, #0x60]\n"
+ "ldr x24, [x0, #0x68]\n"
+ "smlal2 v8.4s, v29.8h, v0.8h\n"
+ "smlal v11.4s, v30.4h, v1.4h\n"
+ "ldr x23, [x0, #0x70]\n"
+ "ldr x26, [x0, #0x78]\n"
+ "smlal2 v21.4s, v30.8h, v1.8h\n"
+ "smlal2 v6.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x20, x1]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v14.4s, v27.4h, v1.4h\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "ldr d0, [x8, #0x28]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v7.4s, v23.4h, v1.4h\n"
+ "smlal2 v10.4s, v27.8h, v1.8h\n"
+ "ldr x7, [x0, #0x80]\n"
+ "ldr x22, [x0, #0x88]\n"
+ "smlal2 v8.4s, v28.8h, v1.8h\n"
+ "smlal v11.4s, v27.4h, v2.4h\n"
+ "ldr x20, [x0, #0x90]\n"
+ "ldr x14, [x0, #0x98]\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "smlal2 v6.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x19, x1]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v14.4s, v25.4h, v2.4h\n"
+ "smlal v9.4s, v23.4h, v2.4h\n"
+ "ldr d1, [x8, #0x30]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v7.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "ldr x19, [x0, #0xa0]\n"
+ "ldr x13, [x0, #0xa8]\n"
+ "smlal2 v8.4s, v23.8h, v2.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "ldr x12, [x0, #0xb0]\n"
+ "ldr x11, [x0, #0xb8]\n"
+ "smlal2 v21.4s, v25.8h, v3.8h\n"
+ "smlal2 v6.4s, v31.8h, v2.8h\n"
+ "ldr d25, [x24, x1]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v14.4s, v24.4h, v3.4h\n"
+ "smlal v9.4s, v31.4h, v3.4h\n"
+ "ldr d2, [x8, #0x38]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v7.4s, v30.4h, v3.4h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "ldr x10, [x0, #0xc0]\n"
+ "ldr x9, [x0, #0xc8]\n"
+ "smlal2 v8.4s, v31.8h, v3.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "ldr x28, [x0, #0xd0]\n"
+ "ldr x27, [x0, #0xd8]\n"
+ "smlal2 v21.4s, v24.8h, v4.8h\n"
+ "smlal2 v6.4s, v30.8h, v3.8h\n"
+ "ldr d24, [x23, x1]\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal v14.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v30.4h, v4.4h\n"
+ "ldr d3, [x8, #0x40]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v7.4s, v26.4h, v4.4h\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x26, x1]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v8.4s, v30.8h, v4.8h\n"
+ "smlal v11.4s, v29.4h, v0.4h\n"
+ "ldr x26, [x0, #0xe0]\n"
+ "ldr x25, [x0, #0xe8]\n"
+ "smlal2 v21.4s, v29.8h, v0.8h\n"
+ "smlal2 v6.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x8, #0x48]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v14.4s, v28.4h, v0.4h\n"
+ "smlal v9.4s, v22.4h, v0.4h\n"
+ "ldr x24, [x0, #0xf0]\n"
+ "ldr x23, [x0, #0xf8]\n"
+ "smlal v7.4s, v25.4h, v0.4h\n"
+ "smlal2 v10.4s, v28.8h, v0.8h\n"
+ "ldr q17, [x5, #0x0]\n"
+ "ldr q5, [x6, #0x0]\n"
+ "smlal2 v8.4s, v22.8h, v0.8h\n"
+ "smlal v11.4s, v28.4h, v1.4h\n"
+ "ldr q18, [x5, #0x10]\n"
+ "ldr q29, [x6, #0x10]\n"
+ "smlal2 v21.4s, v28.8h, v1.8h\n"
+ "smlal2 v6.4s, v25.8h, v0.8h\n"
+ "ldr d28, [x22, x1]\n"
+ "ldr d0, [x8, #0x50]\n"
+ "smlal v14.4s, v23.4h, v1.4h\n"
+ "smlal v9.4s, v25.4h, v1.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ldr x22, [x0, #0x100]\n"
+ "smlal v7.4s, v24.4h, v1.4h\n"
+ "smlal2 v10.4s, v23.8h, v1.8h\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "tst x4, #0x7\n"
+ "smlal2 v8.4s, v25.8h, v1.8h\n"
+ "smlal v11.4s, v23.4h, v2.4h\n"
+ "add x5, x5, #0x20\n"
+ "add x6, x6, #0x20\n"
+ "smlal2 v21.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x7, x1]\n"
+ "smlal2 v6.4s, v24.8h, v1.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v14.4s, v31.4h, v2.4h\n"
+ "smlal v9.4s, v24.4h, v2.4h\n"
+ "ldr d1, [x8, #0x58]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v7.4s, v27.4h, v2.4h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "ldr x7, [x0, #0x108]\n"
+ "smlal2 v8.4s, v24.8h, v2.8h\n"
+ "smlal v11.4s, v31.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v3.8h\n"
+ "smlal2 v6.4s, v27.8h, v2.8h\n"
+ "ldr d31, [x20, x1]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v14.4s, v30.4h, v3.4h\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "ldr d2, [x8, #0x60]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal2 v10.4s, v30.8h, v3.8h\n"
+ "ldr x20, [x0, #0x110]\n"
+ "smlal2 v8.4s, v27.8h, v3.8h\n"
+ "smlal v11.4s, v30.4h, v4.4h\n"
+ "smlal2 v21.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x14, x1]\n"
+ "smlal2 v6.4s, v23.8h, v3.8h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v14.4s, v26.4h, v4.4h\n"
+ "smlal v9.4s, v23.4h, v4.4h\n"
+ "ldr d3, [x8, #0x68]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v7.4s, v28.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x19, x1]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "smlal v11.4s, v22.4h, v0.4h\n"
+ "ldr x19, [x0, #0x118]\n"
+ "smlal2 v21.4s, v22.8h, v0.8h\n"
+ "smlal2 v6.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x8, #0x70]\n"
+ "ldr d22, [x11, x1]\n"
+ "smlal v14.4s, v25.4h, v0.4h\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "smlal2 v8.4s, v31.8h, v0.8h\n"
+ "smlal v11.4s, v25.4h, v1.4h\n"
+ "smlal2 v21.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x13, x1]\n"
+ "smlal2 v6.4s, v30.8h, v0.8h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v14.4s, v24.4h, v1.4h\n"
+ "smlal v9.4s, v30.4h, v1.4h\n"
+ "ldr d0, [x8, #0x78]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v7.4s, v26.4h, v1.4h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "smlal2 v8.4s, v30.8h, v1.8h\n"
+ "smlal v11.4s, v24.4h, v2.4h\n"
+ "smlal2 v21.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x12, x1]\n"
+ "smlal2 v6.4s, v26.8h, v1.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal v14.4s, v27.4h, v2.4h\n"
+ "smlal v9.4s, v26.4h, v2.4h\n"
+ "ldr d1, [x8, #0x80]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v10.4s, v27.8h, v2.8h\n"
+ "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "smlal v11.4s, v27.4h, v3.4h\n"
+ "smlal2 v21.4s, v27.8h, v3.8h\n"
+ "smlal2 v6.4s, v25.8h, v2.8h\n"
+ "ldr d27, [x10, x1]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v14.4s, v23.4h, v3.4h\n"
+ "smlal v9.4s, v25.4h, v3.4h\n"
+ "ldr d2, [x8, #0x88]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v7.4s, v24.4h, v3.4h\n"
+ "smlal2 v10.4s, v23.8h, v3.8h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal v11.4s, v23.4h, v4.4h\n"
+ "smlal2 v21.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x9, x1]\n"
+ "smlal2 v6.4s, v24.8h, v3.8h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal v14.4s, v28.4h, v4.4h\n"
+ "smlal v9.4s, v24.4h, v4.4h\n"
+ "ldr d3, [x8, #0x90]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v7.4s, v22.4h, v4.4h\n"
+ "smlal2 v10.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x26, x1]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal v11.4s, v31.4h, v0.4h\n"
+ "smlal2 v21.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x28, x1]\n"
+ "smlal2 v6.4s, v22.8h, v4.8h\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "smlal v14.4s, v30.4h, v0.4h\n"
+ "smlal v9.4s, v27.4h, v0.4h\n"
+ "ldr d4, [x8, #0x98]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v7.4s, v23.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "smlal2 v8.4s, v27.8h, v0.8h\n"
+ "smlal v11.4s, v30.4h, v1.4h\n"
+ "smlal2 v21.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x27, x1]\n"
+ "smlal2 v6.4s, v23.8h, v0.8h\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v14.4s, v26.4h, v1.4h\n"
+ "smlal v9.4s, v23.4h, v1.4h\n"
+ "ldr d0, [x8, #0xa0]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "smlal v7.4s, v31.4h, v1.4h\n"
+ "smlal2 v10.4s, v26.8h, v1.8h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "smlal v11.4s, v26.4h, v2.4h\n"
+ "smlal2 v21.4s, v26.8h, v2.8h\n"
+ "smlal2 v6.4s, v31.8h, v1.8h\n"
+ "ldr d26, [x25, x1]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v14.4s, v25.4h, v2.4h\n"
+ "smlal v9.4s, v31.4h, v2.4h\n"
+ "ldr d1, [x8, #0xa8]\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v21.4s, v25.8h, v3.8h\n"
+ "smlal2 v6.4s, v30.8h, v2.8h\n"
+ "ldr d25, [x24, x1]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal v14.4s, v24.4h, v3.4h\n"
+ "smlal v9.4s, v30.4h, v3.4h\n"
+ "ldr d2, [x8, #0xb0]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "smlal v7.4s, v28.4h, v3.4h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v21.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x23, x1]\n"
+ "smlal2 v6.4s, v28.8h, v3.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal v14.4s, v22.4h, v4.4h\n"
+ "smlal v9.4s, v28.4h, v4.4h\n"
+ "ldr d3, [x8, #0xb8]\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "smlal v7.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "smlal v11.4s, v27.4h, v0.4h\n"
+ "smlal2 v21.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x22, x1]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v10.4s, v22.8h, v4.8h\n"
+ "smlal2 v6.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x8, #0xc0]\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "smlal v14.4s, v23.4h, v0.4h\n"
+ "smlal v9.4s, v25.4h, v0.4h\n"
+ "smlal v7.4s, v24.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x7, x1]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "smlal2 v6.4s, v24.8h, v0.8h\n"
+ "smlal v11.4s, v23.4h, v1.4h\n"
+ "smlal v14.4s, v31.4h, v1.4h\n"
+ "smlal v9.4s, v24.4h, v1.4h\n"
+ "smlal v7.4s, v27.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x20, x1]\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal2 v6.4s, v27.8h, v1.8h\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal v14.4s, v30.4h, v2.4h\n"
+ "smlal v9.4s, v27.4h, v2.4h\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x19, x1]\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal2 v10.4s, v30.8h, v2.8h\n"
+ "smlal2 v6.4s, v25.8h, v2.8h\n"
+ "add x1, x1, #0x8\n"
+ "smlal v11.4s, v30.4h, v3.4h\n"
+ "smlal v14.4s, v28.4h, v3.4h\n"
+ "smlal v9.4s, v25.4h, v3.4h\n"
+ "smlal v7.4s, v24.4h, v3.4h\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "smlal2 v10.4s, v28.8h, v3.8h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "smlal2 v6.4s, v24.8h, v3.8h\n"
+ "smlal v11.4s, v28.4h, v4.4h\n"
+ "smlal v14.4s, v26.4h, v4.4h\n"
+ "sqdmulh v11.4s, v11.4s, v17.4s\n"
+ "smlal v9.4s, v24.4h, v4.4h\n"
+ "smlal v7.4s, v27.4h, v4.4h\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "sqdmulh v9.4s, v9.4s, v17.4s\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "smlal2 v6.4s, v27.8h, v4.8h\n"
+ "sqdmulh v7.4s, v7.4s, v17.4s\n"
+ "and v23.16b, v11.16b, v5.16b\n"
+ "sqdmulh v21.4s, v21.4s, v18.4s\n"
+ "and v22.16b, v14.16b, v5.16b\n"
+ "sqdmulh v10.4s, v10.4s, v18.4s\n"
+ "and v17.16b, v9.16b, v5.16b\n"
+ "sqdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v20.16b, v7.16b, v5.16b\n"
+ "sqdmulh v6.4s, v6.4s, v18.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v19.16b, v21.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v10.16b, v29.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v26.16b, v8.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v4.16b, v6.16b, v29.16b\n"
+ "sqadd v11.4s, v11.4s, v23.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v7.4s, v7.4s, v20.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "srshl v11.4s, v11.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "srshl v14.4s, v14.4s, v5.4s\n"
+ "sqadd v10.4s, v10.4s, v18.4s\n"
+ "srshl v9.4s, v9.4s, v5.4s\n"
+ "sqadd v8.4s, v8.4s, v26.4s\n"
+ "srshl v7.4s, v7.4s, v5.4s\n"
+ "sqadd v6.4s, v6.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqxtn v11.4h, v11.4s\n"
+ "srshl v10.4s, v10.4s, v29.4s\n"
+ "sqxtn v14.4h, v14.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v6.4s, v6.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "sqxtn2 v11.8h, v21.4s\n"
+ "sqxtn2 v14.8h, v10.4s\n"
+ "sqxtn2 v9.8h, v8.4s\n"
+ "sqxtn2 v7.8h, v6.4s\n"
+ "sqadd v11.8h, v11.8h, v16.8h\n"
+ "sqadd v14.8h, v14.8h, v16.8h\n"
+ "sqadd v9.8h, v9.8h, v16.8h\n"
+ "sqadd v7.8h, v7.8h, v16.8h\n"
+ "smax v11.8h, v11.8h, v12.8h\n"
+ "smax v14.8h, v14.8h, v12.8h\n"
+ "smax v9.8h, v9.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smin v11.8h, v11.8h, v13.8h\n"
+ "smin v14.8h, v14.8h, v13.8h\n"
+ "smin v9.8h, v9.8h, v13.8h\n"
+ "smin v7.8h, v7.8h, v13.8h\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "uzp1 v14.16b, v14.16b, v14.16b\n"
+ "str d11, [x21, x2]\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "str d14, [x15, x2]\n"
+ "str d9, [x17, x2]\n"
+ "str d7, [x16, x2]\n"
+ "add x2, x2, #0x8\n"
+ "beq 124f\n"
+ "add x8, x8, #0xc8\n"
+ "3:" // Oddments
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x4, #2, 5f\n"
+ "ld1 { v11.4s }, [x19], #0x10\n"
+ "tbz x4, #1, 4f\n"
+ "ld1 { v21.d }[0], [x19], #0x8\n"
+ "tbz x4, #0, 7f\n"
+ "ld1 { v21.s }[2], [x19]\n"
+ "b 7f\n"
+ "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 7f\n"
+ "ld1 { v21.s }[0], [x19]\n"
+ "b 7f\n"
+ "5:" // Oddments: Load bias: Bit 2: Unset
+ "tbz x4, #1, 6f\n"
+ "ld1 { v11.d }[0], [x19], #0x8\n"
+ "tbz x4, #0, 7f\n"
+ "ld1 { v11.s }[2], [x19]\n"
+ "b 7f\n"
+ "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 7f\n"
+ "ld1 { v11.s }[0], [x19]\n"
+ "7:" // Oddments: Load bias: Bit 2: End
+ "ldr d0, [x8, #0x0]\n"
+ "ldr d1, [x8, #0x8]\n"
+ "mov v14.16b, v11.16b\n"
+ "mov v10.16b, v21.16b\n"
+ "ldr d2, [x8, #0x10]\n"
+ "ldr d3, [x8, #0x18]\n"
+ "mov v9.16b, v11.16b\n"
+ "mov v8.16b, v21.16b\n"
+ "ldr d4, [x8, #0x20]\n"
+ "ldp x28, x27, [x0, #0x0]\n"
+ "mov v7.16b, v11.16b\n"
+ "mov v6.16b, v21.16b\n"
+ "ldp x10, x26, [x0, #0x10]\n"
+ "ldp x24, x23, [x0, #0x20]\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldp x22, x25, [x0, #0x30]\n"
+ "ldp x20, x19, [x0, #0x40]\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "add x28, x28, x1\n"
+ "add x27, x27, x1\n"
+ "add x10, x10, x1\n"
+ "add x26, x26, x1\n"
+ "add x24, x24, x1\n"
+ "add x23, x23, x1\n"
+ "add x22, x22, x1\n"
+ "add x25, x25, x1\n"
+ "add x20, x20, x1\n"
+ "add x19, x19, x1\n"
+ "tbz x4, #2, 9f\n"
+ "ld1 { v31.s }[0], [x28], #0x4\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v29.s }[0], [x10], #0x4\n"
+ "ld1 { v28.s }[0], [x26], #0x4\n"
+ "ld1 { v27.s }[0], [x24], #0x4\n"
+ "ld1 { v23.s }[0], [x23], #0x4\n"
+ "ld1 { v25.s }[0], [x22], #0x4\n"
+ "ld1 { v24.s }[0], [x25], #0x4\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "ld1 { v22.s }[0], [x19], #0x4\n"
+ "tbz x4, #1, 8f\n"
+ "ld1 { v31.h }[2], [x28], #0x2\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v29.h }[2], [x10], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v25.h }[2], [x22], #0x2\n"
+ "ld1 { v24.h }[2], [x25], #0x2\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "ld1 { v22.h }[2], [x19], #0x2\n"
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[6], [x28]\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v29.b }[6], [x10]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v25.b }[6], [x22]\n"
+ "ld1 { v24.b }[6], [x25]\n"
+ "ld1 { v26.b }[6], [x20]\n"
+ "ld1 { v22.b }[6], [x19]\n"
+ "b 11f\n"
+ "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[4], [x28]\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v29.b }[4], [x10]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v25.b }[4], [x22]\n"
+ "ld1 { v24.b }[4], [x25]\n"
+ "ld1 { v26.b }[4], [x20]\n"
+ "ld1 { v22.b }[4], [x19]\n"
+ "b 11f\n"
+ "9:" // Oddments: Initial loads: Bit 2: Unset
+ "tbz x4, #1, 10f\n"
+ "ld1 { v31.h }[0], [x28], #0x2\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v29.h }[0], [x10], #0x2\n"
+ "ld1 { v28.h }[0], [x26], #0x2\n"
+ "ld1 { v27.h }[0], [x24], #0x2\n"
+ "ld1 { v23.h }[0], [x23], #0x2\n"
+ "ld1 { v25.h }[0], [x22], #0x2\n"
+ "ld1 { v24.h }[0], [x25], #0x2\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "ld1 { v22.h }[0], [x19], #0x2\n"
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[2], [x28]\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v29.b }[2], [x10]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v25.b }[2], [x22]\n"
+ "ld1 { v24.b }[2], [x25]\n"
+ "ld1 { v26.b }[2], [x20]\n"
+ "ld1 { v22.b }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 11f\n"
+ "ld1 { v31.b }[0], [x28]\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v29.b }[0], [x10]\n"
+ "ld1 { v28.b }[0], [x26]\n"
+ "ld1 { v27.b }[0], [x24]\n"
+ "ld1 { v23.b }[0], [x23]\n"
+ "ld1 { v25.b }[0], [x22]\n"
+ "ld1 { v24.b }[0], [x25]\n"
+ "ld1 { v26.b }[0], [x20]\n"
+ "ld1 { v22.b }[0], [x19]\n"
+ "11:" // Oddments: Initial loads: Bit 2: End
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "smlal v11.4s, v31.4h, v0.4h\n"
+ "ldr x19, [x0, #0x50]\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "smlal2 v21.4s, v31.8h, v0.8h\n"
+ "smlal v14.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "smlal v9.4s, v29.4h, v0.4h\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "add x19, x19, x1\n"
+ "smlal2 v8.4s, v29.8h, v0.8h\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v7.4s, v28.4h, v0.4h\n"
+ "smlal2 v6.4s, v28.8h, v0.8h\n"
+ "smlal v11.4s, v30.4h, v1.4h\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "smlal2 v21.4s, v30.8h, v1.8h\n"
+ "smlal v14.4s, v27.4h, v1.4h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "smlal2 v10.4s, v27.8h, v1.8h\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "smlal2 v8.4s, v28.8h, v1.8h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "smlal v7.4s, v23.4h, v1.4h\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "smlal2 v6.4s, v23.8h, v1.8h\n"
+ "smlal v11.4s, v27.4h, v2.4h\n"
+ "smlal2 v21.4s, v27.8h, v2.8h\n"
+ "smlal v14.4s, v25.4h, v2.4h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "smlal v9.4s, v23.4h, v2.4h\n"
+ "smlal2 v8.4s, v23.8h, v2.8h\n"
+ "tbz x4, #2, 13f\n"
+ "ld1 { v31.s }[0], [x19], #0x4\n"
+ "tbz x4, #1, 12f\n"
+ "ld1 { v31.h }[2], [x19], #0x2\n"
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[6], [x19]\n"
+ "b 15f\n"
+ "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[4], [x19]\n"
+ "b 15f\n"
+ "13:" // Oddments: Load (1, 3): Bit 2: Unset
+ "tbz x4, #1, 14f\n"
+ "ld1 { v31.h }[0], [x19], #0x2\n"
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[2], [x19]\n"
+ "b 15f\n"
+ "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 15f\n"
+ "ld1 { v31.b }[0], [x19]\n"
+ "15:" // Oddments: Load (1, 3): Bit 2: End
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr x20, [x0, #0x58]\n"
+ "smlal v7.4s, v31.4h, v2.4h\n"
+ "smlal2 v6.4s, v31.8h, v2.8h\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v21.4s, v25.8h, v3.8h\n"
+ "add x20, x20, x1\n"
+ "smlal v14.4s, v24.4h, v3.4h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "smlal v9.4s, v31.4h, v3.4h\n"
+ "smlal2 v8.4s, v31.8h, v3.8h\n"
+ "tbz x4, #2, 17f\n"
+ "ld1 { v30.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 16f\n"
+ "ld1 { v30.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[6], [x20]\n"
+ "b 19f\n"
+ "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[4], [x20]\n"
+ "b 19f\n"
+ "17:" // Oddments: Load (1, 4): Bit 2: Unset
+ "tbz x4, #1, 18f\n"
+ "ld1 { v30.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[2], [x20]\n"
+ "b 19f\n"
+ "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 19f\n"
+ "ld1 { v30.b }[0], [x20]\n"
+ "19:" // Oddments: Load (1, 4): Bit 2: End
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ldr x19, [x0, #0x60]\n"
+ "smlal v7.4s, v30.4h, v3.4h\n"
+ "smlal2 v6.4s, v30.8h, v3.8h\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v21.4s, v24.8h, v4.8h\n"
+ "add x19, x19, x1\n"
+ "tbz x4, #2, 21f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x4, #1, 20f\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[6], [x19]\n"
+ "b 23f\n"
+ "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[4], [x19]\n"
+ "b 23f\n"
+ "21:" // Oddments: Load (0, 5): Bit 2: Unset
+ "tbz x4, #1, 22f\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[2], [x19]\n"
+ "b 23f\n"
+ "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 23f\n"
+ "ld1 { v27.b }[0], [x19]\n"
+ "23:" // Oddments: Load (0, 5): Bit 2: End
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr d0, [x8, #0x28]\n"
+ "smlal v14.4s, v27.4h, v4.4h\n"
+ "smlal2 v10.4s, v27.8h, v4.8h\n"
+ "smlal v9.4s, v30.4h, v4.4h\n"
+ "smlal2 v8.4s, v30.8h, v4.8h\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x24, [x0, #0x68]\n"
+ "smlal v7.4s, v26.4h, v4.4h\n"
+ "smlal2 v6.4s, v26.8h, v4.8h\n"
+ "add x24, x24, x1\n"
+ "smlal v11.4s, v29.4h, v0.4h\n"
+ "smlal2 v21.4s, v29.8h, v0.8h\n"
+ "smlal v14.4s, v28.4h, v0.4h\n"
+ "smlal2 v10.4s, v28.8h, v0.8h\n"
+ "smlal v9.4s, v22.4h, v0.4h\n"
+ "smlal2 v8.4s, v22.8h, v0.8h\n"
+ "tbz x4, #2, 25f\n"
+ "ld1 { v25.s }[0], [x24], #0x4\n"
+ "tbz x4, #1, 24f\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "b 27f\n"
+ "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "b 27f\n"
+ "25:" // Oddments: Load (2, 1): Bit 2: Unset
+ "tbz x4, #1, 26f\n"
+ "ld1 { v25.h }[0], [x24], #0x2\n"
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "b 27f\n"
+ "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 27f\n"
+ "ld1 { v25.b }[0], [x24]\n"
+ "27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d1, [x8, #0x30]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x23, [x0, #0x70]\n"
+ "smlal v7.4s, v25.4h, v0.4h\n"
+ "smlal2 v6.4s, v25.8h, v0.8h\n"
+ "add x23, x23, x1\n"
+ "smlal v11.4s, v28.4h, v1.4h\n"
+ "smlal2 v21.4s, v28.8h, v1.8h\n"
+ "smlal v14.4s, v23.4h, v1.4h\n"
+ "smlal2 v10.4s, v23.8h, v1.8h\n"
+ "smlal v9.4s, v25.4h, v1.4h\n"
+ "smlal2 v8.4s, v25.8h, v1.8h\n"
+ "tbz x4, #2, 29f\n"
+ "ld1 { v24.s }[0], [x23], #0x4\n"
+ "tbz x4, #1, 28f\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[6], [x23]\n"
+ "b 31f\n"
+ "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[4], [x23]\n"
+ "b 31f\n"
+ "29:" // Oddments: Load (2, 2): Bit 2: Unset
+ "tbz x4, #1, 30f\n"
+ "ld1 { v24.h }[0], [x23], #0x2\n"
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[2], [x23]\n"
+ "b 31f\n"
+ "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 31f\n"
+ "ld1 { v24.b }[0], [x23]\n"
+ "31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d2, [x8, #0x38]\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x26, [x0, #0x78]\n"
+ "smlal v7.4s, v24.4h, v1.4h\n"
+ "smlal2 v6.4s, v24.8h, v1.8h\n"
+ "add x26, x26, x1\n"
+ "smlal v11.4s, v23.4h, v2.4h\n"
+ "smlal2 v21.4s, v23.8h, v2.8h\n"
+ "smlal v14.4s, v31.4h, v2.4h\n"
+ "smlal2 v10.4s, v31.8h, v2.8h\n"
+ "smlal v9.4s, v24.4h, v2.4h\n"
+ "smlal2 v8.4s, v24.8h, v2.8h\n"
+ "tbz x4, #2, 33f\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "tbz x4, #1, 32f\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "b 35f\n"
+ "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "b 35f\n"
+ "33:" // Oddments: Load (2, 3): Bit 2: Unset
+ "tbz x4, #1, 34f\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "b 35f\n"
+ "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 35f\n"
+ "ld1 { v27.b }[0], [x26]\n"
+ "35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d3, [x8, #0x40]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x7, [x0, #0x80]\n"
+ "smlal v7.4s, v27.4h, v2.4h\n"
+ "smlal2 v6.4s, v27.8h, v2.8h\n"
+ "add x7, x7, x1\n"
+ "smlal v11.4s, v31.4h, v3.4h\n"
+ "smlal2 v21.4s, v31.8h, v3.8h\n"
+ "smlal v14.4s, v30.4h, v3.4h\n"
+ "smlal2 v10.4s, v30.8h, v3.8h\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v8.4s, v27.8h, v3.8h\n"
+ "tbz x4, #2, 37f\n"
+ "ld1 { v23.s }[0], [x7], #0x4\n"
+ "tbz x4, #1, 36f\n"
+ "ld1 { v23.h }[2], [x7], #0x2\n"
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[6], [x7]\n"
+ "b 39f\n"
+ "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[4], [x7]\n"
+ "b 39f\n"
+ "37:" // Oddments: Load (2, 4): Bit 2: Unset
+ "tbz x4, #1, 38f\n"
+ "ld1 { v23.h }[0], [x7], #0x2\n"
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[2], [x7]\n"
+ "b 39f\n"
+ "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 39f\n"
+ "ld1 { v23.b }[0], [x7]\n"
+ "39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d4, [x8, #0x48]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x22, [x0, #0x88]\n"
+ "smlal v7.4s, v23.4h, v3.4h\n"
+ "smlal2 v6.4s, v23.8h, v3.8h\n"
+ "add x22, x22, x1\n"
+ "smlal v11.4s, v30.4h, v4.4h\n"
+ "smlal2 v21.4s, v30.8h, v4.8h\n"
+ "smlal v14.4s, v26.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "smlal v9.4s, v23.4h, v4.4h\n"
+ "smlal2 v8.4s, v23.8h, v4.8h\n"
+ "tbz x4, #2, 41f\n"
+ "ld1 { v28.s }[0], [x22], #0x4\n"
+ "tbz x4, #1, 40f\n"
+ "ld1 { v28.h }[2], [x22], #0x2\n"
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[6], [x22]\n"
+ "b 43f\n"
+ "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[4], [x22]\n"
+ "b 43f\n"
+ "41:" // Oddments: Load (2, 5): Bit 2: Unset
+ "tbz x4, #1, 42f\n"
+ "ld1 { v28.h }[0], [x22], #0x2\n"
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[2], [x22]\n"
+ "b 43f\n"
+ "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 43f\n"
+ "ld1 { v28.b }[0], [x22]\n"
+ "43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d0, [x8, #0x50]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x20, [x0, #0x90]\n"
+ "smlal v7.4s, v28.4h, v4.4h\n"
+ "smlal2 v6.4s, v28.8h, v4.8h\n"
+ "add x20, x20, x1\n"
+ "smlal v11.4s, v22.4h, v0.4h\n"
+ "smlal2 v21.4s, v22.8h, v0.8h\n"
+ "smlal v14.4s, v25.4h, v0.4h\n"
+ "smlal2 v10.4s, v25.8h, v0.8h\n"
+ "tbz x4, #2, 45f\n"
+ "ld1 { v31.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 44f\n"
+ "ld1 { v31.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[6], [x20]\n"
+ "b 47f\n"
+ "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[4], [x20]\n"
+ "b 47f\n"
+ "45:" // Oddments: Load (3, 0): Bit 2: Unset
+ "tbz x4, #1, 46f\n"
+ "ld1 { v31.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[2], [x20]\n"
+ "b 47f\n"
+ "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 47f\n"
+ "ld1 { v31.b }[0], [x20]\n"
+ "47:" // Oddments: Load (3, 0): Bit 2: End
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "ldr x14, [x0, #0x98]\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "smlal2 v8.4s, v31.8h, v0.8h\n"
+ "add x14, x14, x1\n"
+ "tbz x4, #2, 49f\n"
+ "ld1 { v30.s }[0], [x14], #0x4\n"
+ "tbz x4, #1, 48f\n"
+ "ld1 { v30.h }[2], [x14], #0x2\n"
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[6], [x14]\n"
+ "b 51f\n"
+ "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[4], [x14]\n"
+ "b 51f\n"
+ "49:" // Oddments: Load (3, 1): Bit 2: Unset
+ "tbz x4, #1, 50f\n"
+ "ld1 { v30.h }[0], [x14], #0x2\n"
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[2], [x14]\n"
+ "b 51f\n"
+ "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 51f\n"
+ "ld1 { v30.b }[0], [x14]\n"
+ "51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d1, [x8, #0x58]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x19, [x0, #0xa0]\n"
+ "smlal v7.4s, v30.4h, v0.4h\n"
+ "smlal2 v6.4s, v30.8h, v0.8h\n"
+ "add x19, x19, x1\n"
+ "smlal v11.4s, v25.4h, v1.4h\n"
+ "smlal2 v21.4s, v25.8h, v1.8h\n"
+ "smlal v14.4s, v24.4h, v1.4h\n"
+ "smlal2 v10.4s, v24.8h, v1.8h\n"
+ "smlal v9.4s, v30.4h, v1.4h\n"
+ "smlal2 v8.4s, v30.8h, v1.8h\n"
+ "tbz x4, #2, 53f\n"
+ "ld1 { v26.s }[0], [x19], #0x4\n"
+ "tbz x4, #1, 52f\n"
+ "ld1 { v26.h }[2], [x19], #0x2\n"
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[6], [x19]\n"
+ "b 55f\n"
+ "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[4], [x19]\n"
+ "b 55f\n"
+ "53:" // Oddments: Load (3, 2): Bit 2: Unset
+ "tbz x4, #1, 54f\n"
+ "ld1 { v26.h }[0], [x19], #0x2\n"
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[2], [x19]\n"
+ "b 55f\n"
+ "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 55f\n"
+ "ld1 { v26.b }[0], [x19]\n"
+ "55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d2, [x8, #0x60]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x13, [x0, #0xa8]\n"
+ "smlal v7.4s, v26.4h, v1.4h\n"
+ "smlal2 v6.4s, v26.8h, v1.8h\n"
+ "add x13, x13, x1\n"
+ "smlal v11.4s, v24.4h, v2.4h\n"
+ "smlal2 v21.4s, v24.8h, v2.8h\n"
+ "smlal v14.4s, v27.4h, v2.4h\n"
+ "smlal2 v10.4s, v27.8h, v2.8h\n"
+ "smlal v9.4s, v26.4h, v2.4h\n"
+ "smlal2 v8.4s, v26.8h, v2.8h\n"
+ "tbz x4, #2, 57f\n"
+ "ld1 { v25.s }[0], [x13], #0x4\n"
+ "tbz x4, #1, 56f\n"
+ "ld1 { v25.h }[2], [x13], #0x2\n"
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[6], [x13]\n"
+ "b 59f\n"
+ "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[4], [x13]\n"
+ "b 59f\n"
+ "57:" // Oddments: Load (3, 3): Bit 2: Unset
+ "tbz x4, #1, 58f\n"
+ "ld1 { v25.h }[0], [x13], #0x2\n"
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[2], [x13]\n"
+ "b 59f\n"
+ "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 59f\n"
+ "ld1 { v25.b }[0], [x13]\n"
+ "59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d3, [x8, #0x68]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x12, [x0, #0xb0]\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v6.4s, v25.8h, v2.8h\n"
+ "add x12, x12, x1\n"
+ "smlal v11.4s, v27.4h, v3.4h\n"
+ "smlal2 v21.4s, v27.8h, v3.8h\n"
+ "smlal v14.4s, v23.4h, v3.4h\n"
+ "smlal2 v10.4s, v23.8h, v3.8h\n"
+ "smlal v9.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "tbz x4, #2, 61f\n"
+ "ld1 { v24.s }[0], [x12], #0x4\n"
+ "tbz x4, #1, 60f\n"
+ "ld1 { v24.h }[2], [x12], #0x2\n"
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[6], [x12]\n"
+ "b 63f\n"
+ "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[4], [x12]\n"
+ "b 63f\n"
+ "61:" // Oddments: Load (3, 4): Bit 2: Unset
+ "tbz x4, #1, 62f\n"
+ "ld1 { v24.h }[0], [x12], #0x2\n"
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[2], [x12]\n"
+ "b 63f\n"
+ "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 63f\n"
+ "ld1 { v24.b }[0], [x12]\n"
+ "63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d4, [x8, #0x70]\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x11, [x0, #0xb8]\n"
+ "smlal v7.4s, v24.4h, v3.4h\n"
+ "smlal2 v6.4s, v24.8h, v3.8h\n"
+ "add x11, x11, x1\n"
+ "smlal v11.4s, v23.4h, v4.4h\n"
+ "smlal2 v21.4s, v23.8h, v4.8h\n"
+ "smlal v14.4s, v28.4h, v4.4h\n"
+ "smlal2 v10.4s, v28.8h, v4.8h\n"
+ "smlal v9.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "tbz x4, #2, 65f\n"
+ "ld1 { v22.s }[0], [x11], #0x4\n"
+ "tbz x4, #1, 64f\n"
+ "ld1 { v22.h }[2], [x11], #0x2\n"
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[6], [x11]\n"
+ "b 67f\n"
+ "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[4], [x11]\n"
+ "b 67f\n"
+ "65:" // Oddments: Load (3, 5): Bit 2: Unset
+ "tbz x4, #1, 66f\n"
+ "ld1 { v22.h }[0], [x11], #0x2\n"
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[2], [x11]\n"
+ "b 67f\n"
+ "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 67f\n"
+ "ld1 { v22.b }[0], [x11]\n"
+ "67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d0, [x8, #0x78]\n"
+ "ushll v22.8h, v22.8b, #0x0\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x10, [x0, #0xc0]\n"
+ "smlal v7.4s, v22.4h, v4.4h\n"
+ "smlal2 v6.4s, v22.8h, v4.8h\n"
+ "add x10, x10, x1\n"
+ "smlal v11.4s, v31.4h, v0.4h\n"
+ "smlal2 v21.4s, v31.8h, v0.8h\n"
+ "smlal v14.4s, v30.4h, v0.4h\n"
+ "smlal2 v10.4s, v30.8h, v0.8h\n"
+ "tbz x4, #2, 69f\n"
+ "ld1 { v27.s }[0], [x10], #0x4\n"
+ "tbz x4, #1, 68f\n"
+ "ld1 { v27.h }[2], [x10], #0x2\n"
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[6], [x10]\n"
+ "b 71f\n"
+ "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[4], [x10]\n"
+ "b 71f\n"
+ "69:" // Oddments: Load (4, 0): Bit 2: Unset
+ "tbz x4, #1, 70f\n"
+ "ld1 { v27.h }[0], [x10], #0x2\n"
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[2], [x10]\n"
+ "b 71f\n"
+ "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 71f\n"
+ "ld1 { v27.b }[0], [x10]\n"
+ "71:" // Oddments: Load (4, 0): Bit 2: End
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ldr x9, [x0, #0xc8]\n"
+ "smlal v9.4s, v27.4h, v0.4h\n"
+ "smlal2 v8.4s, v27.8h, v0.8h\n"
+ "add x9, x9, x1\n"
+ "tbz x4, #2, 73f\n"
+ "ld1 { v23.s }[0], [x9], #0x4\n"
+ "tbz x4, #1, 72f\n"
+ "ld1 { v23.h }[2], [x9], #0x2\n"
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[6], [x9]\n"
+ "b 75f\n"
+ "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[4], [x9]\n"
+ "b 75f\n"
+ "73:" // Oddments: Load (4, 1): Bit 2: Unset
+ "tbz x4, #1, 74f\n"
+ "ld1 { v23.h }[0], [x9], #0x2\n"
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[2], [x9]\n"
+ "b 75f\n"
+ "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 75f\n"
+ "ld1 { v23.b }[0], [x9]\n"
+ "75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d1, [x8, #0x80]\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x28, [x0, #0xd0]\n"
+ "smlal v7.4s, v23.4h, v0.4h\n"
+ "smlal2 v6.4s, v23.8h, v0.8h\n"
+ "add x28, x28, x1\n"
+ "smlal v11.4s, v30.4h, v1.4h\n"
+ "smlal2 v21.4s, v30.8h, v1.8h\n"
+ "smlal v14.4s, v26.4h, v1.4h\n"
+ "smlal2 v10.4s, v26.8h, v1.8h\n"
+ "smlal v9.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "tbz x4, #2, 77f\n"
+ "ld1 { v31.s }[0], [x28], #0x4\n"
+ "tbz x4, #1, 76f\n"
+ "ld1 { v31.h }[2], [x28], #0x2\n"
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[6], [x28]\n"
+ "b 79f\n"
+ "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[4], [x28]\n"
+ "b 79f\n"
+ "77:" // Oddments: Load (4, 2): Bit 2: Unset
+ "tbz x4, #1, 78f\n"
+ "ld1 { v31.h }[0], [x28], #0x2\n"
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[2], [x28]\n"
+ "b 79f\n"
+ "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 79f\n"
+ "ld1 { v31.b }[0], [x28]\n"
+ "79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d2, [x8, #0x88]\n"
+ "ushll v31.8h, v31.8b, #0x0\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x27, [x0, #0xd8]\n"
+ "smlal v7.4s, v31.4h, v1.4h\n"
+ "smlal2 v6.4s, v31.8h, v1.8h\n"
+ "add x27, x27, x1\n"
+ "smlal v11.4s, v26.4h, v2.4h\n"
+ "smlal2 v21.4s, v26.8h, v2.8h\n"
+ "smlal v14.4s, v25.4h, v2.4h\n"
+ "smlal2 v10.4s, v25.8h, v2.8h\n"
+ "smlal v9.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "tbz x4, #2, 81f\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "tbz x4, #1, 80f\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[6], [x27]\n"
+ "b 83f\n"
+ "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "b 83f\n"
+ "81:" // Oddments: Load (4, 3): Bit 2: Unset
+ "tbz x4, #1, 82f\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "b 83f\n"
+ "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 83f\n"
+ "ld1 { v30.b }[0], [x27]\n"
+ "83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d3, [x8, #0x90]\n"
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x26, [x0, #0xe0]\n"
+ "smlal v7.4s, v30.4h, v2.4h\n"
+ "smlal2 v6.4s, v30.8h, v2.8h\n"
+ "add x26, x26, x1\n"
+ "smlal v11.4s, v25.4h, v3.4h\n"
+ "smlal2 v21.4s, v25.8h, v3.8h\n"
+ "smlal v14.4s, v24.4h, v3.4h\n"
+ "smlal2 v10.4s, v24.8h, v3.8h\n"
+ "smlal v9.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "tbz x4, #2, 85f\n"
+ "ld1 { v28.s }[0], [x26], #0x4\n"
+ "tbz x4, #1, 84f\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "b 87f\n"
+ "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "b 87f\n"
+ "85:" // Oddments: Load (4, 4): Bit 2: Unset
+ "tbz x4, #1, 86f\n"
+ "ld1 { v28.h }[0], [x26], #0x2\n"
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "b 87f\n"
+ "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 87f\n"
+ "ld1 { v28.b }[0], [x26]\n"
+ "87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d4, [x8, #0x98]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x25, [x0, #0xe8]\n"
+ "smlal v7.4s, v28.4h, v3.4h\n"
+ "smlal2 v6.4s, v28.8h, v3.8h\n"
+ "add x25, x25, x1\n"
+ "smlal v11.4s, v24.4h, v4.4h\n"
+ "smlal2 v21.4s, v24.8h, v4.8h\n"
+ "smlal v14.4s, v22.4h, v4.4h\n"
+ "smlal2 v10.4s, v22.8h, v4.8h\n"
+ "smlal v9.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "tbz x4, #2, 89f\n"
+ "ld1 { v26.s }[0], [x25], #0x4\n"
+ "tbz x4, #1, 88f\n"
+ "ld1 { v26.h }[2], [x25], #0x2\n"
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[6], [x25]\n"
+ "b 91f\n"
+ "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[4], [x25]\n"
+ "b 91f\n"
+ "89:" // Oddments: Load (4, 5): Bit 2: Unset
+ "tbz x4, #1, 90f\n"
+ "ld1 { v26.h }[0], [x25], #0x2\n"
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[2], [x25]\n"
+ "b 91f\n"
+ "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 91f\n"
+ "ld1 { v26.b }[0], [x25]\n"
+ "91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d0, [x8, #0xa0]\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "usubl v0.8h, v0.8b, v15.8b\n"
+ "ldr x24, [x0, #0xf0]\n"
+ "smlal v7.4s, v26.4h, v4.4h\n"
+ "smlal2 v6.4s, v26.8h, v4.8h\n"
+ "add x24, x24, x1\n"
+ "smlal v11.4s, v27.4h, v0.4h\n"
+ "smlal2 v21.4s, v27.8h, v0.8h\n"
+ "smlal v14.4s, v23.4h, v0.4h\n"
+ "smlal2 v10.4s, v23.8h, v0.8h\n"
+ "tbz x4, #2, 93f\n"
+ "ld1 { v25.s }[0], [x24], #0x4\n"
+ "tbz x4, #1, 92f\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "b 95f\n"
+ "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "b 95f\n"
+ "93:" // Oddments: Load (5, 0): Bit 2: Unset
+ "tbz x4, #1, 94f\n"
+ "ld1 { v25.h }[0], [x24], #0x2\n"
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "b 95f\n"
+ "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 95f\n"
+ "ld1 { v25.b }[0], [x24]\n"
+ "95:" // Oddments: Load (5, 0): Bit 2: End
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "ldr x23, [x0, #0xf8]\n"
+ "smlal v9.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "add x23, x23, x1\n"
+ "tbz x4, #2, 97f\n"
+ "ld1 { v24.s }[0], [x23], #0x4\n"
+ "tbz x4, #1, 96f\n"
+ "ld1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[6], [x23]\n"
+ "b 99f\n"
+ "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[4], [x23]\n"
+ "b 99f\n"
+ "97:" // Oddments: Load (5, 1): Bit 2: Unset
+ "tbz x4, #1, 98f\n"
+ "ld1 { v24.h }[0], [x23], #0x2\n"
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[2], [x23]\n"
+ "b 99f\n"
+ "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 99f\n"
+ "ld1 { v24.b }[0], [x23]\n"
+ "99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d1, [x8, #0xa8]\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "usubl v1.8h, v1.8b, v15.8b\n"
+ "ldr x22, [x0, #0x100]\n"
+ "smlal v7.4s, v24.4h, v0.4h\n"
+ "smlal2 v6.4s, v24.8h, v0.8h\n"
+ "add x22, x22, x1\n"
+ "smlal v11.4s, v23.4h, v1.4h\n"
+ "smlal2 v21.4s, v23.8h, v1.8h\n"
+ "smlal v14.4s, v31.4h, v1.4h\n"
+ "smlal2 v10.4s, v31.8h, v1.8h\n"
+ "smlal v9.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "tbz x4, #2, 101f\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "tbz x4, #1, 100f\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[6], [x22]\n"
+ "b 103f\n"
+ "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[4], [x22]\n"
+ "b 103f\n"
+ "101:" // Oddments: Load (5, 2): Bit 2: Unset
+ "tbz x4, #1, 102f\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[2], [x22]\n"
+ "b 103f\n"
+ "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 103f\n"
+ "ld1 { v27.b }[0], [x22]\n"
+ "103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d2, [x8, #0xb0]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "usubl v2.8h, v2.8b, v15.8b\n"
+ "ldr x7, [x0, #0x108]\n"
+ "smlal v7.4s, v27.4h, v1.4h\n"
+ "smlal2 v6.4s, v27.8h, v1.8h\n"
+ "add x7, x7, x1\n"
+ "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "smlal v14.4s, v30.4h, v2.4h\n"
+ "smlal2 v10.4s, v30.8h, v2.8h\n"
+ "smlal v9.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "tbz x4, #2, 105f\n"
+ "ld1 { v25.s }[0], [x7], #0x4\n"
+ "tbz x4, #1, 104f\n"
+ "ld1 { v25.h }[2], [x7], #0x2\n"
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[6], [x7]\n"
+ "b 107f\n"
+ "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[4], [x7]\n"
+ "b 107f\n"
+ "105:" // Oddments: Load (5, 3): Bit 2: Unset
+ "tbz x4, #1, 106f\n"
+ "ld1 { v25.h }[0], [x7], #0x2\n"
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[2], [x7]\n"
+ "b 107f\n"
+ "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 107f\n"
+ "ld1 { v25.b }[0], [x7]\n"
+ "107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d3, [x8, #0xb8]\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "usubl v3.8h, v3.8b, v15.8b\n"
+ "ldr x20, [x0, #0x110]\n"
+ "smlal v7.4s, v25.4h, v2.4h\n"
+ "smlal2 v6.4s, v25.8h, v2.8h\n"
+ "add x20, x20, x1\n"
+ "smlal v11.4s, v30.4h, v3.4h\n"
+ "smlal2 v21.4s, v30.8h, v3.8h\n"
+ "smlal v14.4s, v28.4h, v3.4h\n"
+ "smlal2 v10.4s, v28.8h, v3.8h\n"
+ "smlal v9.4s, v25.4h, v3.4h\n"
+ "smlal2 v8.4s, v25.8h, v3.8h\n"
+ "tbz x4, #2, 109f\n"
+ "ld1 { v24.s }[0], [x20], #0x4\n"
+ "tbz x4, #1, 108f\n"
+ "ld1 { v24.h }[2], [x20], #0x2\n"
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[6], [x20]\n"
+ "b 111f\n"
+ "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[4], [x20]\n"
+ "b 111f\n"
+ "109:" // Oddments: Load (5, 4): Bit 2: Unset
+ "tbz x4, #1, 110f\n"
+ "ld1 { v24.h }[0], [x20], #0x2\n"
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[2], [x20]\n"
+ "b 111f\n"
+ "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 111f\n"
+ "ld1 { v24.b }[0], [x20]\n"
+ "111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d4, [x8, #0xc0]\n"
+ "ushll v24.8h, v24.8b, #0x0\n"
+ "usubl v4.8h, v4.8b, v15.8b\n"
+ "ldr x19, [x0, #0x118]\n"
+ "smlal v7.4s, v24.4h, v3.4h\n"
+ "smlal2 v6.4s, v24.8h, v3.8h\n"
+ "add x19, x19, x1\n"
+ "smlal v11.4s, v28.4h, v4.4h\n"
+ "smlal2 v21.4s, v28.8h, v4.8h\n"
+ "smlal v14.4s, v26.4h, v4.4h\n"
+ "smlal2 v10.4s, v26.8h, v4.8h\n"
+ "smlal v9.4s, v24.4h, v4.4h\n"
+ "smlal2 v8.4s, v24.8h, v4.8h\n"
+ "tbz x4, #2, 113f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x4, #1, 112f\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[6], [x19]\n"
+ "b 115f\n"
+ "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[4], [x19]\n"
+ "b 115f\n"
+ "113:" // Oddments: Load (5, 5): Bit 2: Unset
+ "tbz x4, #1, 114f\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[2], [x19]\n"
+ "b 115f\n"
+ "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 115f\n"
+ "ld1 { v27.b }[0], [x19]\n"
+ "115:" // Oddments: Load (5, 5): Bit 2: End
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "smlal v7.4s, v27.4h, v4.4h\n"
+ "smlal2 v6.4s, v27.8h, v4.8h\n"
+ "tbz x4, #2, 117f\n"
+ "ld1 { v17.4s }, [x5], #0x10\n"
+ "ld1 { v5.4s }, [x6], #0x10\n"
+ "tbz x4, #1, 116f\n"
+ "ld1 { v18.d }[0], [x5], #0x8\n"
+ "ld1 { v29.d }[0], [x6], #0x8\n"
+ "tbz x4, #0, 119f\n"
+ "ld1 { v18.s }[2], [x5]\n"
+ "ld1 { v29.s }[2], [x6]\n"
+ "b 119f\n"
+ "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 119f\n"
+ "ld1 { v18.s }[0], [x5]\n"
+ "ld1 { v29.s }[0], [x6]\n"
+ "b 119f\n"
+ "117:" // Oddments: Load requant params: Bit 2: Unset
+ "tbz x4, #1, 118f\n"
+ "ld1 { v17.d }[0], [x5], #0x8\n"
+ "ld1 { v5.d }[0], [x6], #0x8\n"
+ "tbz x4, #0, 119f\n"
+ "ld1 { v17.s }[2], [x5]\n"
+ "ld1 { v5.s }[2], [x6]\n"
+ "b 119f\n"
+ "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 119f\n"
+ "ld1 { v17.s }[0], [x5]\n"
+ "ld1 { v5.s }[0], [x6]\n"
+ "119:" // Oddments: Load requant params: Bit 2: End
+ "sqdmulh v11.4s, v11.4s, v17.4s\n"
+ "sqdmulh v14.4s, v14.4s, v17.4s\n"
+ "add x21, x21, x2\n"
+ "add x15, x15, x2\n"
+ "sqdmulh v9.4s, v9.4s, v17.4s\n"
+ "sqdmulh v7.4s, v7.4s, v17.4s\n"
+ "add x17, x17, x2\n"
+ "add x16, x16, x2\n"
+ "and v23.16b, v11.16b, v5.16b\n"
+ "sqdmulh v21.4s, v21.4s, v18.4s\n"
+ "and v22.16b, v14.16b, v5.16b\n"
+ "sqdmulh v10.4s, v10.4s, v18.4s\n"
+ "and v17.16b, v9.16b, v5.16b\n"
+ "sqdmulh v8.4s, v8.4s, v18.4s\n"
+ "and v20.16b, v7.16b, v5.16b\n"
+ "sqdmulh v6.4s, v6.4s, v18.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v19.16b, v21.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v18.16b, v10.16b, v29.16b\n"
+ "sshr v17.4s, v17.4s, #0x1f\n"
+ "and v26.16b, v8.16b, v29.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v4.16b, v6.16b, v29.16b\n"
+ "sqadd v11.4s, v11.4s, v23.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v22.4s\n"
+ "sshr v18.4s, v18.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v17.4s\n"
+ "sshr v26.4s, v26.4s, #0x1f\n"
+ "sqadd v7.4s, v7.4s, v20.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "srshl v11.4s, v11.4s, v5.4s\n"
+ "sqadd v21.4s, v21.4s, v19.4s\n"
+ "srshl v14.4s, v14.4s, v5.4s\n"
+ "sqadd v10.4s, v10.4s, v18.4s\n"
+ "srshl v9.4s, v9.4s, v5.4s\n"
+ "sqadd v8.4s, v8.4s, v26.4s\n"
+ "srshl v7.4s, v7.4s, v5.4s\n"
+ "sqadd v6.4s, v6.4s, v4.4s\n"
+ "srshl v21.4s, v21.4s, v29.4s\n"
+ "sqxtn v11.4h, v11.4s\n"
+ "srshl v10.4s, v10.4s, v29.4s\n"
+ "sqxtn v14.4h, v14.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v6.4s, v6.4s, v29.4s\n"
+ "sqxtn v7.4h, v7.4s\n"
+ "sqxtn2 v11.8h, v21.4s\n"
+ "sqxtn2 v14.8h, v10.4s\n"
+ "sqxtn2 v9.8h, v8.4s\n"
+ "sqxtn2 v7.8h, v6.4s\n"
+ "sqadd v11.8h, v11.8h, v16.8h\n"
+ "sqadd v14.8h, v14.8h, v16.8h\n"
+ "sqadd v9.8h, v9.8h, v16.8h\n"
+ "sqadd v7.8h, v7.8h, v16.8h\n"
+ "smax v11.8h, v11.8h, v12.8h\n"
+ "smax v14.8h, v14.8h, v12.8h\n"
+ "smax v9.8h, v9.8h, v12.8h\n"
+ "smax v7.8h, v7.8h, v12.8h\n"
+ "smin v11.8h, v11.8h, v13.8h\n"
+ "smin v14.8h, v14.8h, v13.8h\n"
+ "smin v9.8h, v9.8h, v13.8h\n"
+ "smin v7.8h, v7.8h, v13.8h\n"
+ "uzp1 v11.16b, v11.16b, v11.16b\n"
+ "uzp1 v14.16b, v14.16b, v14.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v7.16b, v7.16b, v7.16b\n"
+ "tbz x4, #2, 121f\n"
+ "st1 { v11.s }[0], [x21], #0x4\n"
+ "st1 { v14.s }[0], [x15], #0x4\n"
+ "st1 { v9.s }[0], [x17], #0x4\n"
+ "st1 { v7.s }[0], [x16], #0x4\n"
+ "tbz x4, #1, 120f\n"
+ "st1 { v11.h }[2], [x21], #0x2\n"
+ "st1 { v14.h }[2], [x15], #0x2\n"
+ "st1 { v9.h }[2], [x17], #0x2\n"
+ "st1 { v7.h }[2], [x16], #0x2\n"
+ "tbz x4, #0, 123f\n"
+ "st1 { v11.b }[6], [x21], #0x1\n"
+ "st1 { v14.b }[6], [x15], #0x1\n"
+ "st1 { v9.b }[6], [x17], #0x1\n"
+ "st1 { v7.b }[6], [x16], #0x1\n"
+ "b 123f\n"
+ "120:" // Oddments: Bit 2: Bit 1: Unset
+ "tbz x4, #0, 123f\n"
+ "st1 { v11.b }[4], [x21], #0x1\n"
+ "st1 { v14.b }[4], [x15], #0x1\n"
+ "st1 { v9.b }[4], [x17], #0x1\n"
+ "st1 { v7.b }[4], [x16], #0x1\n"
+ "b 123f\n"
+ "121:" // Oddments: Bit 2: Unset
+ "tbz x4, #1, 122f\n"
+ "st1 { v11.h }[0], [x21], #0x2\n"
+ "st1 { v14.h }[0], [x15], #0x2\n"
+ "st1 { v9.h }[0], [x17], #0x2\n"
+ "st1 { v7.h }[0], [x16], #0x2\n"
+ "tbz x4, #0, 123f\n"
+ "st1 { v11.b }[2], [x21], #0x1\n"
+ "st1 { v14.b }[2], [x15], #0x1\n"
+ "st1 { v9.b }[2], [x17], #0x1\n"
+ "st1 { v7.b }[2], [x16], #0x1\n"
+ "b 123f\n"
+ "122:" // Oddments: Bit 2: Unset: Bit 1: Unset
+ "tbz x4, #0, 123f\n"
+ "st1 { v11.b }[0], [x21], #0x1\n"
+ "st1 { v14.b }[0], [x15], #0x1\n"
+ "st1 { v9.b }[0], [x17], #0x1\n"
+ "st1 { v7.b }[0], [x16], #0x1\n"
+ "123:" // Oddments: Bit 2: End
+ "124:" // End
+ :
+ : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
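The "Oddments" tail above handles the last one to seven channels of a tile: bits 2, 1 and 0 of the remaining channel count (kept in x4) select a 4-byte, 2-byte and 1-byte access respectively, so every residual width from 1 to 7 is covered without a per-element loop. A minimal C++ sketch of the same remainder decomposition, using a hypothetical copy_tail helper that is not part of the library:

    // Sketch only: mirrors the tbz-driven tail handling in the assembly above.
    // n is the number of leftover channels (0..7); both pointers are assumed valid.
    #include <cstdint>
    #include <cstring>

    static void copy_tail(uint8_t *dst, const uint8_t *src, unsigned int n)
    {
        if (n & 4) { std::memcpy(dst, src, 4); dst += 4; src += 4; } // bit 2: one 32-bit access
        if (n & 2) { std::memcpy(dst, src, 2); dst += 2; src += 2; } // bit 1: one 16-bit access
        if (n & 1) { *dst = *src; }                                  // bit 0: one byte
    }

The assembly applies the same idea to vector lanes (ld1/st1 on .s, .h and .b elements), which is why each load and store block branches on tbz x4, #2 / #1 / #0.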
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 1bacb5ffe7..281511a1fb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,37 +36,24 @@ namespace depthwise {
void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
-struct a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+class a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef int8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size;
+ a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
- kern_type kernel = a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
- a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
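The header change above is the pattern this patch applies across these kernels: the old trait-style struct (typedefs, constexpr shape constants and raw function pointers) becomes a class deriving from DepthwiseDepthfirstStrategy, which receives the output, kernel and stride shape through its constructor and exposes the kernel and vector-length type through virtual accessors. A minimal sketch of that move, using simplified, hypothetical names (KernelFn, Strategy, ExampleStrategy) rather than the library's real interface:

    // Sketch under assumed names: shows the trait-struct to strategy-class refactor.
    #include <cstdio>

    using KernelFn = void (*)(unsigned int n_channels);

    static void example_kernel_impl(unsigned int n) { std::printf("%u channels\n", n); } // stand-in

    class Strategy
    {
      public:
        virtual ~Strategy() = default;
        virtual KernelFn get_kernel(void) const = 0; // replaces the old static kern_type member
    };

    class ExampleStrategy : public Strategy
    {
      public:
        ExampleStrategy() = default; // real strategies forward (out_rows, out_cols, kern_rows, ...) to the base
        KernelFn get_kernel(void) const override { return example_kernel_impl; }
    };

Centralising the shape in the base class means each kernel header now declares only what is unique to it, as the removed output_rows/output_cols, input_rows/input_cols and packing function-pointer members above show.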
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 8cbbfae00d..22f95746fc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
@@ -91,505 +91,489 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
+ "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
"ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
- "mov x17, #0x0\n"
- "ldr x16, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x24, x19, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x19, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x19, %[offsetof_Requantize32_minval]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x19, x19, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v22.16b }, [x24]\n"
+ "ld1r { v12.16b }, [x23]\n"
+ "lsr x16, x8, #0x3\n"
+ "ld1r { v14.8h }, [x21]\n"
+ "ld1r { v17.8h }, [x20]\n"
"mov x15, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "add x14, %x[params], %[offsetof_Params_inptrs]\n"
+ "mov x14, #0x0\n"
+ "ld1r { v15.8h }, [x19]\n"
"ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "lsr x12, x8, #0x3\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
"ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v21.16b }, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v17.16b }, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1r { v13.4s }, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v15.4s }, [x20]\n"
- "ld1r { v14.4s }, [x19]\n"
- "ldp x10, x9, [x21, #0x0]\n"
- "ldp x28, x27, [x21, #0x10]\n"
- "cbz x12, 3f\n"
- "subs x12, x12, #0x1\n"
+ "ldp x10, x9, [x22, #0x0]\n"
+ "ldp x28, x27, [x22, #0x10]\n"
+ "cbz x16, 3f\n"
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q11, [x19, #0x0]\n"
- "mov v23.16b, v11.16b\n"
+ "ldr q13, [x19, #0x0]\n"
+ "subs x16, x16, #0x1\n"
+ "mov v19.16b, v13.16b\n"
"ldr q26, [x19, #0x10]\n"
"add x19, x19, #0x20\n"
- "mov v12.16b, v11.16b\n"
"str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v24.16b, v11.16b\n"
- "ldr d0, [x16, #0x0]\n"
- "ldr d1, [x16, #0x8]\n"
- "mov v9.16b, v26.16b\n"
- "ldr d2, [x16, #0x10]\n"
- "mov v22.16b, v26.16b\n"
- "ldr d3, [x16, #0x18]\n"
- "mov v10.16b, v26.16b\n"
- "ldr d4, [x16, #0x20]\n"
- "ssubl v0.8h, v0.8b, v17.8b\n"
- "ldr d5, [x16, #0x28]\n"
- "ssubl v1.8h, v1.8b, v17.8b\n"
- "ldr d6, [x16, #0x30]\n"
- "ssubl v2.8h, v2.8b, v17.8b\n"
- "ldr d7, [x16, #0x38]\n"
- "ssubl v3.8h, v3.8b, v17.8b\n"
- "ldr d8, [x16, #0x40]\n"
- "ssubl v4.8h, v4.8b, v17.8b\n"
- "ldp x23, x22, [x14, #0x0]\n"
- "ssubl v5.8h, v5.8b, v17.8b\n"
- "ldp x21, x20, [x14, #0x10]\n"
- "ssubl v6.8h, v6.8b, v17.8b\n"
- "ssubl v7.8h, v7.8b, v17.8b\n"
- "ldr x19, [x14, #0x20]\n"
- "ssubl v8.8h, v8.8b, v17.8b\n"
- "ldr d31, [x23, x17]\n"
- "usubl v31.8h, v31.8b, v21.8b\n"
- "ldr d30, [x22, x17]\n"
- "ldr d29, [x21, x17]\n"
- "usubl v30.8h, v30.8b, v21.8b\n"
- "ldr d28, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v21.8b\n"
- "ldr d27, [x19, x17]\n"
- "usubl v28.8h, v28.8b, v21.8b\n"
- "usubl v27.8h, v27.8b, v21.8b\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v11.16b, v26.16b\n"
+ "mov v18.16b, v13.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v24.16b, v26.16b\n"
+ "mov v9.16b, v13.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v23.16b, v26.16b\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ssubl v4.8h, v4.8b, v12.8b\n"
+ "ldr x19, [x12, #0x20]\n"
+ "ldr d31, [x23, x15]\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ldr d30, [x22, x15]\n"
+ "ldr d29, [x21, x15]\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "ldr d28, [x20, x15]\n"
+ "ldr d27, [x19, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
"beq 2f\n"
"1:" // Loop
- "smlal v11.4s, v31.4h, v4.4h\n"
- "ldr x21, [x14, #0x28]\n"
- "add x16, x16, #0x48\n"
+ "smlal v13.4s, v31.4h, v4.4h\n"
"smlal2 v26.4s, v31.8h, v4.8h\n"
- "ldr x20, [x14, #0x30]\n"
- "subs x12, x12, #0x1\n"
- "smlal v23.4s, v31.4h, v3.4h\n"
- "ldr x26, [x14, #0x38]\n"
- "smlal2 v9.4s, v31.8h, v3.8h\n"
- "ldr x25, [x14, #0x40]\n"
- "smlal v12.4s, v31.4h, v1.4h\n"
- "ldr x19, [x14, #0x48]\n"
- "smlal2 v22.4s, v31.8h, v1.8h\n"
- "ldr x24, [x14, #0x50]\n"
- "smlal v24.4s, v31.4h, v0.4h\n"
- "ldr x23, [x14, #0x58]\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x17]\n"
- "smlal v11.4s, v30.4h, v0.4h\n"
- "ldr x22, [x14, #0x60]\n"
+ "ldr x21, [x12, #0x28]\n"
+ "ldr x26, [x12, #0x38]\n"
+ "smlal v19.4s, v31.4h, v3.4h\n"
+ "smlal2 v11.4s, v31.8h, v3.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr x25, [x12, #0x40]\n"
+ "smlal v13.4s, v30.4h, v0.4h\n"
"smlal2 v26.4s, v30.8h, v0.8h\n"
- "ldr d30, [x19, x17]\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "ldr x21, [x14, #0x68]\n"
- "smlal2 v9.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x17]\n"
- "smlal v11.4s, v28.4h, v5.4h\n"
- "ldr x20, [x14, #0x70]\n"
+ "ldr x19, [x12, #0x48]\n"
+ "ldr d30, [x19, x15]\n"
+ "smlal v19.4s, v29.4h, v2.4h\n"
+ "smlal2 v11.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v24.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x12, #0x50]\n"
+ "ldr x23, [x12, #0x58]\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "smlal2 v23.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v13.4s, v28.4h, v5.4h\n"
"smlal2 v26.4s, v28.8h, v5.8h\n"
- "ldr x19, [x14, #0x78]\n"
- "smlal v23.4s, v28.4h, v4.4h\n"
- "ldr q25, [x13, #0x0]\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "ldr q18, [x11, #0x0]\n"
- "smlal v12.4s, v28.4h, v2.4h\n"
- "ldr q16, [x13, #0x10]\n"
- "add x13, x13, #0x20\n"
- "smlal2 v22.4s, v28.8h, v2.8h\n"
- "ldr q20, [x11, #0x10]\n"
- "add x11, x11, #0x20\n"
- "smlal v24.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x17]\n"
- "usubl v31.8h, v31.8b, v21.8b\n"
- "smlal v11.4s, v27.4h, v7.4h\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "ldr x22, [x12, #0x60]\n"
+ "smlal v19.4s, v28.4h, v4.4h\n"
+ "smlal2 v11.4s, v28.8h, v4.8h\n"
+ "ldr x21, [x12, #0x68]\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v18.4s, v28.4h, v2.4h\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "ldr x19, [x12, #0x78]\n"
+ "ldr q21, [x13, #0x0]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x15]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v13.4s, v27.4h, v7.4h\n"
"smlal2 v26.4s, v27.8h, v7.8h\n"
- "smlal v12.4s, v31.4h, v6.4h\n"
- "smlal2 v22.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x17]\n"
- "smlal v23.4s, v27.4h, v6.4h\n"
- "smlal2 v9.4s, v27.8h, v6.8h\n"
- "smlal v12.4s, v27.4h, v4.4h\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "smlal v24.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "usubl v29.8h, v29.8b, v21.8b\n"
- "usubl v28.8h, v28.8b, v21.8b\n"
- "usubl v31.8h, v31.8b, v21.8b\n"
- "smlal v24.4s, v29.4h, v8.4h\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x17]\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
+ "ldr q25, [x11, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "smlal v19.4s, v27.4h, v6.4h\n"
+ "smlal2 v11.4s, v27.8h, v6.8h\n"
+ "ldr q16, [x11, #0x10]\n"
+ "add x17, x17, #0x48\n"
+ "smlal v18.4s, v31.4h, v6.4h\n"
+ "smlal2 v24.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
+ "subs x16, x16, #0x1\n"
+ "add x13, x13, #0x20\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
"smlal2 v26.4s, v28.8h, v1.8h\n"
- "smlal v23.4s, v28.4h, v0.4h\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x17]\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
+ "add x11, x11, #0x20\n"
+ "smlal v19.4s, v28.4h, v0.4h\n"
+ "smlal2 v11.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x15]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v24.4s, v27.8h, v4.8h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
"smlal2 v26.4s, v31.8h, v2.8h\n"
- "smlal v23.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x17]\n"
- "usubl v30.8h, v30.8b, v21.8b\n"
- "usubl v29.8h, v29.8b, v21.8b\n"
- "usubl v28.8h, v28.8b, v21.8b\n"
- "smlal v11.4s, v30.4h, v8.4h\n"
+ "smlal v19.4s, v31.4h, v1.4h\n"
+ "smlal2 v11.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v18.4s, v30.4h, v5.4h\n"
+ "smlal v9.4s, v30.4h, v4.4h\n"
+ "smlal v13.4s, v30.4h, v8.4h\n"
"smlal2 v26.4s, v30.8h, v8.8h\n"
- "smlal v23.4s, v30.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v7.8h\n"
- "smlal v12.4s, v30.4h, v5.4h\n"
- "smlal2 v22.4s, v30.8h, v5.8h\n"
- "smlal v24.4s, v30.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x17]\n"
- "smlal v11.4s, v29.4h, v3.4h\n"
+ "smlal v19.4s, v30.4h, v7.4h\n"
+ "smlal2 v11.4s, v30.8h, v7.8h\n"
+ "smlal2 v24.4s, v30.8h, v5.8h\n"
+ "smlal2 v23.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x15]\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "smlal v18.4s, v29.4h, v0.4h\n"
+ "smlal v9.4s, v28.4h, v2.4h\n"
+ "smlal v13.4s, v29.4h, v3.4h\n"
"smlal2 v26.4s, v29.8h, v3.8h\n"
- "smlal v12.4s, v29.4h, v0.4h\n"
- "smlal2 v22.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x17]\n"
- "smlal v23.4s, v28.4h, v5.4h\n"
- "smlal2 v9.4s, v28.8h, v5.8h\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "ldr d28, [x19, x17]\n"
- "add x17, x17, #0x8\n"
- "usubl v31.8h, v31.8b, v21.8b\n"
- "usubl v30.8h, v30.8b, v21.8b\n"
- "usubl v29.8h, v29.8b, v21.8b\n"
- "smlal v11.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "smlal v12.4s, v31.4h, v3.4h\n"
- "smlal2 v22.4s, v31.8h, v3.8h\n"
- "smlal v23.4s, v30.4h, v8.4h\n"
- "smlal2 v9.4s, v30.8h, v8.8h\n"
- "smlal v24.4s, v30.4h, v5.4h\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v12.4s, v29.4h, v7.4h\n"
- "smlal2 v22.4s, v29.8h, v7.8h\n"
- "smlal v24.4s, v29.4h, v6.4h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "usubl v28.8h, v28.8b, v21.8b\n"
- "sqrdmulh v11.4s, v11.4s, v25.4s\n"
- "sqrdmulh v26.4s, v26.4s, v16.4s\n"
- "smlal v12.4s, v28.4h, v8.4h\n"
- "smlal2 v22.4s, v28.8h, v8.8h\n"
- "smlal v24.4s, v28.4h, v7.4h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "and v19.16b, v11.16b, v18.16b\n"
- "and v5.16b, v26.16b, v20.16b\n"
- "sqrdmulh v23.4s, v23.4s, v25.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqrdmulh v9.4s, v9.4s, v16.4s\n"
- "sqadd v11.4s, v11.4s, v19.4s\n"
- "sqadd v26.4s, v26.4s, v5.4s\n"
- "and v28.16b, v23.16b, v18.16b\n"
- "and v8.16b, v9.16b, v20.16b\n"
- "srshl v11.4s, v11.4s, v18.4s\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "add v11.4s, v11.4s, v13.4s\n"
- "add v26.4s, v26.4s, v13.4s\n"
- "sqadd v23.4s, v23.4s, v28.4s\n"
- "smin v11.4s, v11.4s, v14.4s\n"
- "smin v26.4s, v26.4s, v14.4s\n"
- "sqadd v9.4s, v9.4s, v8.4s\n"
- "smax v11.4s, v11.4s, v15.4s\n"
- "smax v26.4s, v26.4s, v15.4s\n"
- "srshl v23.4s, v23.4s, v18.4s\n"
- "srshl v9.4s, v9.4s, v20.4s\n"
- "uzp1 v11.16b, v11.16b, v26.16b\n"
- "sqrdmulh v12.4s, v12.4s, v25.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "str d11, [x10, x15]\n"
- "add v23.4s, v23.4s, v13.4s\n"
- "add v9.4s, v9.4s, v13.4s\n"
- "and v1.16b, v12.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v16.4s\n"
- "smin v23.4s, v23.4s, v14.4s\n"
- "smin v9.4s, v9.4s, v14.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "smax v23.4s, v23.4s, v15.4s\n"
- "smax v9.4s, v9.4s, v15.4s\n"
- "sqadd v12.4s, v12.4s, v1.4s\n"
- "and v0.16b, v22.16b, v20.16b\n"
- "uzp1 v23.16b, v23.16b, v9.16b\n"
- "sqrdmulh v24.4s, v24.4s, v25.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d23, [x9, x15]\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v24.16b, v18.16b\n"
- "sqrdmulh v10.4s, v10.4s, v16.4s\n"
- "sqadd v22.4s, v22.4s, v0.4s\n"
- "add v12.4s, v12.4s, v13.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "and v16.16b, v10.16b, v20.16b\n"
- "smin v12.4s, v12.4s, v14.4s\n"
- "srshl v22.4s, v22.4s, v20.4s\n"
- "sqadd v24.4s, v24.4s, v26.4s\n"
- "smax v12.4s, v12.4s, v15.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "srshl v24.4s, v24.4s, v18.4s\n"
- "sqadd v10.4s, v10.4s, v16.4s\n"
- "smin v22.4s, v22.4s, v14.4s\n"
- "add v24.4s, v24.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v15.4s\n"
- "srshl v10.4s, v10.4s, v20.4s\n"
- "smin v24.4s, v24.4s, v14.4s\n"
- "uzp1 v12.16b, v12.16b, v22.16b\n"
- "add v10.4s, v10.4s, v13.4s\n"
- "uzp1 v12.16b, v12.16b, v12.16b\n"
- "str d12, [x28, x15]\n"
- "smax v24.4s, v24.4s, v15.4s\n"
- "smin v10.4s, v10.4s, v14.4s\n"
- "smax v10.4s, v10.4s, v15.4s\n"
- "uzp1 v24.16b, v24.16b, v10.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str d24, [x27, x15]\n"
+ "smlal2 v24.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "smlal2 v23.4s, v28.8h, v2.8h\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v3.4h\n"
+ "smlal v9.4s, v30.4h, v5.4h\n"
+ "smlal v19.4s, v28.4h, v5.4h\n"
+ "smlal2 v11.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x19, x15]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v24.4s, v31.8h, v3.8h\n"
+ "smlal2 v23.4s, v30.8h, v5.8h\n"
"add x15, x15, #0x8\n"
+ "smlal v18.4s, v29.4h, v7.4h\n"
+ "smlal v9.4s, v29.4h, v6.4h\n"
+ "smlal2 v24.4s, v29.8h, v7.8h\n"
+ "smlal2 v23.4s, v29.8h, v6.8h\n"
+ "smlal v13.4s, v31.4h, v6.4h\n"
+ "smlal v19.4s, v30.4h, v8.4h\n"
+ "sqdmulh v13.4s, v13.4s, v21.4s\n"
+ "smlal v18.4s, v28.4h, v8.4h\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "sqdmulh v19.4s, v19.4s, v21.4s\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "smlal2 v11.4s, v30.8h, v8.8h\n"
+ "sqdmulh v18.4s, v18.4s, v21.4s\n"
+ "smlal2 v24.4s, v28.8h, v8.8h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
+ "sqdmulh v9.4s, v9.4s, v21.4s\n"
+ "and v7.16b, v13.16b, v25.16b\n"
+ "sqdmulh v26.4s, v26.4s, v10.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqdmulh v11.4s, v11.4s, v10.4s\n"
+ "and v21.16b, v18.16b, v25.16b\n"
+ "sqdmulh v24.4s, v24.4s, v10.4s\n"
+ "and v20.16b, v9.16b, v25.16b\n"
+ "sqdmulh v23.4s, v23.4s, v10.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v29.16b, v26.16b, v16.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v10.16b, v11.16b, v16.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v31.16b, v24.16b, v16.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v23.16b, v16.16b\n"
+ "sqadd v13.4s, v13.4s, v7.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v25.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v25.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "srshl v9.4s, v9.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "srshl v26.4s, v26.4s, v16.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "srshl v24.4s, v24.4s, v16.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "srshl v23.4s, v23.4s, v16.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "sqxtn2 v13.8h, v26.4s\n"
+ "sqxtn2 v19.8h, v11.4s\n"
+ "sqxtn2 v18.8h, v24.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v19.8h, v19.8h, v14.8h\n"
+ "sqadd v18.8h, v18.8h, v14.8h\n"
+ "sqadd v9.8h, v9.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smax v18.8h, v18.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smin v13.8h, v13.8h, v15.8h\n"
+ "smin v19.8h, v19.8h, v15.8h\n"
+ "smin v18.8h, v18.8h, v15.8h\n"
+ "smin v9.8h, v9.8h, v15.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d13, [x10, x14]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d19, [x9, x14]\n"
+ "str d18, [x28, x14]\n"
+ "str d9, [x27, x14]\n"
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q11, [x19, #0x0]\n"
- "mov v23.16b, v11.16b\n"
+ "ldr q13, [x19, #0x0]\n"
+ "add x14, x14, #0x8\n"
"ldr q26, [x19, #0x10]\n"
"add x19, x19, #0x20\n"
- "mov v12.16b, v11.16b\n"
"str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v24.16b, v11.16b\n"
- "ldr d0, [x16, #0x0]\n"
- "ldr d1, [x16, #0x8]\n"
- "mov v9.16b, v26.16b\n"
- "ldr d2, [x16, #0x10]\n"
- "mov v22.16b, v26.16b\n"
- "ldr d3, [x16, #0x18]\n"
- "mov v10.16b, v26.16b\n"
- "ldr d4, [x16, #0x20]\n"
- "ssubl v0.8h, v0.8b, v17.8b\n"
- "ldr d5, [x16, #0x28]\n"
- "ssubl v1.8h, v1.8b, v17.8b\n"
- "ldr d6, [x16, #0x30]\n"
- "ssubl v2.8h, v2.8b, v17.8b\n"
- "ldr d7, [x16, #0x38]\n"
- "ssubl v3.8h, v3.8b, v17.8b\n"
- "ldr d8, [x16, #0x40]\n"
- "ssubl v4.8h, v4.8b, v17.8b\n"
- "ldp x23, x22, [x14, #0x0]\n"
- "ssubl v5.8h, v5.8b, v17.8b\n"
- "ldp x21, x20, [x14, #0x10]\n"
- "ssubl v6.8h, v6.8b, v17.8b\n"
- "ssubl v7.8h, v7.8b, v17.8b\n"
- "ldr x19, [x14, #0x20]\n"
- "ssubl v8.8h, v8.8b, v17.8b\n"
- "ldr d31, [x23, x17]\n"
- "usubl v31.8h, v31.8b, v21.8b\n"
- "ldr d30, [x22, x17]\n"
- "ldr d29, [x21, x17]\n"
- "usubl v30.8h, v30.8b, v21.8b\n"
- "ldr d28, [x20, x17]\n"
- "usubl v29.8h, v29.8b, v21.8b\n"
- "ldr d27, [x19, x17]\n"
- "usubl v28.8h, v28.8b, v21.8b\n"
- "usubl v27.8h, v27.8b, v21.8b\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v11.16b, v26.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v18.16b, v13.16b\n"
+ "mov v24.16b, v26.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v9.16b, v13.16b\n"
+ "mov v23.16b, v26.16b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldr x19, [x12, #0x20]\n"
+ "ldr d31, [x23, x15]\n"
+ "ssubl v4.8h, v4.8b, v12.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "ldr d30, [x22, x15]\n"
+ "ldr d29, [x21, x15]\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ldr d28, [x20, x15]\n"
+ "ldr d27, [x19, x15]\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
"bgt 1b\n"
"2:" // Tail
- "smlal v11.4s, v31.4h, v4.4h\n"
- "ldr x21, [x14, #0x28]\n"
- "tst x8, #0x7\n"
+ "smlal v13.4s, v31.4h, v4.4h\n"
"smlal2 v26.4s, v31.8h, v4.8h\n"
- "ldr x20, [x14, #0x30]\n"
- "smlal v23.4s, v31.4h, v3.4h\n"
- "ldr x26, [x14, #0x38]\n"
- "smlal2 v9.4s, v31.8h, v3.8h\n"
- "ldr x25, [x14, #0x40]\n"
- "smlal v12.4s, v31.4h, v1.4h\n"
- "ldr x19, [x14, #0x48]\n"
- "smlal2 v22.4s, v31.8h, v1.8h\n"
- "ldr x24, [x14, #0x50]\n"
- "smlal v24.4s, v31.4h, v0.4h\n"
- "ldr x23, [x14, #0x58]\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "ldr d31, [x21, x17]\n"
- "smlal v11.4s, v30.4h, v0.4h\n"
- "ldr x22, [x14, #0x60]\n"
+ "ldr x21, [x12, #0x28]\n"
+ "ldr x26, [x12, #0x38]\n"
+ "smlal v19.4s, v31.4h, v3.4h\n"
+ "smlal2 v11.4s, v31.8h, v3.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "ldr x25, [x12, #0x40]\n"
+ "smlal v13.4s, v30.4h, v0.4h\n"
"smlal2 v26.4s, v30.8h, v0.8h\n"
- "ldr d30, [x19, x17]\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "ldr x21, [x14, #0x68]\n"
- "smlal2 v9.4s, v29.8h, v2.8h\n"
- "ldr d29, [x20, x17]\n"
- "smlal v11.4s, v28.4h, v5.4h\n"
- "ldr x20, [x14, #0x70]\n"
+ "ldr x19, [x12, #0x48]\n"
+ "ldr d30, [x19, x15]\n"
+ "smlal v19.4s, v29.4h, v2.4h\n"
+ "smlal2 v11.4s, v29.8h, v2.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v24.4s, v31.8h, v1.8h\n"
+ "ldr x24, [x12, #0x50]\n"
+ "ldr x23, [x12, #0x58]\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "smlal2 v23.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x21, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v13.4s, v28.4h, v5.4h\n"
"smlal2 v26.4s, v28.8h, v5.8h\n"
- "ldr x19, [x14, #0x78]\n"
- "smlal v23.4s, v28.4h, v4.4h\n"
- "ldr q25, [x13, #0x0]\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "ldr q18, [x11, #0x0]\n"
- "smlal v12.4s, v28.4h, v2.4h\n"
- "ldr q16, [x13, #0x10]\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "ldr x22, [x12, #0x60]\n"
+ "smlal v19.4s, v28.4h, v4.4h\n"
+ "smlal2 v11.4s, v28.8h, v4.8h\n"
+ "ldr x21, [x12, #0x68]\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v18.4s, v28.4h, v2.4h\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "ldr x19, [x12, #0x78]\n"
+ "ldr q21, [x13, #0x0]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x26, x15]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v13.4s, v27.4h, v7.4h\n"
+ "smlal2 v26.4s, v27.8h, v7.8h\n"
+ "ldr q25, [x11, #0x0]\n"
+ "ldr q10, [x13, #0x10]\n"
+ "smlal v19.4s, v27.4h, v6.4h\n"
+ "smlal2 v11.4s, v27.8h, v6.8h\n"
+ "ldr q16, [x11, #0x10]\n"
+ "tst x8, #0x7\n"
+ "smlal v18.4s, v31.4h, v6.4h\n"
+ "smlal2 v24.4s, v31.8h, v6.8h\n"
+ "ldr d31, [x25, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
"add x13, x13, #0x20\n"
- "smlal2 v22.4s, v28.8h, v2.8h\n"
- "ldr q20, [x11, #0x10]\n"
"add x11, x11, #0x20\n"
- "smlal v24.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
- "ldr d28, [x26, x17]\n"
- "usubl v31.8h, v31.8b, v21.8b\n"
- "smlal v11.4s, v27.4h, v7.4h\n"
- "smlal2 v26.4s, v27.8h, v7.8h\n"
- "smlal v12.4s, v31.4h, v6.4h\n"
- "smlal2 v22.4s, v31.8h, v6.8h\n"
- "ldr d31, [x25, x17]\n"
- "smlal v23.4s, v27.4h, v6.4h\n"
- "smlal2 v9.4s, v27.8h, v6.8h\n"
- "smlal v12.4s, v27.4h, v4.4h\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "smlal v24.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
- "usubl v29.8h, v29.8b, v21.8b\n"
- "usubl v28.8h, v28.8b, v21.8b\n"
- "usubl v31.8h, v31.8b, v21.8b\n"
- "smlal v24.4s, v29.4h, v8.4h\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
- "ldr d29, [x24, x17]\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
"smlal2 v26.4s, v28.8h, v1.8h\n"
- "smlal v23.4s, v28.4h, v0.4h\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
- "ldr d28, [x23, x17]\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
+ "smlal v19.4s, v28.4h, v0.4h\n"
+ "smlal2 v11.4s, v28.8h, v0.8h\n"
+ "ldr d28, [x23, x15]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v24.4s, v27.8h, v4.8h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
"smlal2 v26.4s, v31.8h, v2.8h\n"
- "smlal v23.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
- "ldr d31, [x22, x17]\n"
- "usubl v30.8h, v30.8b, v21.8b\n"
- "usubl v29.8h, v29.8b, v21.8b\n"
- "usubl v28.8h, v28.8b, v21.8b\n"
- "smlal v11.4s, v30.4h, v8.4h\n"
+ "smlal v19.4s, v31.4h, v1.4h\n"
+ "smlal2 v11.4s, v31.8h, v1.8h\n"
+ "ldr d31, [x22, x15]\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v18.4s, v30.4h, v5.4h\n"
+ "smlal v9.4s, v30.4h, v4.4h\n"
+ "smlal v13.4s, v30.4h, v8.4h\n"
"smlal2 v26.4s, v30.8h, v8.8h\n"
- "smlal v23.4s, v30.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v7.8h\n"
- "smlal v12.4s, v30.4h, v5.4h\n"
- "smlal2 v22.4s, v30.8h, v5.8h\n"
- "smlal v24.4s, v30.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x17]\n"
- "smlal v11.4s, v29.4h, v3.4h\n"
+ "smlal v19.4s, v30.4h, v7.4h\n"
+ "smlal2 v11.4s, v30.8h, v7.8h\n"
+ "smlal2 v24.4s, v30.8h, v5.8h\n"
+ "smlal2 v23.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x21, x15]\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "smlal v18.4s, v29.4h, v0.4h\n"
+ "smlal v9.4s, v28.4h, v2.4h\n"
+ "smlal v13.4s, v29.4h, v3.4h\n"
"smlal2 v26.4s, v29.8h, v3.8h\n"
- "smlal v12.4s, v29.4h, v0.4h\n"
- "smlal2 v22.4s, v29.8h, v0.8h\n"
- "ldr d29, [x20, x17]\n"
- "smlal v23.4s, v28.4h, v5.4h\n"
- "smlal2 v9.4s, v28.8h, v5.8h\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
- "ldr d28, [x19, x17]\n"
- "add x17, x17, #0x8\n"
- "usubl v31.8h, v31.8b, v21.8b\n"
- "usubl v30.8h, v30.8b, v21.8b\n"
- "usubl v29.8h, v29.8b, v21.8b\n"
- "smlal v11.4s, v31.4h, v6.4h\n"
- "smlal2 v26.4s, v31.8h, v6.8h\n"
- "smlal v12.4s, v31.4h, v3.4h\n"
- "smlal2 v22.4s, v31.8h, v3.8h\n"
- "smlal v23.4s, v30.4h, v8.4h\n"
- "smlal2 v9.4s, v30.8h, v8.8h\n"
- "smlal v24.4s, v30.4h, v5.4h\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
- "smlal v12.4s, v29.4h, v7.4h\n"
- "smlal2 v22.4s, v29.8h, v7.8h\n"
- "smlal v24.4s, v29.4h, v6.4h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
- "usubl v28.8h, v28.8b, v21.8b\n"
- "sqrdmulh v11.4s, v11.4s, v25.4s\n"
- "sqrdmulh v26.4s, v26.4s, v16.4s\n"
- "smlal v12.4s, v28.4h, v8.4h\n"
- "smlal2 v22.4s, v28.8h, v8.8h\n"
- "smlal v24.4s, v28.4h, v7.4h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
- "and v19.16b, v11.16b, v18.16b\n"
- "and v5.16b, v26.16b, v20.16b\n"
- "sqrdmulh v23.4s, v23.4s, v25.4s\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sqrdmulh v9.4s, v9.4s, v16.4s\n"
- "sqadd v11.4s, v11.4s, v19.4s\n"
- "sqadd v26.4s, v26.4s, v5.4s\n"
- "and v28.16b, v23.16b, v18.16b\n"
- "and v8.16b, v9.16b, v20.16b\n"
- "srshl v11.4s, v11.4s, v18.4s\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "add v11.4s, v11.4s, v13.4s\n"
- "add v26.4s, v26.4s, v13.4s\n"
- "sqadd v23.4s, v23.4s, v28.4s\n"
- "smin v11.4s, v11.4s, v14.4s\n"
- "smin v26.4s, v26.4s, v14.4s\n"
- "sqadd v9.4s, v9.4s, v8.4s\n"
- "smax v11.4s, v11.4s, v15.4s\n"
- "smax v26.4s, v26.4s, v15.4s\n"
- "srshl v23.4s, v23.4s, v18.4s\n"
- "srshl v9.4s, v9.4s, v20.4s\n"
- "uzp1 v11.16b, v11.16b, v26.16b\n"
- "sqrdmulh v12.4s, v12.4s, v25.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "str d11, [x10, x15]\n"
- "add v23.4s, v23.4s, v13.4s\n"
- "add v9.4s, v9.4s, v13.4s\n"
- "and v1.16b, v12.16b, v18.16b\n"
- "sqrdmulh v22.4s, v22.4s, v16.4s\n"
- "smin v23.4s, v23.4s, v14.4s\n"
- "smin v9.4s, v9.4s, v14.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "smax v23.4s, v23.4s, v15.4s\n"
- "smax v9.4s, v9.4s, v15.4s\n"
- "sqadd v12.4s, v12.4s, v1.4s\n"
- "and v0.16b, v22.16b, v20.16b\n"
- "uzp1 v23.16b, v23.16b, v9.16b\n"
- "sqrdmulh v24.4s, v24.4s, v25.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "str d23, [x9, x15]\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "and v26.16b, v24.16b, v18.16b\n"
- "sqrdmulh v10.4s, v10.4s, v16.4s\n"
- "sqadd v22.4s, v22.4s, v0.4s\n"
- "add v12.4s, v12.4s, v13.4s\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "and v16.16b, v10.16b, v20.16b\n"
- "smin v12.4s, v12.4s, v14.4s\n"
- "srshl v22.4s, v22.4s, v20.4s\n"
- "sqadd v24.4s, v24.4s, v26.4s\n"
- "smax v12.4s, v12.4s, v15.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "srshl v24.4s, v24.4s, v18.4s\n"
- "sqadd v10.4s, v10.4s, v16.4s\n"
- "smin v22.4s, v22.4s, v14.4s\n"
- "add v24.4s, v24.4s, v13.4s\n"
- "smax v22.4s, v22.4s, v15.4s\n"
- "srshl v10.4s, v10.4s, v20.4s\n"
- "smin v24.4s, v24.4s, v14.4s\n"
- "uzp1 v12.16b, v12.16b, v22.16b\n"
- "add v10.4s, v10.4s, v13.4s\n"
- "uzp1 v12.16b, v12.16b, v12.16b\n"
- "str d12, [x28, x15]\n"
- "smax v24.4s, v24.4s, v15.4s\n"
- "smin v10.4s, v10.4s, v14.4s\n"
- "smax v10.4s, v10.4s, v15.4s\n"
- "uzp1 v24.16b, v24.16b, v10.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
- "str d24, [x27, x15]\n"
+ "smlal2 v24.4s, v29.8h, v0.8h\n"
+ "ldr d29, [x20, x15]\n"
+ "smlal2 v23.4s, v28.8h, v2.8h\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v3.4h\n"
+ "smlal v9.4s, v30.4h, v5.4h\n"
+ "smlal v19.4s, v28.4h, v5.4h\n"
+ "smlal2 v11.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x19, x15]\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal2 v24.4s, v31.8h, v3.8h\n"
+ "smlal2 v23.4s, v30.8h, v5.8h\n"
"add x15, x15, #0x8\n"
+ "smlal v18.4s, v29.4h, v7.4h\n"
+ "smlal v9.4s, v29.4h, v6.4h\n"
+ "smlal2 v24.4s, v29.8h, v7.8h\n"
+ "smlal2 v23.4s, v29.8h, v6.8h\n"
+ "smlal v13.4s, v31.4h, v6.4h\n"
+ "smlal v19.4s, v30.4h, v8.4h\n"
+ "sqdmulh v13.4s, v13.4s, v21.4s\n"
+ "smlal v18.4s, v28.4h, v8.4h\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "sqdmulh v19.4s, v19.4s, v21.4s\n"
+ "smlal2 v26.4s, v31.8h, v6.8h\n"
+ "smlal2 v11.4s, v30.8h, v8.8h\n"
+ "sqdmulh v18.4s, v18.4s, v21.4s\n"
+ "smlal2 v24.4s, v28.8h, v8.8h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
+ "sqdmulh v9.4s, v9.4s, v21.4s\n"
+ "and v7.16b, v13.16b, v25.16b\n"
+ "sqdmulh v26.4s, v26.4s, v10.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqdmulh v11.4s, v11.4s, v10.4s\n"
+ "and v21.16b, v18.16b, v25.16b\n"
+ "sqdmulh v24.4s, v24.4s, v10.4s\n"
+ "and v20.16b, v9.16b, v25.16b\n"
+ "sqdmulh v23.4s, v23.4s, v10.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v29.16b, v26.16b, v16.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v10.16b, v11.16b, v16.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v31.16b, v24.16b, v16.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v23.16b, v16.16b\n"
+ "sqadd v13.4s, v13.4s, v7.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v25.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v25.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "srshl v9.4s, v9.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "srshl v26.4s, v26.4s, v16.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "srshl v24.4s, v24.4s, v16.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "srshl v23.4s, v23.4s, v16.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "sqxtn2 v13.8h, v26.4s\n"
+ "sqxtn2 v19.8h, v11.4s\n"
+ "sqxtn2 v18.8h, v24.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v19.8h, v19.8h, v14.8h\n"
+ "sqadd v18.8h, v18.8h, v14.8h\n"
+ "sqadd v9.8h, v9.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smax v18.8h, v18.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smin v13.8h, v13.8h, v15.8h\n"
+ "smin v19.8h, v19.8h, v15.8h\n"
+ "smin v18.8h, v18.8h, v15.8h\n"
+ "smin v9.8h, v9.8h, v15.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "str d13, [x10, x14]\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "str d19, [x9, x14]\n"
+ "str d18, [x28, x14]\n"
+ "str d9, [x27, x14]\n"
+ "add x14, x14, #0x8\n"
"beq 64f\n"
- "add x16, x16, #0x48\n"
+ "add x17, x17, #0x48\n"
"3:" // Oddments
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
"tbz x8, #2, 5f\n"
- "ld1 { v11.4s }, [x19], #0x10\n"
+ "ld1 { v13.4s }, [x19], #0x10\n"
"tbz x8, #1, 4f\n"
"ld1 { v26.d }[0], [x19], #0x8\n"
"tbz x8, #0, 7f\n"
@@ -601,46 +585,46 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
"tbz x8, #1, 6f\n"
- "ld1 { v11.d }[0], [x19], #0x8\n"
+ "ld1 { v13.d }[0], [x19], #0x8\n"
"tbz x8, #0, 7f\n"
- "ld1 { v11.s }[2], [x19]\n"
+ "ld1 { v13.s }[2], [x19]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
"tbz x8, #0, 7f\n"
- "ld1 { v11.s }[0], [x19]\n"
+ "ld1 { v13.s }[0], [x19]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "mov v23.16b, v11.16b\n"
- "ldr d0, [x16, #0x0]\n"
- "mov v9.16b, v26.16b\n"
- "ldr d1, [x16, #0x8]\n"
- "mov v12.16b, v11.16b\n"
- "ldr d2, [x16, #0x10]\n"
- "mov v22.16b, v26.16b\n"
- "ldr d3, [x16, #0x18]\n"
- "mov v24.16b, v11.16b\n"
- "ldr d4, [x16, #0x20]\n"
- "mov v10.16b, v26.16b\n"
- "ldr d5, [x16, #0x28]\n"
- "ssubl v0.8h, v0.8b, v17.8b\n"
- "ldr d6, [x16, #0x30]\n"
- "ssubl v1.8h, v1.8b, v17.8b\n"
- "ldr d7, [x16, #0x38]\n"
- "ssubl v2.8h, v2.8b, v17.8b\n"
- "ldr d8, [x16, #0x40]\n"
- "ssubl v3.8h, v3.8b, v17.8b\n"
- "ldp x23, x22, [x14, #0x0]\n"
- "add x23, x23, x17\n"
- "ssubl v4.8h, v4.8b, v17.8b\n"
- "ldp x21, x20, [x14, #0x10]\n"
- "ssubl v5.8h, v5.8b, v17.8b\n"
- "ldr x19, [x14, #0x20]\n"
- "ssubl v6.8h, v6.8b, v17.8b\n"
- "add x22, x22, x17\n"
- "ssubl v7.8h, v7.8b, v17.8b\n"
- "add x21, x21, x17\n"
- "ssubl v8.8h, v8.8b, v17.8b\n"
- "add x20, x20, x17\n"
- "add x19, x19, x17\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "mov v19.16b, v13.16b\n"
+ "mov v11.16b, v26.16b\n"
+ "ldr d2, [x17, #0x10]\n"
+ "ldr d3, [x17, #0x18]\n"
+ "mov v18.16b, v13.16b\n"
+ "mov v24.16b, v26.16b\n"
+ "ldr d4, [x17, #0x20]\n"
+ "ldr d5, [x17, #0x28]\n"
+ "mov v9.16b, v13.16b\n"
+ "mov v23.16b, v26.16b\n"
+ "ldr d6, [x17, #0x30]\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ssubl v0.8h, v0.8b, v12.8b\n"
+ "ssubl v1.8h, v1.8b, v12.8b\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ldp x23, x22, [x12, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v12.8b\n"
+ "ssubl v3.8h, v3.8b, v12.8b\n"
+ "ldp x21, x20, [x12, #0x10]\n"
+ "ldr x19, [x12, #0x20]\n"
+ "ssubl v4.8h, v4.8b, v12.8b\n"
+ "ssubl v5.8h, v5.8b, v12.8b\n"
+ "ssubl v6.8h, v6.8b, v12.8b\n"
+ "ssubl v7.8h, v7.8b, v12.8b\n"
+ "ssubl v8.8h, v8.8b, v12.8b\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "add x19, x19, x15\n"
"tbz x8, #2, 9f\n"
"ld1 { v31.s }[0], [x23], #0x4\n"
"ld1 { v30.s }[0], [x22], #0x4\n"
@@ -690,33 +674,33 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"ld1 { v28.b }[0], [x20]\n"
"ld1 { v27.b }[0], [x19]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v21.8b\n"
- "ldr x21, [x14, #0x28]\n"
- "add x21, x21, x17\n"
- "usubl v30.8h, v30.8b, v21.8b\n"
- "usubl v29.8h, v29.8b, v21.8b\n"
- "usubl v28.8h, v28.8b, v21.8b\n"
- "usubl v27.8h, v27.8b, v21.8b\n"
- "smlal v11.4s, v31.4h, v4.4h\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v13.4s, v31.4h, v4.4h\n"
"smlal2 v26.4s, v31.8h, v4.8h\n"
- "smlal v23.4s, v31.4h, v3.4h\n"
- "smlal2 v9.4s, v31.8h, v3.8h\n"
- "smlal v12.4s, v31.4h, v1.4h\n"
- "smlal2 v22.4s, v31.8h, v1.8h\n"
- "smlal v24.4s, v31.4h, v0.4h\n"
- "smlal2 v10.4s, v31.8h, v0.8h\n"
- "smlal v11.4s, v30.4h, v0.4h\n"
+ "ldr x21, [x12, #0x28]\n"
+ "smlal v19.4s, v31.4h, v3.4h\n"
+ "smlal2 v11.4s, v31.8h, v3.8h\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "add x21, x21, x15\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v1.4h\n"
+ "smlal2 v24.4s, v31.8h, v1.8h\n"
+ "smlal v9.4s, v31.4h, v0.4h\n"
+ "smlal2 v23.4s, v31.8h, v0.8h\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v13.4s, v30.4h, v0.4h\n"
"smlal2 v26.4s, v30.8h, v0.8h\n"
- "smlal v23.4s, v29.4h, v2.4h\n"
- "smlal2 v9.4s, v29.8h, v2.8h\n"
- "smlal v11.4s, v28.4h, v5.4h\n"
+ "usubl v27.8h, v27.8b, v22.8b\n"
+ "smlal v19.4s, v29.4h, v2.4h\n"
+ "smlal2 v11.4s, v29.8h, v2.8h\n"
+ "smlal v13.4s, v28.4h, v5.4h\n"
"smlal2 v26.4s, v28.8h, v5.8h\n"
- "smlal v23.4s, v28.4h, v4.4h\n"
- "smlal2 v9.4s, v28.8h, v4.8h\n"
- "smlal v12.4s, v28.4h, v2.4h\n"
- "smlal2 v22.4s, v28.8h, v2.8h\n"
- "smlal v24.4s, v28.4h, v1.4h\n"
- "smlal2 v10.4s, v28.8h, v1.8h\n"
+ "smlal v19.4s, v28.4h, v4.4h\n"
+ "smlal2 v11.4s, v28.8h, v4.8h\n"
+ "smlal v18.4s, v28.4h, v2.4h\n"
+ "smlal2 v24.4s, v28.8h, v2.8h\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v23.4s, v28.8h, v1.8h\n"
"tbz x8, #2, 13f\n"
"ld1 { v31.s }[0], [x21], #0x4\n"
"tbz x8, #1, 12f\n"
@@ -738,19 +722,19 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 15f\n"
"ld1 { v31.b }[0], [x21]\n"
"15:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v21.8b\n"
- "ldr x20, [x14, #0x30]\n"
- "smlal v11.4s, v27.4h, v7.4h\n"
- "add x20, x20, x17\n"
- "smlal v12.4s, v31.4h, v6.4h\n"
- "smlal2 v22.4s, v31.8h, v6.8h\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "smlal v18.4s, v31.4h, v6.4h\n"
+ "smlal2 v24.4s, v31.8h, v6.8h\n"
+ "ldr x20, [x12, #0x30]\n"
+ "smlal v13.4s, v27.4h, v7.4h\n"
"smlal2 v26.4s, v27.8h, v7.8h\n"
- "smlal v23.4s, v27.4h, v6.4h\n"
- "smlal2 v9.4s, v27.8h, v6.8h\n"
- "smlal v12.4s, v27.4h, v4.4h\n"
- "smlal2 v22.4s, v27.8h, v4.8h\n"
- "smlal v24.4s, v27.4h, v3.4h\n"
- "smlal2 v10.4s, v27.8h, v3.8h\n"
+ "add x20, x20, x15\n"
+ "smlal v19.4s, v27.4h, v6.4h\n"
+ "smlal2 v11.4s, v27.8h, v6.8h\n"
+ "smlal v18.4s, v27.4h, v4.4h\n"
+ "smlal2 v24.4s, v27.8h, v4.8h\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v23.4s, v27.8h, v3.8h\n"
"tbz x8, #2, 17f\n"
"ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x8, #1, 16f\n"
@@ -772,11 +756,11 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 19f\n"
"ld1 { v29.b }[0], [x20]\n"
"19:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v29.8h, v29.8b, v21.8b\n"
- "ldr x26, [x14, #0x38]\n"
- "smlal v24.4s, v29.4h, v8.4h\n"
- "add x26, x26, x17\n"
- "smlal2 v10.4s, v29.8h, v8.8h\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr x26, [x12, #0x38]\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v23.4s, v29.8h, v8.8h\n"
+ "add x26, x26, x15\n"
"tbz x8, #2, 21f\n"
"ld1 { v28.s }[0], [x26], #0x4\n"
"tbz x8, #1, 20f\n"
@@ -798,13 +782,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 23f\n"
"ld1 { v28.b }[0], [x26]\n"
"23:" // Oddments: Load (0, 1): Bit 2: End
- "usubl v28.8h, v28.8b, v21.8b\n"
- "ldr x25, [x14, #0x40]\n"
- "smlal v11.4s, v28.4h, v1.4h\n"
- "add x25, x25, x17\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "ldr x25, [x12, #0x40]\n"
+ "smlal v13.4s, v28.4h, v1.4h\n"
"smlal2 v26.4s, v28.8h, v1.8h\n"
- "smlal v23.4s, v28.4h, v0.4h\n"
- "smlal2 v9.4s, v28.8h, v0.8h\n"
+ "smlal v19.4s, v28.4h, v0.4h\n"
+ "smlal2 v11.4s, v28.8h, v0.8h\n"
+ "add x25, x25, x15\n"
"tbz x8, #2, 25f\n"
"ld1 { v31.s }[0], [x25], #0x4\n"
"tbz x8, #1, 24f\n"
@@ -826,13 +810,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 27f\n"
"ld1 { v31.b }[0], [x25]\n"
"27:" // Oddments: Load (0, 2): Bit 2: End
- "usubl v31.8h, v31.8b, v21.8b\n"
- "ldr x19, [x14, #0x48]\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "add x19, x19, x17\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "ldr x19, [x12, #0x48]\n"
+ "smlal v13.4s, v31.4h, v2.4h\n"
"smlal2 v26.4s, v31.8h, v2.8h\n"
- "smlal v23.4s, v31.4h, v1.4h\n"
- "smlal2 v9.4s, v31.8h, v1.8h\n"
+ "smlal v19.4s, v31.4h, v1.4h\n"
+ "smlal2 v11.4s, v31.8h, v1.8h\n"
+ "add x19, x19, x15\n"
"tbz x8, #2, 29f\n"
"ld1 { v30.s }[0], [x19], #0x4\n"
"tbz x8, #1, 28f\n"
@@ -854,17 +838,17 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 31f\n"
"ld1 { v30.b }[0], [x19]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
- "usubl v30.8h, v30.8b, v21.8b\n"
- "ldr x24, [x14, #0x50]\n"
- "smlal v11.4s, v30.4h, v8.4h\n"
- "add x24, x24, x17\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "ldr x24, [x12, #0x50]\n"
+ "smlal v13.4s, v30.4h, v8.4h\n"
"smlal2 v26.4s, v30.8h, v8.8h\n"
- "smlal v23.4s, v30.4h, v7.4h\n"
- "smlal2 v9.4s, v30.8h, v7.8h\n"
- "smlal v12.4s, v30.4h, v5.4h\n"
- "smlal2 v22.4s, v30.8h, v5.8h\n"
- "smlal v24.4s, v30.4h, v4.4h\n"
- "smlal2 v10.4s, v30.8h, v4.8h\n"
+ "smlal v19.4s, v30.4h, v7.4h\n"
+ "smlal2 v11.4s, v30.8h, v7.8h\n"
+ "add x24, x24, x15\n"
+ "smlal v18.4s, v30.4h, v5.4h\n"
+ "smlal2 v24.4s, v30.8h, v5.8h\n"
+ "smlal v9.4s, v30.4h, v4.4h\n"
+ "smlal2 v23.4s, v30.8h, v4.8h\n"
"tbz x8, #2, 33f\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
"tbz x8, #1, 32f\n"
@@ -886,13 +870,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 35f\n"
"ld1 { v29.b }[0], [x24]\n"
"35:" // Oddments: Load (1, 0): Bit 2: End
- "usubl v29.8h, v29.8b, v21.8b\n"
- "ldr x23, [x14, #0x58]\n"
- "smlal v11.4s, v29.4h, v3.4h\n"
- "add x23, x23, x17\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr x23, [x12, #0x58]\n"
+ "smlal v13.4s, v29.4h, v3.4h\n"
"smlal2 v26.4s, v29.8h, v3.8h\n"
- "smlal v12.4s, v29.4h, v0.4h\n"
- "smlal2 v22.4s, v29.8h, v0.8h\n"
+ "smlal v18.4s, v29.4h, v0.4h\n"
+ "smlal2 v24.4s, v29.8h, v0.8h\n"
+ "add x23, x23, x15\n"
"tbz x8, #2, 37f\n"
"ld1 { v28.s }[0], [x23], #0x4\n"
"tbz x8, #1, 36f\n"
@@ -914,13 +898,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 39f\n"
"ld1 { v28.b }[0], [x23]\n"
"39:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v21.8b\n"
- "ldr x22, [x14, #0x60]\n"
- "smlal v23.4s, v28.4h, v5.4h\n"
- "add x22, x22, x17\n"
- "smlal2 v9.4s, v28.8h, v5.8h\n"
- "smlal v24.4s, v28.4h, v2.4h\n"
- "smlal2 v10.4s, v28.8h, v2.8h\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "ldr x22, [x12, #0x60]\n"
+ "smlal v19.4s, v28.4h, v5.4h\n"
+ "smlal2 v11.4s, v28.8h, v5.8h\n"
+ "smlal v9.4s, v28.4h, v2.4h\n"
+ "smlal2 v23.4s, v28.8h, v2.8h\n"
+ "add x22, x22, x15\n"
"tbz x8, #2, 41f\n"
"ld1 { v31.s }[0], [x22], #0x4\n"
"tbz x8, #1, 40f\n"
@@ -942,13 +926,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 43f\n"
"ld1 { v31.b }[0], [x22]\n"
"43:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v31.8h, v31.8b, v21.8b\n"
- "ldr x21, [x14, #0x68]\n"
- "smlal v11.4s, v31.4h, v6.4h\n"
- "add x21, x21, x17\n"
+ "usubl v31.8h, v31.8b, v22.8b\n"
+ "ldr x21, [x12, #0x68]\n"
+ "smlal v13.4s, v31.4h, v6.4h\n"
"smlal2 v26.4s, v31.8h, v6.8h\n"
- "smlal v12.4s, v31.4h, v3.4h\n"
- "smlal2 v22.4s, v31.8h, v3.8h\n"
+ "smlal v18.4s, v31.4h, v3.4h\n"
+ "smlal2 v24.4s, v31.8h, v3.8h\n"
+ "add x21, x21, x15\n"
"tbz x8, #2, 45f\n"
"ld1 { v30.s }[0], [x21], #0x4\n"
"tbz x8, #1, 44f\n"
@@ -970,13 +954,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 47f\n"
"ld1 { v30.b }[0], [x21]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v30.8h, v30.8b, v21.8b\n"
- "ldr x20, [x14, #0x70]\n"
- "smlal v23.4s, v30.4h, v8.4h\n"
- "add x20, x20, x17\n"
- "smlal2 v9.4s, v30.8h, v8.8h\n"
- "smlal v24.4s, v30.4h, v5.4h\n"
- "smlal2 v10.4s, v30.8h, v5.8h\n"
+ "usubl v30.8h, v30.8b, v22.8b\n"
+ "ldr x20, [x12, #0x70]\n"
+ "smlal v19.4s, v30.4h, v8.4h\n"
+ "smlal2 v11.4s, v30.8h, v8.8h\n"
+ "smlal v9.4s, v30.4h, v5.4h\n"
+ "smlal2 v23.4s, v30.8h, v5.8h\n"
+ "add x20, x20, x15\n"
"tbz x8, #2, 49f\n"
"ld1 { v29.s }[0], [x20], #0x4\n"
"tbz x8, #1, 48f\n"
@@ -998,13 +982,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 51f\n"
"ld1 { v29.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v29.8h, v29.8b, v21.8b\n"
- "ldr x19, [x14, #0x78]\n"
- "smlal v12.4s, v29.4h, v7.4h\n"
- "add x19, x19, x17\n"
- "smlal2 v22.4s, v29.8h, v7.8h\n"
- "smlal v24.4s, v29.4h, v6.4h\n"
- "smlal2 v10.4s, v29.8h, v6.8h\n"
+ "usubl v29.8h, v29.8b, v22.8b\n"
+ "ldr x19, [x12, #0x78]\n"
+ "smlal v18.4s, v29.4h, v7.4h\n"
+ "smlal2 v24.4s, v29.8h, v7.8h\n"
+ "smlal v9.4s, v29.4h, v6.4h\n"
+ "smlal2 v23.4s, v29.8h, v6.8h\n"
+ "add x19, x19, x15\n"
"tbz x8, #2, 53f\n"
"ld1 { v28.s }[0], [x19], #0x4\n"
"tbz x8, #1, 52f\n"
@@ -1026,160 +1010,150 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
"tbz x8, #0, 55f\n"
"ld1 { v28.b }[0], [x19]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v28.8h, v28.8b, v21.8b\n"
- "smlal v12.4s, v28.4h, v8.4h\n"
- "smlal2 v22.4s, v28.8h, v8.8h\n"
- "smlal v24.4s, v28.4h, v7.4h\n"
- "smlal2 v10.4s, v28.8h, v7.8h\n"
+ "usubl v28.8h, v28.8b, v22.8b\n"
+ "smlal v18.4s, v28.4h, v8.4h\n"
+ "smlal2 v24.4s, v28.8h, v8.8h\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "smlal2 v23.4s, v28.8h, v7.8h\n"
"tbz x8, #2, 57f\n"
- "ld1 { v25.4s }, [x13], #0x10\n"
- "ld1 { v18.4s }, [x11], #0x10\n"
+ "ld1 { v21.4s }, [x13], #0x10\n"
+ "ld1 { v25.4s }, [x11], #0x10\n"
"tbz x8, #1, 56f\n"
- "ld1 { v16.d }[0], [x13], #0x8\n"
- "ld1 { v20.d }[0], [x11], #0x8\n"
+ "ld1 { v10.d }[0], [x13], #0x8\n"
+ "ld1 { v16.d }[0], [x11], #0x8\n"
"tbz x8, #0, 59f\n"
- "ld1 { v16.s }[2], [x13]\n"
- "ld1 { v20.s }[2], [x11]\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v16.s }[2], [x11]\n"
"b 59f\n"
"56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
"tbz x8, #0, 59f\n"
- "ld1 { v16.s }[0], [x13]\n"
- "ld1 { v20.s }[0], [x11]\n"
+ "ld1 { v10.s }[0], [x13]\n"
+ "ld1 { v16.s }[0], [x11]\n"
"b 59f\n"
"57:" // Oddments: Load requant params: Bit 2: Unset
"tbz x8, #1, 58f\n"
- "ld1 { v25.d }[0], [x13], #0x8\n"
- "ld1 { v18.d }[0], [x11], #0x8\n"
+ "ld1 { v21.d }[0], [x13], #0x8\n"
+ "ld1 { v25.d }[0], [x11], #0x8\n"
"tbz x8, #0, 59f\n"
- "ld1 { v25.s }[2], [x13]\n"
- "ld1 { v18.s }[2], [x11]\n"
+ "ld1 { v21.s }[2], [x13]\n"
+ "ld1 { v25.s }[2], [x11]\n"
"b 59f\n"
"58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
"tbz x8, #0, 59f\n"
- "ld1 { v25.s }[0], [x13]\n"
- "ld1 { v18.s }[0], [x11]\n"
+ "ld1 { v21.s }[0], [x13]\n"
+ "ld1 { v25.s }[0], [x11]\n"
"59:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v11.4s, v11.4s, v25.4s\n"
- "add x10, x10, x15\n"
- "sqrdmulh v26.4s, v26.4s, v16.4s\n"
- "add x9, x9, x15\n"
- "sqrdmulh v23.4s, v23.4s, v25.4s\n"
- "add x28, x28, x15\n"
- "sqrdmulh v9.4s, v9.4s, v16.4s\n"
- "add x27, x27, x15\n"
- "sqrdmulh v12.4s, v12.4s, v25.4s\n"
- "and v19.16b, v11.16b, v18.16b\n"
- "and v5.16b, v26.16b, v20.16b\n"
- "and v28.16b, v23.16b, v18.16b\n"
- "sshr v19.4s, v19.4s, #0x1f\n"
- "sshr v5.4s, v5.4s, #0x1f\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
- "sqadd v11.4s, v11.4s, v19.4s\n"
- "sqadd v26.4s, v26.4s, v5.4s\n"
- "sqadd v23.4s, v23.4s, v28.4s\n"
- "and v8.16b, v9.16b, v20.16b\n"
- "srshl v11.4s, v11.4s, v18.4s\n"
- "srshl v26.4s, v26.4s, v20.4s\n"
- "srshl v23.4s, v23.4s, v18.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "add v11.4s, v11.4s, v13.4s\n"
- "add v26.4s, v26.4s, v13.4s\n"
- "add v23.4s, v23.4s, v13.4s\n"
- "smin v11.4s, v11.4s, v14.4s\n"
- "smin v26.4s, v26.4s, v14.4s\n"
- "smin v23.4s, v23.4s, v14.4s\n"
- "smax v11.4s, v11.4s, v15.4s\n"
- "smax v26.4s, v26.4s, v15.4s\n"
- "smax v23.4s, v23.4s, v15.4s\n"
- "sqadd v9.4s, v9.4s, v8.4s\n"
- "uzp1 v11.16b, v11.16b, v26.16b\n"
- "and v1.16b, v12.16b, v18.16b\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "srshl v9.4s, v9.4s, v20.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "sqrdmulh v22.4s, v22.4s, v16.4s\n"
- "sqrdmulh v24.4s, v24.4s, v25.4s\n"
- "add v9.4s, v9.4s, v13.4s\n"
- "sqadd v12.4s, v12.4s, v1.4s\n"
- "and v0.16b, v22.16b, v20.16b\n"
- "smin v9.4s, v9.4s, v14.4s\n"
- "and v26.16b, v24.16b, v18.16b\n"
- "srshl v12.4s, v12.4s, v18.4s\n"
- "smax v9.4s, v9.4s, v15.4s\n"
- "sshr v0.4s, v0.4s, #0x1f\n"
- "sshr v26.4s, v26.4s, #0x1f\n"
- "uzp1 v23.16b, v23.16b, v9.16b\n"
- "add v12.4s, v12.4s, v13.4s\n"
- "uzp1 v23.16b, v23.16b, v23.16b\n"
- "sqadd v22.4s, v22.4s, v0.4s\n"
- "smin v12.4s, v12.4s, v14.4s\n"
- "sqadd v24.4s, v24.4s, v26.4s\n"
- "sqrdmulh v10.4s, v10.4s, v16.4s\n"
- "smax v12.4s, v12.4s, v15.4s\n"
- "srshl v22.4s, v22.4s, v20.4s\n"
- "srshl v24.4s, v24.4s, v18.4s\n"
- "and v16.16b, v10.16b, v20.16b\n"
- "add v22.4s, v22.4s, v13.4s\n"
- "add v24.4s, v24.4s, v13.4s\n"
- "sshr v16.4s, v16.4s, #0x1f\n"
- "smin v22.4s, v22.4s, v14.4s\n"
- "smin v24.4s, v24.4s, v14.4s\n"
- "sqadd v10.4s, v10.4s, v16.4s\n"
- "smax v22.4s, v22.4s, v15.4s\n"
- "smax v24.4s, v24.4s, v15.4s\n"
- "srshl v10.4s, v10.4s, v20.4s\n"
- "uzp1 v12.16b, v12.16b, v22.16b\n"
- "uzp1 v12.16b, v12.16b, v12.16b\n"
- "add v10.4s, v10.4s, v13.4s\n"
- "smin v10.4s, v10.4s, v14.4s\n"
- "smax v10.4s, v10.4s, v15.4s\n"
- "uzp1 v24.16b, v24.16b, v10.16b\n"
- "uzp1 v24.16b, v24.16b, v24.16b\n"
+ "sqdmulh v13.4s, v13.4s, v21.4s\n"
+ "sqdmulh v19.4s, v19.4s, v21.4s\n"
+ "add x10, x10, x14\n"
+ "add x9, x9, x14\n"
+ "sqdmulh v18.4s, v18.4s, v21.4s\n"
+ "sqdmulh v9.4s, v9.4s, v21.4s\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "and v7.16b, v13.16b, v25.16b\n"
+ "sqdmulh v26.4s, v26.4s, v10.4s\n"
+ "and v4.16b, v19.16b, v25.16b\n"
+ "sqdmulh v11.4s, v11.4s, v10.4s\n"
+ "and v21.16b, v18.16b, v25.16b\n"
+ "sqdmulh v24.4s, v24.4s, v10.4s\n"
+ "and v20.16b, v9.16b, v25.16b\n"
+ "sqdmulh v23.4s, v23.4s, v10.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v29.16b, v26.16b, v16.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v10.16b, v11.16b, v16.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v31.16b, v24.16b, v16.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v30.16b, v23.16b, v16.16b\n"
+ "sqadd v13.4s, v13.4s, v7.4s\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v4.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v21.4s\n"
+ "sshr v31.4s, v31.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v20.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "srshl v13.4s, v13.4s, v25.4s\n"
+ "sqadd v26.4s, v26.4s, v29.4s\n"
+ "srshl v19.4s, v19.4s, v25.4s\n"
+ "sqadd v11.4s, v11.4s, v10.4s\n"
+ "srshl v18.4s, v18.4s, v25.4s\n"
+ "sqadd v24.4s, v24.4s, v31.4s\n"
+ "srshl v9.4s, v9.4s, v25.4s\n"
+ "sqadd v23.4s, v23.4s, v30.4s\n"
+ "srshl v26.4s, v26.4s, v16.4s\n"
+ "sqxtn v13.4h, v13.4s\n"
+ "srshl v11.4s, v11.4s, v16.4s\n"
+ "sqxtn v19.4h, v19.4s\n"
+ "srshl v24.4s, v24.4s, v16.4s\n"
+ "sqxtn v18.4h, v18.4s\n"
+ "srshl v23.4s, v23.4s, v16.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "sqxtn2 v13.8h, v26.4s\n"
+ "sqxtn2 v19.8h, v11.4s\n"
+ "sqxtn2 v18.8h, v24.4s\n"
+ "sqxtn2 v9.8h, v23.4s\n"
+ "sqadd v13.8h, v13.8h, v14.8h\n"
+ "sqadd v19.8h, v19.8h, v14.8h\n"
+ "sqadd v18.8h, v18.8h, v14.8h\n"
+ "sqadd v9.8h, v9.8h, v14.8h\n"
+ "smax v13.8h, v13.8h, v17.8h\n"
+ "smax v19.8h, v19.8h, v17.8h\n"
+ "smax v18.8h, v18.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smin v13.8h, v13.8h, v15.8h\n"
+ "smin v19.8h, v19.8h, v15.8h\n"
+ "smin v18.8h, v18.8h, v15.8h\n"
+ "smin v9.8h, v9.8h, v15.8h\n"
+ "uzp1 v13.16b, v13.16b, v13.16b\n"
+ "uzp1 v19.16b, v19.16b, v19.16b\n"
+ "uzp1 v18.16b, v18.16b, v18.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
"tbz x8, #2, 61f\n"
- "st1 { v11.s }[0], [x10], #0x4\n"
- "st1 { v23.s }[0], [x9], #0x4\n"
- "st1 { v12.s }[0], [x28], #0x4\n"
- "st1 { v24.s }[0], [x27], #0x4\n"
+ "st1 { v13.s }[0], [x10], #0x4\n"
+ "st1 { v19.s }[0], [x9], #0x4\n"
+ "st1 { v18.s }[0], [x28], #0x4\n"
+ "st1 { v9.s }[0], [x27], #0x4\n"
"tbz x8, #1, 60f\n"
- "st1 { v11.h }[2], [x10], #0x2\n"
- "st1 { v23.h }[2], [x9], #0x2\n"
- "st1 { v12.h }[2], [x28], #0x2\n"
- "st1 { v24.h }[2], [x27], #0x2\n"
+ "st1 { v13.h }[2], [x10], #0x2\n"
+ "st1 { v19.h }[2], [x9], #0x2\n"
+ "st1 { v18.h }[2], [x28], #0x2\n"
+ "st1 { v9.h }[2], [x27], #0x2\n"
"tbz x8, #0, 63f\n"
- "st1 { v11.b }[6], [x10], #0x1\n"
- "st1 { v23.b }[6], [x9], #0x1\n"
- "st1 { v12.b }[6], [x28], #0x1\n"
- "st1 { v24.b }[6], [x27], #0x1\n"
+ "st1 { v13.b }[6], [x10], #0x1\n"
+ "st1 { v19.b }[6], [x9], #0x1\n"
+ "st1 { v18.b }[6], [x28], #0x1\n"
+ "st1 { v9.b }[6], [x27], #0x1\n"
"b 63f\n"
"60:" // Oddments: Bit 2: Bit 1: Unset
"tbz x8, #0, 63f\n"
- "st1 { v11.b }[4], [x10], #0x1\n"
- "st1 { v23.b }[4], [x9], #0x1\n"
- "st1 { v12.b }[4], [x28], #0x1\n"
- "st1 { v24.b }[4], [x27], #0x1\n"
+ "st1 { v13.b }[4], [x10], #0x1\n"
+ "st1 { v19.b }[4], [x9], #0x1\n"
+ "st1 { v18.b }[4], [x28], #0x1\n"
+ "st1 { v9.b }[4], [x27], #0x1\n"
"b 63f\n"
"61:" // Oddments: Bit 2: Unset
"tbz x8, #1, 62f\n"
- "st1 { v11.h }[0], [x10], #0x2\n"
- "st1 { v23.h }[0], [x9], #0x2\n"
- "st1 { v12.h }[0], [x28], #0x2\n"
- "st1 { v24.h }[0], [x27], #0x2\n"
+ "st1 { v13.h }[0], [x10], #0x2\n"
+ "st1 { v19.h }[0], [x9], #0x2\n"
+ "st1 { v18.h }[0], [x28], #0x2\n"
+ "st1 { v9.h }[0], [x27], #0x2\n"
"tbz x8, #0, 63f\n"
- "st1 { v11.b }[2], [x10], #0x1\n"
- "st1 { v23.b }[2], [x9], #0x1\n"
- "st1 { v12.b }[2], [x28], #0x1\n"
- "st1 { v24.b }[2], [x27], #0x1\n"
+ "st1 { v13.b }[2], [x10], #0x1\n"
+ "st1 { v19.b }[2], [x9], #0x1\n"
+ "st1 { v18.b }[2], [x28], #0x1\n"
+ "st1 { v9.b }[2], [x27], #0x1\n"
"b 63f\n"
"62:" // Oddments: Bit 2: Unset: Bit 1: Unset
"tbz x8, #0, 63f\n"
- "st1 { v11.b }[0], [x10], #0x1\n"
- "st1 { v23.b }[0], [x9], #0x1\n"
- "st1 { v12.b }[0], [x28], #0x1\n"
- "st1 { v24.b }[0], [x27], #0x1\n"
+ "st1 { v13.b }[0], [x10], #0x1\n"
+ "st1 { v19.b }[0], [x9], #0x1\n"
+ "st1 { v18.b }[0], [x28], #0x1\n"
+ "st1 { v9.b }[0], [x27], #0x1\n"
"63:" // Oddments: Bit 2: End
-
"64:" // End
-
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 77861e94f0..9a1b64ed7b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,37 +36,24 @@ namespace depthwise {
void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
-struct a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+class a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef int8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size;
+ a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
- kern_type kernel = a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
- a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
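
[Editor's note] The header change above is the same strategy refactor applied throughout this patch: the old plain structs, which carried their geometry as static constants and their types as typedefs, become classes deriving from the shared DepthwiseDepthfirstStrategy template, with the geometry handed to the parent constructor and the kernel and vector-length type exposed through virtual getters. Judging by the constants the diff removes, the six constructor arguments appear to map to (output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols). A minimal sketch of the pattern, under that assumption and with a hypothetical kernel symbol:

    // Illustrative only; mirrors the shape of the refactored strategies.
    class ExampleStrategy : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
    {
      using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
      public:
      // (output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols)
      ExampleStrategy(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
      arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
      Parent::KernelType kernel = example_impl;  // hypothetical implementation symbol
      Parent::KernelType get_kernel(void) const override { return kernel; }
    };

Moving this information behind one polymorphic interface lets a single depthfirst driver serve every kernel, consistent with the patch's stated aim of reducing duplication.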
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 4e1586b033..790d26bc6d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
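
[Editor's note] The weights member of Params widens here from const int8_t * to const void *. The inline assembly only ever consumes this field as an address loaded by offsetof(Params, weights), so the C++ element type never reaches the kernel; type-erasing it lets the same parameter block accept whatever packed layout the shared interleave routines produce. A hypothetical two-line illustration:

    struct Params { const void *weights; /* ... */ };  // asm sees only the address
    //   "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"  // plain pointer load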
@@ -100,611 +100,595 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
- "mov x4, #0x0\n"
- "ldr x5, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x6, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "add x7, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "lsr x17, x3, #0x3\n"
- "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x19, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v22.16b }, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v12.16b }, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1r { v14.4s }, [x19]\n"
- "add x19, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v16.4s }, [x20]\n"
- "ld1r { v15.4s }, [x19]\n"
- "ldp x15, x14, [x21, #0x0]\n"
- "ldp x13, x12, [x21, #0x10]\n"
- "cbz x17, 3f\n"
- "subs x17, x17, #0x1\n"
+ "ldr x19, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "add x24, x19, %[offsetof_Requantize32_a_offset]\n"
+ "add x23, x19, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x21, x19, %[offsetof_Requantize32_c_offset]\n"
+ "add x20, x19, %[offsetof_Requantize32_minval]\n"
+ "ldr x17, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x19, x19, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v12.16b }, [x24]\n"
+ "ld1r { v13.16b }, [x23]\n"
+ "lsr x16, x8, #0x3\n"
+ "ld1r { v11.8h }, [x21]\n"
+ "ld1r { v17.8h }, [x20]\n"
+ "mov x15, #0x0\n"
+ "mov x14, #0x0\n"
+ "ld1r { v14.8h }, [x19]\n"
+ "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x12, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x10, x9, [x22, #0x0]\n"
+ "ldp x28, x27, [x22, #0x10]\n"
+ "cbz x16, 3f\n"
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q13, [x19, #0x0]\n"
- "mov v19.16b, v13.16b\n"
+ "ldr q15, [x19, #0x0]\n"
+ "subs x16, x16, #0x1\n"
+ "mov v9.16b, v15.16b\n"
"ldr q10, [x19, #0x10]\n"
"add x19, x19, #0x20\n"
- "mov v11.16b, v13.16b\n"
"str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v18.16b, v13.16b\n"
- "ldr d0, [x5, #0x0]\n"
- "ldr d1, [x5, #0x8]\n"
- "mov v20.16b, v10.16b\n"
- "ldr d2, [x5, #0x10]\n"
- "mov v17.16b, v10.16b\n"
- "ldr d3, [x5, #0x18]\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v16.16b, v10.16b\n"
+ "mov v22.16b, v15.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
"mov v21.16b, v10.16b\n"
- "ldr d4, [x5, #0x20]\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
- "ldr d5, [x5, #0x28]\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ldr d6, [x5, #0x30]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ldr d7, [x5, #0x38]\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldr d8, [x5, #0x40]\n"
- "ssubl v4.8h, v4.8b, v12.8b\n"
- "ldp x26, x25, [x7, #0x0]\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
- "ldp x24, x23, [x7, #0x10]\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
- "ldp x22, x21, [x7, #0x20]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "ldp x20, x19, [x7, #0x30]\n"
- "ldr d31, [x26, x4]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "ldr d30, [x25, x4]\n"
- "ldr d29, [x24, x4]\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "ldr d28, [x23, x4]\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr d27, [x22, x4]\n"
- "ldr d26, [x21, x4]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "ldr d25, [x20, x4]\n"
- "ldr d24, [x19, x4]\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
- "usubl v26.8h, v26.8b, v22.8b\n"
- "usubl v25.8h, v25.8b, v22.8b\n"
- "usubl v24.8h, v24.8b, v22.8b\n"
+ "mov v23.16b, v15.16b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v18.16b, v10.16b\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ldp x26, x25, [x12, #0x0]\n"
+ "ldp x24, x23, [x12, #0x10]\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ldp x22, x21, [x12, #0x20]\n"
+ "ldp x20, x19, [x12, #0x30]\n"
+ "ssubl v5.8h, v5.8b, v13.8b\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ldr d31, [x26, x15]\n"
+ "ldr d30, [x25, x15]\n"
+ "ssubl v7.8h, v7.8b, v13.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "ldr d29, [x24, x15]\n"
+ "ldr d28, [x23, x15]\n"
+ "usubl v31.8h, v31.8b, v12.8b\n"
+ "usubl v30.8h, v30.8b, v12.8b\n"
+ "ldr d27, [x22, x15]\n"
+ "ldr d26, [x21, x15]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "ldr d25, [x20, x15]\n"
+ "ldr d24, [x19, x15]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
"beq 2f\n"
"1:" // Loop
- "smlal v13.4s, v31.4h, v8.4h\n"
- "ldr x22, [x7, #0x40]\n"
- "add x5, x5, #0x48\n"
+ "smlal v15.4s, v31.4h, v8.4h\n"
"smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x21, [x7, #0x48]\n"
- "subs x17, x17, #0x1\n"
- "smlal v19.4s, v31.4h, v6.4h\n"
- "ldr x20, [x7, #0x50]\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x19, [x7, #0x58]\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "ldr x11, [x7, #0x60]\n"
- "smlal2 v17.4s, v31.8h, v2.8h\n"
- "ldr x10, [x7, #0x68]\n"
- "smlal v18.4s, v31.4h, v0.4h\n"
- "ldr x9, [x7, #0x70]\n"
- "smlal2 v21.4s, v31.8h, v0.8h\n"
- "ldr x28, [x7, #0x78]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "ldr x27, [x7, #0x80]\n"
+ "ldr x24, [x12, #0x40]\n"
+ "ldr x23, [x12, #0x48]\n"
+ "smlal v9.4s, v31.4h, v6.4h\n"
+ "smlal2 v16.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x12, #0x50]\n"
+ "ldr x19, [x12, #0x58]\n"
+ "smlal v15.4s, v30.4h, v0.4h\n"
"smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x26, [x7, #0x88]\n"
- "smlal v19.4s, v28.4h, v1.4h\n"
- "ldr x25, [x7, #0x90]\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x21, x4]\n"
- "smlal v13.4s, v29.4h, v1.4h\n"
- "ldr x24, [x7, #0x98]\n"
+ "ldr x22, [x12, #0x78]\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x23, x15]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v15.4s, v29.4h, v1.4h\n"
"smlal2 v10.4s, v29.8h, v1.8h\n"
- "ldr d29, [x22, x4]\n"
- "smlal v19.4s, v27.4h, v2.4h\n"
- "ldr x23, [x7, #0xa0]\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x4]\n"
- "smlal v13.4s, v26.4h, v3.4h\n"
- "ldr x22, [x7, #0xa8]\n"
+ "ldr d29, [x24, x15]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v9.4s, v27.4h, v2.4h\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x15]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v15.4s, v26.4h, v3.4h\n"
"smlal2 v10.4s, v26.8h, v3.8h\n"
- "ldr d26, [x19, x4]\n"
- "smlal v19.4s, v24.4h, v0.4h\n"
- "ldr x21, [x7, #0xb0]\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "ldr x20, [x7, #0xb8]\n"
- "smlal v13.4s, v25.4h, v4.4h\n"
- "ldr x19, [x7, #0xc0]\n"
+ "ldr d26, [x19, x15]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v9.4s, v24.4h, v0.4h\n"
+ "smlal2 v16.4s, v24.8h, v0.8h\n"
+ "ldr x21, [x12, #0x80]\n"
+ "ldr x19, [x12, #0x68]\n"
+ "smlal v15.4s, v25.4h, v4.4h\n"
"smlal2 v10.4s, v25.8h, v4.8h\n"
- "ldr d25, [x11, x4]\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr q31, [x8, #0x0]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "ldr q30, [x16, #0x0]\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "ldr q23, [x8, #0x10]\n"
- "add x8, x8, #0x20\n"
+ "ldr d25, [x20, x15]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
+ "smlal2 v16.4s, v29.8h, v4.8h\n"
+ "ldr x20, [x12, #0x88]\n"
+ "ldr d29, [x19, x15]\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
"smlal2 v10.4s, v24.8h, v2.8h\n"
- "ldr d24, [x9, x4]\n"
- "smlal v19.4s, v29.4h, v4.4h\n"
- "ldr q9, [x16, #0x10]\n"
- "add x16, x16, #0x20\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x10, x4]\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
- "usubl v26.8h, v26.8b, v22.8b\n"
- "smlal v19.4s, v28.4h, v5.4h\n"
- "smlal v13.4s, v27.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x27, x4]\n"
+ "ldr x19, [x12, #0x70]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v16.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x15]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v22.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ldr x24, [x12, #0x98]\n"
+ "ldr d24, [x19, x15]\n"
+ "smlal v15.4s, v27.4h, v5.4h\n"
"smlal2 v10.4s, v27.8h, v5.8h\n"
- "smlal v19.4s, v27.4h, v3.4h\n"
- "smlal v11.4s, v26.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "ldr d27, [x28, x4]\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x26, x4]\n"
- "usubl v25.8h, v25.8b, v22.8b\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "usubl v24.8h, v24.8b, v22.8b\n"
- "smlal v13.4s, v25.4h, v6.4h\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x23, [x12, #0x90]\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x22, x15]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal v22.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x12, #0xa8]\n"
+ "ldr x19, [x12, #0xa0]\n"
+ "smlal2 v21.4s, v26.8h, v3.8h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr d26, [x20, x15]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v23.4s, v27.4h, v4.4h\n"
+ "smlal v22.4s, v25.4h, v0.4h\n"
+ "ldr x21, [x12, #0xb0]\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x19, x15]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal v15.4s, v25.4h, v6.4h\n"
+ "ldr x19, [x12, #0xc0]\n"
+ "ldr q19, [x13, #0x0]\n"
"smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
- "smlal2 v17.4s, v25.8h, v0.8h\n"
- "ldr d25, [x25, x4]\n"
- "smlal v13.4s, v24.4h, v7.4h\n"
+ "smlal v22.4s, v29.4h, v4.4h\n"
+ "ldr d25, [x23, x15]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal2 v21.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v23.4s, v26.4h, v5.4h\n"
+ "smlal v15.4s, v24.4h, v7.4h\n"
+ "ldr q0, [x11, #0x0]\n"
+ "ldr q4, [x13, #0x10]\n"
"smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v11.4s, v29.4h, v4.4h\n"
- "smlal2 v17.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x4]\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "smlal2 v17.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x4]\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "ldr d27, [x23, x4]\n"
- "smlal v19.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "smlal v18.4s, v28.4h, v1.4h\n"
- "smlal2 v21.4s, v28.8h, v1.8h\n"
- "usubl v26.8h, v26.8b, v22.8b\n"
- "usubl v25.8h, v25.8b, v22.8b\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v26.4h, v5.4h\n"
- "smlal2 v21.4s, v26.8h, v5.8h\n"
- "ldr d26, [x21, x4]\n"
- "smlal v11.4s, v25.4h, v6.4h\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x4]\n"
- "smlal v19.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "smlal v18.4s, v29.4h, v2.4h\n"
- "smlal2 v21.4s, v29.8h, v2.8h\n"
- "ldr d29, [x19, x4]\n"
- "add x4, x4, #0x8\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
- "usubl v24.8h, v24.8b, v22.8b\n"
- "usubl v26.8h, v26.8b, v22.8b\n"
- "usubl v25.8h, v25.8b, v22.8b\n"
- "smlal v11.4s, v27.4h, v7.4h\n"
- "smlal2 v17.4s, v27.8h, v7.8h\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "smlal v11.4s, v24.4h, v5.4h\n"
- "smlal2 v17.4s, v24.8h, v5.8h\n"
- "smlal v18.4s, v26.4h, v7.4h\n"
- "smlal2 v21.4s, v26.8h, v7.8h\n"
- "smlal v11.4s, v25.4h, v8.4h\n"
- "smlal2 v17.4s, v25.8h, v8.8h\n"
- "smlal v18.4s, v25.4h, v6.4h\n"
+ "smlal v22.4s, v24.4h, v1.4h\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "ldr q31, [x11, #0x10]\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x15]\n"
+ "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "ldr d26, [x21, x15]\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v22.4s, v25.4h, v6.4h\n"
+ "smlal v23.4s, v24.4h, v3.4h\n"
+ "and v30.16b, v15.16b, v0.16b\n"
+ "add x17, x17, #0x48\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "smlal2 v16.4s, v28.8h, v7.8h\n"
+ "sqdmulh v10.4s, v10.4s, v4.4s\n"
+ "subs x16, x16, #0x1\n"
"smlal2 v21.4s, v25.8h, v6.8h\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "sqrdmulh v13.4s, v13.4s, v31.4s\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "smlal v18.4s, v29.4h, v8.4h\n"
- "smlal2 v21.4s, v29.8h, v8.8h\n"
- "and v27.16b, v13.16b, v30.16b\n"
- "and v7.16b, v10.16b, v9.16b\n"
- "sqrdmulh v19.4s, v19.4s, v31.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sqadd v13.4s, v13.4s, v27.4s\n"
- "sqadd v10.4s, v10.4s, v7.4s\n"
- "and v6.16b, v19.16b, v30.16b\n"
- "and v3.16b, v20.16b, v9.16b\n"
- "srshl v13.4s, v13.4s, v30.4s\n"
- "srshl v10.4s, v10.4s, v9.4s\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "add v13.4s, v13.4s, v14.4s\n"
- "add v10.4s, v10.4s, v14.4s\n"
- "sqadd v19.4s, v19.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v15.4s\n"
- "smin v10.4s, v10.4s, v15.4s\n"
- "sqadd v20.4s, v20.4s, v3.4s\n"
- "smax v13.4s, v13.4s, v16.4s\n"
- "smax v10.4s, v10.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v30.4s\n"
- "srshl v20.4s, v20.4s, v9.4s\n"
- "uzp1 v13.16b, v13.16b, v10.16b\n"
- "sqrdmulh v11.4s, v11.4s, v31.4s\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x15, x6]\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "and v28.16b, v11.16b, v30.16b\n"
- "sqrdmulh v17.4s, v17.4s, v23.4s\n"
- "smin v19.4s, v19.4s, v15.4s\n"
- "smin v20.4s, v20.4s, v15.4s\n"
+ "ldr d25, [x20, x15]\n"
+ "smlal2 v18.4s, v24.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v22.4s, v27.4h, v7.4h\n"
+ "smlal v23.4s, v26.4h, v7.4h\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "add x13, x13, #0x20\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v16.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x19, x15]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal2 v21.4s, v27.8h, v7.8h\n"
+ "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "add x15, x15, #0x8\n"
+ "smlal v22.4s, v24.4h, v5.4h\n"
+ "smlal v23.4s, v25.4h, v6.4h\n"
+ "and v28.16b, v9.16b, v0.16b\n"
+ "add x11, x11, #0x20\n"
+ "smlal2 v21.4s, v24.8h, v5.8h\n"
+ "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "sqdmulh v16.4s, v16.4s, v4.4s\n"
+ "smlal v22.4s, v25.4h, v8.4h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "sqdmulh v22.4s, v22.4s, v19.4s\n"
+ "smlal2 v21.4s, v25.8h, v8.8h\n"
+ "smlal2 v18.4s, v29.8h, v8.8h\n"
+ "sqdmulh v23.4s, v23.4s, v19.4s\n"
+ "and v29.16b, v22.16b, v0.16b\n"
+ "sqdmulh v21.4s, v21.4s, v4.4s\n"
+ "and v20.16b, v23.16b, v0.16b\n"
+ "sqdmulh v18.4s, v18.4s, v4.4s\n"
+ "and v19.16b, v10.16b, v31.16b\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "smax v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "sqadd v11.4s, v11.4s, v28.4s\n"
- "and v26.16b, v17.16b, v9.16b\n"
- "uzp1 v19.16b, v19.16b, v20.16b\n"
- "sqrdmulh v18.4s, v18.4s, v31.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d19, [x14, x6]\n"
- "srshl v11.4s, v11.4s, v30.4s\n"
+ "and v4.16b, v16.16b, v31.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v5.16b, v21.16b, v31.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v26.16b, v18.16b, v31.16b\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v28.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v20.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "and v8.16b, v18.16b, v30.16b\n"
- "sqrdmulh v21.4s, v21.4s, v23.4s\n"
- "sqadd v17.4s, v17.4s, v26.4s\n"
- "add v11.4s, v11.4s, v14.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "and v27.16b, v21.16b, v9.16b\n"
- "smin v11.4s, v11.4s, v15.4s\n"
- "srshl v17.4s, v17.4s, v9.4s\n"
- "sqadd v18.4s, v18.4s, v8.4s\n"
- "smax v11.4s, v11.4s, v16.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "srshl v18.4s, v18.4s, v30.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "smin v17.4s, v17.4s, v15.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "smax v17.4s, v17.4s, v16.4s\n"
- "srshl v21.4s, v21.4s, v9.4s\n"
- "smin v18.4s, v18.4s, v15.4s\n"
- "uzp1 v11.16b, v11.16b, v17.16b\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "str d11, [x13, x6]\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v15.4s\n"
- "smax v21.4s, v21.4s, v16.4s\n"
- "uzp1 v18.16b, v18.16b, v21.16b\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
- "str d18, [x12, x6]\n"
- "add x6, x6, #0x8\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v0.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "sqadd v18.4s, v18.4s, v26.4s\n"
+ "srshl v10.4s, v10.4s, v31.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v22.4h, v22.4s\n"
+ "srshl v18.4s, v18.4s, v31.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v15.8h, v10.4s\n"
+ "sqxtn2 v9.8h, v16.4s\n"
+ "sqxtn2 v22.8h, v21.4s\n"
+ "sqxtn2 v23.8h, v18.4s\n"
+ "sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v9.8h, v9.8h, v11.8h\n"
+ "sqadd v22.8h, v22.8h, v11.8h\n"
+ "sqadd v23.8h, v23.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smax v22.8h, v22.8h, v17.8h\n"
+ "smax v23.8h, v23.8h, v17.8h\n"
+ "smin v15.8h, v15.8h, v14.8h\n"
+ "smin v9.8h, v9.8h, v14.8h\n"
+ "smin v22.8h, v22.8h, v14.8h\n"
+ "smin v23.8h, v23.8h, v14.8h\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x10, x14]\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str d9, [x9, x14]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d22, [x28, x14]\n"
+ "str d23, [x27, x14]\n"
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q13, [x19, #0x0]\n"
- "mov v19.16b, v13.16b\n"
+ "ldr q15, [x19, #0x0]\n"
+ "add x14, x14, #0x8\n"
"ldr q10, [x19, #0x10]\n"
"add x19, x19, #0x20\n"
- "mov v11.16b, v13.16b\n"
"str x19, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v18.16b, v13.16b\n"
- "ldr d0, [x5, #0x0]\n"
- "ldr d1, [x5, #0x8]\n"
- "mov v20.16b, v10.16b\n"
- "ldr d2, [x5, #0x10]\n"
- "mov v17.16b, v10.16b\n"
- "ldr d3, [x5, #0x18]\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "ldr d2, [x17, #0x10]\n"
+ "mov v9.16b, v15.16b\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr d3, [x17, #0x18]\n"
+ "ldr d4, [x17, #0x20]\n"
+ "mov v22.16b, v15.16b\n"
"mov v21.16b, v10.16b\n"
- "ldr d4, [x5, #0x20]\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
- "ldr d5, [x5, #0x28]\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ldr d6, [x5, #0x30]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ldr d7, [x5, #0x38]\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldr d8, [x5, #0x40]\n"
- "ssubl v4.8h, v4.8b, v12.8b\n"
- "ldp x26, x25, [x7, #0x0]\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
- "ldp x24, x23, [x7, #0x10]\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
- "ldp x22, x21, [x7, #0x20]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "ldp x20, x19, [x7, #0x30]\n"
- "ldr d31, [x26, x4]\n"
- "usubl v31.8h, v31.8b, v22.8b\n"
- "ldr d30, [x25, x4]\n"
- "ldr d29, [x24, x4]\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "ldr d28, [x23, x4]\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr d27, [x22, x4]\n"
- "ldr d26, [x21, x4]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "ldr d25, [x20, x4]\n"
- "ldr d24, [x19, x4]\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
- "usubl v26.8h, v26.8b, v22.8b\n"
- "usubl v25.8h, v25.8b, v22.8b\n"
- "usubl v24.8h, v24.8b, v22.8b\n"
+ "ldr d5, [x17, #0x28]\n"
+ "ldr d6, [x17, #0x30]\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldp x26, x25, [x12, #0x0]\n"
+ "ldp x24, x23, [x12, #0x10]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ldp x22, x21, [x12, #0x20]\n"
+ "ldp x20, x19, [x12, #0x30]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v13.8b\n"
+ "ldr d31, [x26, x15]\n"
+ "ldr d30, [x25, x15]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v13.8b\n"
+ "ldr d29, [x24, x15]\n"
+ "ldr d28, [x23, x15]\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "usubl v31.8h, v31.8b, v12.8b\n"
+ "ldr d27, [x22, x15]\n"
+ "ldr d26, [x21, x15]\n"
+ "usubl v30.8h, v30.8b, v12.8b\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "ldr d25, [x20, x15]\n"
+ "ldr d24, [x19, x15]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
"bgt 1b\n"
"2:" // Tail
- "smlal v13.4s, v31.4h, v8.4h\n"
- "ldr x22, [x7, #0x40]\n"
- "tst x3, #0x7\n"
+ "smlal v15.4s, v31.4h, v8.4h\n"
"smlal2 v10.4s, v31.8h, v8.8h\n"
- "ldr x21, [x7, #0x48]\n"
- "smlal v19.4s, v31.4h, v6.4h\n"
- "ldr x20, [x7, #0x50]\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "ldr x19, [x7, #0x58]\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "ldr x11, [x7, #0x60]\n"
- "smlal2 v17.4s, v31.8h, v2.8h\n"
- "ldr x10, [x7, #0x68]\n"
- "smlal v18.4s, v31.4h, v0.4h\n"
- "ldr x9, [x7, #0x70]\n"
- "smlal2 v21.4s, v31.8h, v0.8h\n"
- "ldr x28, [x7, #0x78]\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
- "ldr x27, [x7, #0x80]\n"
+ "ldr x24, [x12, #0x40]\n"
+ "ldr x23, [x12, #0x48]\n"
+ "smlal v9.4s, v31.4h, v6.4h\n"
+ "smlal2 v16.4s, v31.8h, v6.8h\n"
+ "ldr x21, [x12, #0x50]\n"
+ "ldr x19, [x12, #0x58]\n"
+ "smlal v15.4s, v30.4h, v0.4h\n"
"smlal2 v10.4s, v30.8h, v0.8h\n"
- "ldr x26, [x7, #0x88]\n"
- "smlal v19.4s, v28.4h, v1.4h\n"
- "ldr x25, [x7, #0x90]\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "ldr d28, [x21, x4]\n"
- "smlal v13.4s, v29.4h, v1.4h\n"
- "ldr x24, [x7, #0x98]\n"
+ "ldr x22, [x12, #0x78]\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "ldr d28, [x23, x15]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v15.4s, v29.4h, v1.4h\n"
"smlal2 v10.4s, v29.8h, v1.8h\n"
- "ldr d29, [x22, x4]\n"
- "smlal v19.4s, v27.4h, v2.4h\n"
- "ldr x23, [x7, #0xa0]\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "ldr d27, [x20, x4]\n"
- "smlal v13.4s, v26.4h, v3.4h\n"
- "ldr x22, [x7, #0xa8]\n"
+ "ldr d29, [x24, x15]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v9.4s, v27.4h, v2.4h\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x15]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v15.4s, v26.4h, v3.4h\n"
"smlal2 v10.4s, v26.8h, v3.8h\n"
- "ldr d26, [x19, x4]\n"
- "smlal v19.4s, v24.4h, v0.4h\n"
- "ldr x21, [x7, #0xb0]\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "ldr x20, [x7, #0xb8]\n"
- "smlal v13.4s, v25.4h, v4.4h\n"
- "ldr x19, [x7, #0xc0]\n"
+ "ldr d26, [x19, x15]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v9.4s, v24.4h, v0.4h\n"
+ "smlal2 v16.4s, v24.8h, v0.8h\n"
+ "ldr x21, [x12, #0x80]\n"
+ "ldr x19, [x12, #0x68]\n"
+ "smlal v15.4s, v25.4h, v4.4h\n"
"smlal2 v10.4s, v25.8h, v4.8h\n"
- "ldr d25, [x11, x4]\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr q31, [x8, #0x0]\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "ldr q30, [x16, #0x0]\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
- "ldr q23, [x8, #0x10]\n"
- "add x8, x8, #0x20\n"
+ "ldr d25, [x20, x15]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
+ "smlal2 v16.4s, v29.8h, v4.8h\n"
+ "ldr x20, [x12, #0x88]\n"
+ "ldr d29, [x19, x15]\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
"smlal2 v10.4s, v24.8h, v2.8h\n"
- "ldr d24, [x9, x4]\n"
- "smlal v19.4s, v29.4h, v4.4h\n"
- "ldr q9, [x16, #0x10]\n"
- "add x16, x16, #0x20\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "ldr d29, [x10, x4]\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
- "usubl v26.8h, v26.8b, v22.8b\n"
- "smlal v19.4s, v28.4h, v5.4h\n"
- "smlal v13.4s, v27.4h, v5.4h\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "ldr d28, [x27, x4]\n"
+ "ldr x19, [x12, #0x70]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v16.4s, v28.8h, v5.8h\n"
+ "ldr d28, [x21, x15]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "smlal v22.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "ldr x24, [x12, #0x98]\n"
+ "ldr d24, [x19, x15]\n"
+ "smlal v15.4s, v27.4h, v5.4h\n"
"smlal2 v10.4s, v27.8h, v5.8h\n"
- "smlal v19.4s, v27.4h, v3.4h\n"
- "smlal v11.4s, v26.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "ldr d27, [x28, x4]\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "ldr d26, [x26, x4]\n"
- "usubl v25.8h, v25.8b, v22.8b\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "usubl v24.8h, v24.8b, v22.8b\n"
- "smlal v13.4s, v25.4h, v6.4h\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x23, [x12, #0x90]\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "ldr d27, [x22, x15]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal v22.4s, v26.4h, v3.4h\n"
+ "ldr x22, [x12, #0xa8]\n"
+ "ldr x19, [x12, #0xa0]\n"
+ "smlal2 v21.4s, v26.8h, v3.8h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "ldr d26, [x20, x15]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v23.4s, v27.4h, v4.4h\n"
+ "smlal v22.4s, v25.4h, v0.4h\n"
+ "ldr x21, [x12, #0xb0]\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x19, x15]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal v15.4s, v25.4h, v6.4h\n"
+ "ldr x19, [x12, #0xc0]\n"
+ "ldr q19, [x13, #0x0]\n"
"smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
- "smlal2 v17.4s, v25.8h, v0.8h\n"
- "ldr d25, [x25, x4]\n"
- "smlal v13.4s, v24.4h, v7.4h\n"
+ "smlal v22.4s, v29.4h, v4.4h\n"
+ "ldr d25, [x23, x15]\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal2 v21.4s, v29.8h, v4.8h\n"
+ "ldr d29, [x24, x15]\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v23.4s, v26.4h, v5.4h\n"
+ "smlal v15.4s, v24.4h, v7.4h\n"
+ "ldr q0, [x11, #0x0]\n"
+ "ldr q4, [x13, #0x10]\n"
"smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v11.4s, v29.4h, v4.4h\n"
- "smlal2 v17.4s, v29.8h, v4.8h\n"
- "ldr d29, [x24, x4]\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "smlal2 v17.4s, v24.8h, v1.8h\n"
- "ldr d24, [x22, x4]\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "ldr d27, [x23, x4]\n"
- "smlal v19.4s, v28.4h, v7.4h\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "smlal v18.4s, v28.4h, v1.4h\n"
- "smlal2 v21.4s, v28.8h, v1.8h\n"
- "usubl v26.8h, v26.8b, v22.8b\n"
- "usubl v25.8h, v25.8b, v22.8b\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v26.4h, v5.4h\n"
- "smlal2 v21.4s, v26.8h, v5.8h\n"
- "ldr d26, [x21, x4]\n"
- "smlal v11.4s, v25.4h, v6.4h\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "ldr d25, [x20, x4]\n"
- "smlal v19.4s, v29.4h, v8.4h\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "smlal v18.4s, v29.4h, v2.4h\n"
- "smlal2 v21.4s, v29.8h, v2.8h\n"
- "ldr d29, [x19, x4]\n"
- "add x4, x4, #0x8\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
- "usubl v24.8h, v24.8b, v22.8b\n"
- "usubl v26.8h, v26.8b, v22.8b\n"
- "usubl v25.8h, v25.8b, v22.8b\n"
- "smlal v11.4s, v27.4h, v7.4h\n"
- "smlal2 v17.4s, v27.8h, v7.8h\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "smlal v11.4s, v24.4h, v5.4h\n"
- "smlal2 v17.4s, v24.8h, v5.8h\n"
- "smlal v18.4s, v26.4h, v7.4h\n"
- "smlal2 v21.4s, v26.8h, v7.8h\n"
- "smlal v11.4s, v25.4h, v8.4h\n"
- "smlal2 v17.4s, v25.8h, v8.8h\n"
- "smlal v18.4s, v25.4h, v6.4h\n"
+ "smlal v22.4s, v24.4h, v1.4h\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "ldr q31, [x11, #0x10]\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x22, x15]\n"
+ "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "ldr d26, [x21, x15]\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v22.4s, v25.4h, v6.4h\n"
+ "smlal v23.4s, v24.4h, v3.4h\n"
+ "and v30.16b, v15.16b, v0.16b\n"
+ "tst x8, #0x7\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "smlal2 v16.4s, v28.8h, v7.8h\n"
+ "sqdmulh v10.4s, v10.4s, v4.4s\n"
+ "add x13, x13, #0x20\n"
"smlal2 v21.4s, v25.8h, v6.8h\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "sqrdmulh v13.4s, v13.4s, v31.4s\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "smlal v18.4s, v29.4h, v8.4h\n"
- "smlal2 v21.4s, v29.8h, v8.8h\n"
- "and v27.16b, v13.16b, v30.16b\n"
- "and v7.16b, v10.16b, v9.16b\n"
- "sqrdmulh v19.4s, v19.4s, v31.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "sqadd v13.4s, v13.4s, v27.4s\n"
- "sqadd v10.4s, v10.4s, v7.4s\n"
- "and v6.16b, v19.16b, v30.16b\n"
- "and v3.16b, v20.16b, v9.16b\n"
- "srshl v13.4s, v13.4s, v30.4s\n"
- "srshl v10.4s, v10.4s, v9.4s\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "add v13.4s, v13.4s, v14.4s\n"
- "add v10.4s, v10.4s, v14.4s\n"
- "sqadd v19.4s, v19.4s, v6.4s\n"
- "smin v13.4s, v13.4s, v15.4s\n"
- "smin v10.4s, v10.4s, v15.4s\n"
- "sqadd v20.4s, v20.4s, v3.4s\n"
- "smax v13.4s, v13.4s, v16.4s\n"
- "smax v10.4s, v10.4s, v16.4s\n"
- "srshl v19.4s, v19.4s, v30.4s\n"
- "srshl v20.4s, v20.4s, v9.4s\n"
- "uzp1 v13.16b, v13.16b, v10.16b\n"
- "sqrdmulh v11.4s, v11.4s, v31.4s\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "str d13, [x15, x6]\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "and v28.16b, v11.16b, v30.16b\n"
- "sqrdmulh v17.4s, v17.4s, v23.4s\n"
- "smin v19.4s, v19.4s, v15.4s\n"
- "smin v20.4s, v20.4s, v15.4s\n"
+ "ldr d25, [x20, x15]\n"
+ "smlal2 v18.4s, v24.8h, v3.8h\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v22.4s, v27.4h, v7.4h\n"
+ "smlal v23.4s, v26.4h, v7.4h\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "add x11, x11, #0x20\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v16.4s, v29.8h, v8.8h\n"
+ "ldr d29, [x19, x15]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal2 v21.4s, v27.8h, v7.8h\n"
+ "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "add x15, x15, #0x8\n"
+ "smlal v22.4s, v24.4h, v5.4h\n"
+ "smlal v23.4s, v25.4h, v6.4h\n"
+ "and v28.16b, v9.16b, v0.16b\n"
+ "smlal2 v21.4s, v24.8h, v5.8h\n"
+ "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "sqdmulh v16.4s, v16.4s, v4.4s\n"
+ "smlal v22.4s, v25.4h, v8.4h\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "sqdmulh v22.4s, v22.4s, v19.4s\n"
+ "smlal2 v21.4s, v25.8h, v8.8h\n"
+ "smlal2 v18.4s, v29.8h, v8.8h\n"
+ "sqdmulh v23.4s, v23.4s, v19.4s\n"
+ "and v29.16b, v22.16b, v0.16b\n"
+ "sqdmulh v21.4s, v21.4s, v4.4s\n"
+ "and v20.16b, v23.16b, v0.16b\n"
+ "sqdmulh v18.4s, v18.4s, v4.4s\n"
+ "and v19.16b, v10.16b, v31.16b\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "smax v19.4s, v19.4s, v16.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
- "sqadd v11.4s, v11.4s, v28.4s\n"
- "and v26.16b, v17.16b, v9.16b\n"
- "uzp1 v19.16b, v19.16b, v20.16b\n"
- "sqrdmulh v18.4s, v18.4s, v31.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "str d19, [x14, x6]\n"
- "srshl v11.4s, v11.4s, v30.4s\n"
+ "and v4.16b, v16.16b, v31.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v5.16b, v21.16b, v31.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v26.16b, v18.16b, v31.16b\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v28.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v20.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "and v8.16b, v18.16b, v30.16b\n"
- "sqrdmulh v21.4s, v21.4s, v23.4s\n"
- "sqadd v17.4s, v17.4s, v26.4s\n"
- "add v11.4s, v11.4s, v14.4s\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "and v27.16b, v21.16b, v9.16b\n"
- "smin v11.4s, v11.4s, v15.4s\n"
- "srshl v17.4s, v17.4s, v9.4s\n"
- "sqadd v18.4s, v18.4s, v8.4s\n"
- "smax v11.4s, v11.4s, v16.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "srshl v18.4s, v18.4s, v30.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "smin v17.4s, v17.4s, v15.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "smax v17.4s, v17.4s, v16.4s\n"
- "srshl v21.4s, v21.4s, v9.4s\n"
- "smin v18.4s, v18.4s, v15.4s\n"
- "uzp1 v11.16b, v11.16b, v17.16b\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "str d11, [x13, x6]\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "smin v21.4s, v21.4s, v15.4s\n"
- "smax v21.4s, v21.4s, v16.4s\n"
- "uzp1 v18.16b, v18.16b, v21.16b\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
- "str d18, [x12, x6]\n"
- "add x6, x6, #0x8\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v0.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "sqadd v18.4s, v18.4s, v26.4s\n"
+ "srshl v10.4s, v10.4s, v31.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v22.4h, v22.4s\n"
+ "srshl v18.4s, v18.4s, v31.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v15.8h, v10.4s\n"
+ "sqxtn2 v9.8h, v16.4s\n"
+ "sqxtn2 v22.8h, v21.4s\n"
+ "sqxtn2 v23.8h, v18.4s\n"
+ "sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v9.8h, v9.8h, v11.8h\n"
+ "sqadd v22.8h, v22.8h, v11.8h\n"
+ "sqadd v23.8h, v23.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smax v22.8h, v22.8h, v17.8h\n"
+ "smax v23.8h, v23.8h, v17.8h\n"
+ "smin v15.8h, v15.8h, v14.8h\n"
+ "smin v9.8h, v9.8h, v14.8h\n"
+ "smin v22.8h, v22.8h, v14.8h\n"
+ "smin v23.8h, v23.8h, v14.8h\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "str d15, [x10, x14]\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "str d9, [x9, x14]\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "str d22, [x28, x14]\n"
+ "str d23, [x27, x14]\n"
+ "add x14, x14, #0x8\n"
"beq 88f\n"
- "add x5, x5, #0x48\n"
+ "add x17, x17, #0x48\n"
"3:" // Oddments
"ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x3, #2, 5f\n"
- "ld1 { v13.4s }, [x19], #0x10\n"
- "tbz x3, #1, 4f\n"
+ "tbz x8, #2, 5f\n"
+ "ld1 { v15.4s }, [x19], #0x10\n"
+ "tbz x8, #1, 4f\n"
"ld1 { v10.d }[0], [x19], #0x8\n"
- "tbz x3, #0, 7f\n"
+ "tbz x8, #0, 7f\n"
"ld1 { v10.s }[2], [x19]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x3, #0, 7f\n"
+ "tbz x8, #0, 7f\n"
"ld1 { v10.s }[0], [x19]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x3, #1, 6f\n"
- "ld1 { v13.d }[0], [x19], #0x8\n"
- "tbz x3, #0, 7f\n"
- "ld1 { v13.s }[2], [x19]\n"
+ "tbz x8, #1, 6f\n"
+ "ld1 { v15.d }[0], [x19], #0x8\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v15.s }[2], [x19]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 7f\n"
- "ld1 { v13.s }[0], [x19]\n"
+ "tbz x8, #0, 7f\n"
+ "ld1 { v15.s }[0], [x19]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "mov v19.16b, v13.16b\n"
- "ldr d0, [x5, #0x0]\n"
- "mov v20.16b, v10.16b\n"
- "ldr d1, [x5, #0x8]\n"
- "mov v11.16b, v13.16b\n"
- "ldr d2, [x5, #0x10]\n"
- "mov v17.16b, v10.16b\n"
- "ldr d3, [x5, #0x18]\n"
- "mov v18.16b, v13.16b\n"
- "ldr d4, [x5, #0x20]\n"
+ "ldr d0, [x17, #0x0]\n"
+ "ldr d1, [x17, #0x8]\n"
+ "mov v9.16b, v15.16b\n"
+ "mov v16.16b, v10.16b\n"
+ "ldr d2, [x17, #0x10]\n"
+ "ldr d3, [x17, #0x18]\n"
+ "mov v22.16b, v15.16b\n"
"mov v21.16b, v10.16b\n"
- "ldr d5, [x5, #0x28]\n"
- "ssubl v0.8h, v0.8b, v12.8b\n"
- "ldr d6, [x5, #0x30]\n"
- "ssubl v1.8h, v1.8b, v12.8b\n"
- "ldr d7, [x5, #0x38]\n"
- "ssubl v2.8h, v2.8b, v12.8b\n"
- "ldr d8, [x5, #0x40]\n"
- "ssubl v3.8h, v3.8b, v12.8b\n"
- "ldp x26, x25, [x7, #0x0]\n"
- "add x26, x26, x4\n"
- "ssubl v4.8h, v4.8b, v12.8b\n"
- "ldp x24, x23, [x7, #0x10]\n"
- "ssubl v5.8h, v5.8b, v12.8b\n"
- "ldp x22, x21, [x7, #0x20]\n"
- "ssubl v6.8h, v6.8b, v12.8b\n"
- "add x25, x25, x4\n"
- "ssubl v7.8h, v7.8b, v12.8b\n"
- "ldp x20, x19, [x7, #0x30]\n"
- "ssubl v8.8h, v8.8b, v12.8b\n"
- "add x24, x24, x4\n"
- "add x23, x23, x4\n"
- "add x22, x22, x4\n"
- "add x21, x21, x4\n"
- "add x20, x20, x4\n"
- "add x19, x19, x4\n"
- "tbz x3, #2, 9f\n"
+ "ldr d4, [x17, #0x20]\n"
+ "ldr d5, [x17, #0x28]\n"
+ "mov v23.16b, v15.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "ldr d6, [x17, #0x30]\n"
+ "ldr d7, [x17, #0x38]\n"
+ "ssubl v0.8h, v0.8b, v13.8b\n"
+ "ssubl v1.8h, v1.8b, v13.8b\n"
+ "ldr d8, [x17, #0x40]\n"
+ "ldp x26, x25, [x12, #0x0]\n"
+ "ssubl v2.8h, v2.8b, v13.8b\n"
+ "ssubl v3.8h, v3.8b, v13.8b\n"
+ "ldp x24, x23, [x12, #0x10]\n"
+ "ldp x22, x21, [x12, #0x20]\n"
+ "ssubl v4.8h, v4.8b, v13.8b\n"
+ "ssubl v5.8h, v5.8b, v13.8b\n"
+ "ldp x20, x19, [x12, #0x30]\n"
+ "ssubl v6.8h, v6.8b, v13.8b\n"
+ "ssubl v7.8h, v7.8b, v13.8b\n"
+ "ssubl v8.8h, v8.8b, v13.8b\n"
+ "add x26, x26, x15\n"
+ "add x25, x25, x15\n"
+ "add x24, x24, x15\n"
+ "add x23, x23, x15\n"
+ "add x22, x22, x15\n"
+ "add x21, x21, x15\n"
+ "add x20, x20, x15\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 9f\n"
"ld1 { v31.s }[0], [x26], #0x4\n"
"ld1 { v30.s }[0], [x25], #0x4\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
@@ -713,7 +697,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v26.s }[0], [x21], #0x4\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
"ld1 { v24.s }[0], [x19], #0x4\n"
- "tbz x3, #1, 8f\n"
+ "tbz x8, #1, 8f\n"
"ld1 { v31.h }[2], [x26], #0x2\n"
"ld1 { v30.h }[2], [x25], #0x2\n"
"ld1 { v29.h }[2], [x24], #0x2\n"
@@ -722,7 +706,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v26.h }[2], [x21], #0x2\n"
"ld1 { v25.h }[2], [x20], #0x2\n"
"ld1 { v24.h }[2], [x19], #0x2\n"
- "tbz x3, #0, 11f\n"
+ "tbz x8, #0, 11f\n"
"ld1 { v31.b }[6], [x26]\n"
"ld1 { v30.b }[6], [x25]\n"
"ld1 { v29.b }[6], [x24]\n"
@@ -733,7 +717,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v24.b }[6], [x19]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x3, #0, 11f\n"
+ "tbz x8, #0, 11f\n"
"ld1 { v31.b }[4], [x26]\n"
"ld1 { v30.b }[4], [x25]\n"
"ld1 { v29.b }[4], [x24]\n"
@@ -744,7 +728,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v24.b }[4], [x19]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x3, #1, 10f\n"
+ "tbz x8, #1, 10f\n"
"ld1 { v31.h }[0], [x26], #0x2\n"
"ld1 { v30.h }[0], [x25], #0x2\n"
"ld1 { v29.h }[0], [x24], #0x2\n"
@@ -753,7 +737,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v26.h }[0], [x21], #0x2\n"
"ld1 { v25.h }[0], [x20], #0x2\n"
"ld1 { v24.h }[0], [x19], #0x2\n"
- "tbz x3, #0, 11f\n"
+ "tbz x8, #0, 11f\n"
"ld1 { v31.b }[2], [x26]\n"
"ld1 { v30.b }[2], [x25]\n"
"ld1 { v29.b }[2], [x24]\n"
@@ -764,7 +748,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v24.b }[2], [x19]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 11f\n"
+ "tbz x8, #0, 11f\n"
"ld1 { v31.b }[0], [x26]\n"
"ld1 { v30.b }[0], [x25]\n"
"ld1 { v29.b }[0], [x24]\n"
@@ -774,646 +758,636 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
"ld1 { v25.b }[0], [x20]\n"
"ld1 { v24.b }[0], [x19]\n"
"11:" // Oddments: Initial loads: Bit 2: End
- "usubl v31.8h, v31.8b, v22.8b\n"
- "ldr x22, [x7, #0x40]\n"
- "add x22, x22, x4\n"
- "usubl v30.8h, v30.8b, v22.8b\n"
- "usubl v29.8h, v29.8b, v22.8b\n"
- "usubl v28.8h, v28.8b, v22.8b\n"
- "usubl v27.8h, v27.8b, v22.8b\n"
- "usubl v26.8h, v26.8b, v22.8b\n"
- "usubl v25.8h, v25.8b, v22.8b\n"
- "usubl v24.8h, v24.8b, v22.8b\n"
- "smlal v13.4s, v31.4h, v8.4h\n"
+ "usubl v31.8h, v31.8b, v12.8b\n"
+ "smlal v15.4s, v31.4h, v8.4h\n"
"smlal2 v10.4s, v31.8h, v8.8h\n"
- "smlal v19.4s, v31.4h, v6.4h\n"
- "smlal2 v20.4s, v31.8h, v6.8h\n"
- "smlal v11.4s, v31.4h, v2.4h\n"
- "smlal2 v17.4s, v31.8h, v2.8h\n"
- "smlal v18.4s, v31.4h, v0.4h\n"
- "smlal2 v21.4s, v31.8h, v0.8h\n"
- "smlal v13.4s, v30.4h, v0.4h\n"
+ "ldr x24, [x12, #0x40]\n"
+ "usubl v30.8h, v30.8b, v12.8b\n"
+ "smlal v15.4s, v30.4h, v0.4h\n"
"smlal2 v10.4s, v30.8h, v0.8h\n"
- "smlal v19.4s, v28.4h, v1.4h\n"
- "smlal2 v20.4s, v28.8h, v1.8h\n"
- "smlal v13.4s, v29.4h, v1.4h\n"
+ "add x24, x24, x15\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v9.4s, v31.4h, v6.4h\n"
+ "smlal2 v16.4s, v31.8h, v6.8h\n"
+ "smlal v15.4s, v29.4h, v1.4h\n"
"smlal2 v10.4s, v29.8h, v1.8h\n"
- "smlal v19.4s, v27.4h, v2.4h\n"
- "smlal2 v20.4s, v27.8h, v2.8h\n"
- "smlal v13.4s, v26.4h, v3.4h\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "smlal v9.4s, v28.4h, v1.4h\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal v15.4s, v26.4h, v3.4h\n"
"smlal2 v10.4s, v26.8h, v3.8h\n"
- "smlal v19.4s, v24.4h, v0.4h\n"
- "smlal2 v20.4s, v24.8h, v0.8h\n"
- "smlal v13.4s, v25.4h, v4.4h\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "smlal v9.4s, v27.4h, v2.4h\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal v15.4s, v25.4h, v4.4h\n"
"smlal2 v10.4s, v25.8h, v4.8h\n"
- "smlal v13.4s, v24.4h, v2.4h\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "smlal v22.4s, v31.4h, v2.4h\n"
+ "smlal2 v21.4s, v31.8h, v2.8h\n"
+ "smlal v23.4s, v31.4h, v0.4h\n"
+ "smlal2 v18.4s, v31.8h, v0.8h\n"
+ "smlal v15.4s, v24.4h, v2.4h\n"
"smlal2 v10.4s, v24.8h, v2.8h\n"
- "tbz x3, #2, 13f\n"
- "ld1 { v29.s }[0], [x22], #0x4\n"
- "tbz x3, #1, 12f\n"
- "ld1 { v29.h }[2], [x22], #0x2\n"
- "tbz x3, #0, 15f\n"
- "ld1 { v29.b }[6], [x22]\n"
+ "smlal v9.4s, v24.4h, v0.4h\n"
+ "smlal2 v16.4s, v24.8h, v0.8h\n"
+ "tbz x8, #2, 13f\n"
+ "ld1 { v29.s }[0], [x24], #0x4\n"
+ "tbz x8, #1, 12f\n"
+ "ld1 { v29.h }[2], [x24], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[6], [x24]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x3, #0, 15f\n"
- "ld1 { v29.b }[4], [x22]\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[4], [x24]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x3, #1, 14f\n"
- "ld1 { v29.h }[0], [x22], #0x2\n"
- "tbz x3, #0, 15f\n"
- "ld1 { v29.b }[2], [x22]\n"
+ "tbz x8, #1, 14f\n"
+ "ld1 { v29.h }[0], [x24], #0x2\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[2], [x24]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 15f\n"
- "ld1 { v29.b }[0], [x22]\n"
+ "tbz x8, #0, 15f\n"
+ "ld1 { v29.b }[0], [x24]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr x21, [x7, #0x48]\n"
- "smlal v19.4s, v29.4h, v4.4h\n"
- "add x21, x21, x4\n"
- "smlal2 v20.4s, v29.8h, v4.8h\n"
- "tbz x3, #2, 17f\n"
- "ld1 { v28.s }[0], [x21], #0x4\n"
- "tbz x3, #1, 16f\n"
- "ld1 { v28.h }[2], [x21], #0x2\n"
- "tbz x3, #0, 19f\n"
- "ld1 { v28.b }[6], [x21]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "ldr x23, [x12, #0x48]\n"
+ "smlal v9.4s, v29.4h, v4.4h\n"
+ "smlal2 v16.4s, v29.8h, v4.8h\n"
+ "add x23, x23, x15\n"
+ "tbz x8, #2, 17f\n"
+ "ld1 { v28.s }[0], [x23], #0x4\n"
+ "tbz x8, #1, 16f\n"
+ "ld1 { v28.h }[2], [x23], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[6], [x23]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x3, #0, 19f\n"
- "ld1 { v28.b }[4], [x21]\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[4], [x23]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x3, #1, 18f\n"
- "ld1 { v28.h }[0], [x21], #0x2\n"
- "tbz x3, #0, 19f\n"
- "ld1 { v28.b }[2], [x21]\n"
+ "tbz x8, #1, 18f\n"
+ "ld1 { v28.h }[0], [x23], #0x2\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[2], [x23]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 19f\n"
- "ld1 { v28.b }[0], [x21]\n"
+ "tbz x8, #0, 19f\n"
+ "ld1 { v28.b }[0], [x23]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
- "usubl v28.8h, v28.8b, v22.8b\n"
- "ldr x20, [x7, #0x50]\n"
- "smlal v19.4s, v28.4h, v5.4h\n"
- "add x20, x20, x4\n"
- "smlal2 v20.4s, v28.8h, v5.8h\n"
- "tbz x3, #2, 21f\n"
- "ld1 { v27.s }[0], [x20], #0x4\n"
- "tbz x3, #1, 20f\n"
- "ld1 { v27.h }[2], [x20], #0x2\n"
- "tbz x3, #0, 23f\n"
- "ld1 { v27.b }[6], [x20]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "ldr x21, [x12, #0x50]\n"
+ "smlal v9.4s, v28.4h, v5.4h\n"
+ "smlal2 v16.4s, v28.8h, v5.8h\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 21f\n"
+ "ld1 { v27.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 20f\n"
+ "ld1 { v27.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v27.b }[6], [x21]\n"
"b 23f\n"
"20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset
- "tbz x3, #0, 23f\n"
- "ld1 { v27.b }[4], [x20]\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v27.b }[4], [x21]\n"
"b 23f\n"
"21:" // Oddments: Load (1, 2): Bit 2: Unset
- "tbz x3, #1, 22f\n"
- "ld1 { v27.h }[0], [x20], #0x2\n"
- "tbz x3, #0, 23f\n"
- "ld1 { v27.b }[2], [x20]\n"
+ "tbz x8, #1, 22f\n"
+ "ld1 { v27.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v27.b }[2], [x21]\n"
"b 23f\n"
"22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 23f\n"
- "ld1 { v27.b }[0], [x20]\n"
+ "tbz x8, #0, 23f\n"
+ "ld1 { v27.b }[0], [x21]\n"
"23:" // Oddments: Load (1, 2): Bit 2: End
- "usubl v27.8h, v27.8b, v22.8b\n"
- "ldr x19, [x7, #0x58]\n"
- "smlal v13.4s, v27.4h, v5.4h\n"
- "add x19, x19, x4\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "ldr x19, [x12, #0x58]\n"
+ "smlal v15.4s, v27.4h, v5.4h\n"
"smlal2 v10.4s, v27.8h, v5.8h\n"
- "smlal v19.4s, v27.4h, v3.4h\n"
- "smlal2 v20.4s, v27.8h, v3.8h\n"
- "tbz x3, #2, 25f\n"
+ "smlal v9.4s, v27.4h, v3.4h\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 25f\n"
"ld1 { v26.s }[0], [x19], #0x4\n"
- "tbz x3, #1, 24f\n"
+ "tbz x8, #1, 24f\n"
"ld1 { v26.h }[2], [x19], #0x2\n"
- "tbz x3, #0, 27f\n"
+ "tbz x8, #0, 27f\n"
"ld1 { v26.b }[6], [x19]\n"
"b 27f\n"
"24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x3, #0, 27f\n"
+ "tbz x8, #0, 27f\n"
"ld1 { v26.b }[4], [x19]\n"
"b 27f\n"
"25:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x3, #1, 26f\n"
+ "tbz x8, #1, 26f\n"
"ld1 { v26.h }[0], [x19], #0x2\n"
- "tbz x3, #0, 27f\n"
+ "tbz x8, #0, 27f\n"
"ld1 { v26.b }[2], [x19]\n"
"b 27f\n"
"26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 27f\n"
+ "tbz x8, #0, 27f\n"
"ld1 { v26.b }[0], [x19]\n"
"27:" // Oddments: Load (3, 0): Bit 2: End
- "usubl v26.8h, v26.8b, v22.8b\n"
- "ldr x11, [x7, #0x60]\n"
- "smlal v11.4s, v26.4h, v3.4h\n"
- "add x11, x11, x4\n"
- "smlal2 v17.4s, v26.8h, v3.8h\n"
- "tbz x3, #2, 29f\n"
- "ld1 { v25.s }[0], [x11], #0x4\n"
- "tbz x3, #1, 28f\n"
- "ld1 { v25.h }[2], [x11], #0x2\n"
- "tbz x3, #0, 31f\n"
- "ld1 { v25.b }[6], [x11]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "ldr x20, [x12, #0x60]\n"
+ "smlal v22.4s, v26.4h, v3.4h\n"
+ "smlal2 v21.4s, v26.8h, v3.8h\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 29f\n"
+ "ld1 { v25.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 28f\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[6], [x20]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset
- "tbz x3, #0, 31f\n"
- "ld1 { v25.b }[4], [x11]\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[4], [x20]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 0): Bit 2: Unset
- "tbz x3, #1, 30f\n"
- "ld1 { v25.h }[0], [x11], #0x2\n"
- "tbz x3, #0, 31f\n"
- "ld1 { v25.b }[2], [x11]\n"
+ "tbz x8, #1, 30f\n"
+ "ld1 { v25.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[2], [x20]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 31f\n"
- "ld1 { v25.b }[0], [x11]\n"
+ "tbz x8, #0, 31f\n"
+ "ld1 { v25.b }[0], [x20]\n"
"31:" // Oddments: Load (2, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v22.8b\n"
- "ldr x10, [x7, #0x68]\n"
- "smlal v13.4s, v25.4h, v6.4h\n"
- "add x10, x10, x4\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "ldr x19, [x12, #0x68]\n"
+ "smlal v15.4s, v25.4h, v6.4h\n"
"smlal2 v10.4s, v25.8h, v6.8h\n"
- "smlal v11.4s, v25.4h, v0.4h\n"
- "smlal2 v17.4s, v25.8h, v0.8h\n"
- "tbz x3, #2, 33f\n"
- "ld1 { v29.s }[0], [x10], #0x4\n"
- "tbz x3, #1, 32f\n"
- "ld1 { v29.h }[2], [x10], #0x2\n"
- "tbz x3, #0, 35f\n"
- "ld1 { v29.b }[6], [x10]\n"
+ "smlal v22.4s, v25.4h, v0.4h\n"
+ "smlal2 v21.4s, v25.8h, v0.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 33f\n"
+ "ld1 { v29.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 32f\n"
+ "ld1 { v29.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[6], [x19]\n"
"b 35f\n"
"32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x3, #0, 35f\n"
- "ld1 { v29.b }[4], [x10]\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[4], [x19]\n"
"b 35f\n"
"33:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x3, #1, 34f\n"
- "ld1 { v29.h }[0], [x10], #0x2\n"
- "tbz x3, #0, 35f\n"
- "ld1 { v29.b }[2], [x10]\n"
+ "tbz x8, #1, 34f\n"
+ "ld1 { v29.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[2], [x19]\n"
"b 35f\n"
"34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 35f\n"
- "ld1 { v29.b }[0], [x10]\n"
+ "tbz x8, #0, 35f\n"
+ "ld1 { v29.b }[0], [x19]\n"
"35:" // Oddments: Load (3, 1): Bit 2: End
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr x9, [x7, #0x70]\n"
- "smlal v11.4s, v29.4h, v4.4h\n"
- "add x9, x9, x4\n"
- "smlal2 v17.4s, v29.8h, v4.8h\n"
- "tbz x3, #2, 37f\n"
- "ld1 { v24.s }[0], [x9], #0x4\n"
- "tbz x3, #1, 36f\n"
- "ld1 { v24.h }[2], [x9], #0x2\n"
- "tbz x3, #0, 39f\n"
- "ld1 { v24.b }[6], [x9]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "ldr x19, [x12, #0x70]\n"
+ "smlal v22.4s, v29.4h, v4.4h\n"
+ "smlal2 v21.4s, v29.8h, v4.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 37f\n"
+ "ld1 { v24.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 36f\n"
+ "ld1 { v24.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[6], [x19]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x3, #0, 39f\n"
- "ld1 { v24.b }[4], [x9]\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[4], [x19]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x3, #1, 38f\n"
- "ld1 { v24.h }[0], [x9], #0x2\n"
- "tbz x3, #0, 39f\n"
- "ld1 { v24.b }[2], [x9]\n"
+ "tbz x8, #1, 38f\n"
+ "ld1 { v24.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[2], [x19]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 39f\n"
- "ld1 { v24.b }[0], [x9]\n"
+ "tbz x8, #0, 39f\n"
+ "ld1 { v24.b }[0], [x19]\n"
"39:" // Oddments: Load (2, 1): Bit 2: End
- "usubl v24.8h, v24.8b, v22.8b\n"
- "ldr x28, [x7, #0x78]\n"
- "smlal v13.4s, v24.4h, v7.4h\n"
- "add x28, x28, x4\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x22, [x12, #0x78]\n"
+ "smlal v15.4s, v24.4h, v7.4h\n"
"smlal2 v10.4s, v24.8h, v7.8h\n"
- "smlal v11.4s, v24.4h, v1.4h\n"
- "smlal2 v17.4s, v24.8h, v1.8h\n"
- "tbz x3, #2, 41f\n"
- "ld1 { v27.s }[0], [x28], #0x4\n"
- "tbz x3, #1, 40f\n"
- "ld1 { v27.h }[2], [x28], #0x2\n"
- "tbz x3, #0, 43f\n"
- "ld1 { v27.b }[6], [x28]\n"
+ "smlal v22.4s, v24.4h, v1.4h\n"
+ "smlal2 v21.4s, v24.8h, v1.8h\n"
+ "add x22, x22, x15\n"
+ "tbz x8, #2, 41f\n"
+ "ld1 { v27.s }[0], [x22], #0x4\n"
+ "tbz x8, #1, 40f\n"
+ "ld1 { v27.h }[2], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[6], [x22]\n"
"b 43f\n"
"40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x3, #0, 43f\n"
- "ld1 { v27.b }[4], [x28]\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[4], [x22]\n"
"b 43f\n"
"41:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x3, #1, 42f\n"
- "ld1 { v27.h }[0], [x28], #0x2\n"
- "tbz x3, #0, 43f\n"
- "ld1 { v27.b }[2], [x28]\n"
+ "tbz x8, #1, 42f\n"
+ "ld1 { v27.h }[0], [x22], #0x2\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[2], [x22]\n"
"b 43f\n"
"42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 43f\n"
- "ld1 { v27.b }[0], [x28]\n"
+ "tbz x8, #0, 43f\n"
+ "ld1 { v27.b }[0], [x22]\n"
"43:" // Oddments: Load (3, 3): Bit 2: End
- "usubl v27.8h, v27.8b, v22.8b\n"
- "ldr x27, [x7, #0x80]\n"
- "smlal v18.4s, v27.4h, v4.4h\n"
- "add x27, x27, x4\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "tbz x3, #2, 45f\n"
- "ld1 { v28.s }[0], [x27], #0x4\n"
- "tbz x3, #1, 44f\n"
- "ld1 { v28.h }[2], [x27], #0x2\n"
- "tbz x3, #0, 47f\n"
- "ld1 { v28.b }[6], [x27]\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "ldr x21, [x12, #0x80]\n"
+ "smlal v23.4s, v27.4h, v4.4h\n"
+ "smlal2 v18.4s, v27.8h, v4.8h\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 45f\n"
+ "ld1 { v28.s }[0], [x21], #0x4\n"
+ "tbz x8, #1, 44f\n"
+ "ld1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[6], [x21]\n"
"b 47f\n"
"44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x3, #0, 47f\n"
- "ld1 { v28.b }[4], [x27]\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[4], [x21]\n"
"b 47f\n"
"45:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x3, #1, 46f\n"
- "ld1 { v28.h }[0], [x27], #0x2\n"
- "tbz x3, #0, 47f\n"
- "ld1 { v28.b }[2], [x27]\n"
+ "tbz x8, #1, 46f\n"
+ "ld1 { v28.h }[0], [x21], #0x2\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[2], [x21]\n"
"b 47f\n"
"46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 47f\n"
- "ld1 { v28.b }[0], [x27]\n"
+ "tbz x8, #0, 47f\n"
+ "ld1 { v28.b }[0], [x21]\n"
"47:" // Oddments: Load (2, 3): Bit 2: End
- "usubl v28.8h, v28.8b, v22.8b\n"
- "ldr x26, [x7, #0x88]\n"
- "smlal v19.4s, v28.4h, v7.4h\n"
- "add x26, x26, x4\n"
- "smlal2 v20.4s, v28.8h, v7.8h\n"
- "smlal v18.4s, v28.4h, v1.4h\n"
- "smlal2 v21.4s, v28.8h, v1.8h\n"
- "tbz x3, #2, 49f\n"
- "ld1 { v26.s }[0], [x26], #0x4\n"
- "tbz x3, #1, 48f\n"
- "ld1 { v26.h }[2], [x26], #0x2\n"
- "tbz x3, #0, 51f\n"
- "ld1 { v26.b }[6], [x26]\n"
+ "usubl v28.8h, v28.8b, v12.8b\n"
+ "ldr x20, [x12, #0x88]\n"
+ "smlal v9.4s, v28.4h, v7.4h\n"
+ "smlal2 v16.4s, v28.8h, v7.8h\n"
+ "smlal v23.4s, v28.4h, v1.4h\n"
+ "smlal2 v18.4s, v28.8h, v1.8h\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 49f\n"
+ "ld1 { v26.s }[0], [x20], #0x4\n"
+ "tbz x8, #1, 48f\n"
+ "ld1 { v26.h }[2], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[6], [x20]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x3, #0, 51f\n"
- "ld1 { v26.b }[4], [x26]\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[4], [x20]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x3, #1, 50f\n"
- "ld1 { v26.h }[0], [x26], #0x2\n"
- "tbz x3, #0, 51f\n"
- "ld1 { v26.b }[2], [x26]\n"
+ "tbz x8, #1, 50f\n"
+ "ld1 { v26.h }[0], [x20], #0x2\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[2], [x20]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 51f\n"
- "ld1 { v26.b }[0], [x26]\n"
+ "tbz x8, #0, 51f\n"
+ "ld1 { v26.b }[0], [x20]\n"
"51:" // Oddments: Load (3, 4): Bit 2: End
- "usubl v26.8h, v26.8b, v22.8b\n"
- "ldr x25, [x7, #0x90]\n"
- "smlal v18.4s, v26.4h, v5.4h\n"
- "add x25, x25, x4\n"
- "smlal2 v21.4s, v26.8h, v5.8h\n"
- "tbz x3, #2, 53f\n"
- "ld1 { v25.s }[0], [x25], #0x4\n"
- "tbz x3, #1, 52f\n"
- "ld1 { v25.h }[2], [x25], #0x2\n"
- "tbz x3, #0, 55f\n"
- "ld1 { v25.b }[6], [x25]\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "ldr x23, [x12, #0x90]\n"
+ "smlal v23.4s, v26.4h, v5.4h\n"
+ "smlal2 v18.4s, v26.8h, v5.8h\n"
+ "add x23, x23, x15\n"
+ "tbz x8, #2, 53f\n"
+ "ld1 { v25.s }[0], [x23], #0x4\n"
+ "tbz x8, #1, 52f\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[6], [x23]\n"
"b 55f\n"
"52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x3, #0, 55f\n"
- "ld1 { v25.b }[4], [x25]\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[4], [x23]\n"
"b 55f\n"
"53:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x3, #1, 54f\n"
- "ld1 { v25.h }[0], [x25], #0x2\n"
- "tbz x3, #0, 55f\n"
- "ld1 { v25.b }[2], [x25]\n"
+ "tbz x8, #1, 54f\n"
+ "ld1 { v25.h }[0], [x23], #0x2\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[2], [x23]\n"
"b 55f\n"
"54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 55f\n"
- "ld1 { v25.b }[0], [x25]\n"
+ "tbz x8, #0, 55f\n"
+ "ld1 { v25.b }[0], [x23]\n"
"55:" // Oddments: Load (4, 0): Bit 2: End
- "usubl v25.8h, v25.8b, v22.8b\n"
- "ldr x24, [x7, #0x98]\n"
- "smlal v11.4s, v25.4h, v6.4h\n"
- "add x24, x24, x4\n"
- "smlal2 v17.4s, v25.8h, v6.8h\n"
- "tbz x3, #2, 57f\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "ldr x24, [x12, #0x98]\n"
+ "smlal v22.4s, v25.4h, v6.4h\n"
+ "smlal2 v21.4s, v25.8h, v6.8h\n"
+ "add x24, x24, x15\n"
+ "tbz x8, #2, 57f\n"
"ld1 { v29.s }[0], [x24], #0x4\n"
- "tbz x3, #1, 56f\n"
+ "tbz x8, #1, 56f\n"
"ld1 { v29.h }[2], [x24], #0x2\n"
- "tbz x3, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v29.b }[6], [x24]\n"
"b 59f\n"
"56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x3, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v29.b }[4], [x24]\n"
"b 59f\n"
"57:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x3, #1, 58f\n"
+ "tbz x8, #1, 58f\n"
"ld1 { v29.h }[0], [x24], #0x2\n"
- "tbz x3, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v29.b }[2], [x24]\n"
"b 59f\n"
"58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 59f\n"
+ "tbz x8, #0, 59f\n"
"ld1 { v29.b }[0], [x24]\n"
"59:" // Oddments: Load (2, 4): Bit 2: End
- "usubl v29.8h, v29.8b, v22.8b\n"
- "ldr x23, [x7, #0xa0]\n"
- "smlal v19.4s, v29.4h, v8.4h\n"
- "add x23, x23, x4\n"
- "smlal2 v20.4s, v29.8h, v8.8h\n"
- "smlal v18.4s, v29.4h, v2.4h\n"
- "smlal2 v21.4s, v29.8h, v2.8h\n"
- "tbz x3, #2, 61f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "tbz x3, #1, 60f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "tbz x3, #0, 63f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "ldr x19, [x12, #0xa0]\n"
+ "smlal v9.4s, v29.4h, v8.4h\n"
+ "smlal2 v16.4s, v29.8h, v8.8h\n"
+ "smlal v23.4s, v29.4h, v2.4h\n"
+ "smlal2 v18.4s, v29.8h, v2.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 61f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x8, #1, 60f\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[6], [x19]\n"
"b 63f\n"
"60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x3, #0, 63f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[4], [x19]\n"
"b 63f\n"
"61:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x3, #1, 62f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "tbz x3, #0, 63f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "tbz x8, #1, 62f\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[2], [x19]\n"
"b 63f\n"
"62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 63f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "tbz x8, #0, 63f\n"
+ "ld1 { v27.b }[0], [x19]\n"
"63:" // Oddments: Load (4, 1): Bit 2: End
- "usubl v27.8h, v27.8b, v22.8b\n"
- "ldr x22, [x7, #0xa8]\n"
- "smlal v11.4s, v27.4h, v7.4h\n"
- "add x22, x22, x4\n"
- "smlal2 v17.4s, v27.8h, v7.8h\n"
- "tbz x3, #2, 65f\n"
+ "usubl v27.8h, v27.8b, v12.8b\n"
+ "ldr x22, [x12, #0xa8]\n"
+ "smlal v22.4s, v27.4h, v7.4h\n"
+ "smlal2 v21.4s, v27.8h, v7.8h\n"
+ "add x22, x22, x15\n"
+ "tbz x8, #2, 65f\n"
"ld1 { v24.s }[0], [x22], #0x4\n"
- "tbz x3, #1, 64f\n"
+ "tbz x8, #1, 64f\n"
"ld1 { v24.h }[2], [x22], #0x2\n"
- "tbz x3, #0, 67f\n"
+ "tbz x8, #0, 67f\n"
"ld1 { v24.b }[6], [x22]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x3, #0, 67f\n"
+ "tbz x8, #0, 67f\n"
"ld1 { v24.b }[4], [x22]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x3, #1, 66f\n"
+ "tbz x8, #1, 66f\n"
"ld1 { v24.h }[0], [x22], #0x2\n"
- "tbz x3, #0, 67f\n"
+ "tbz x8, #0, 67f\n"
"ld1 { v24.b }[2], [x22]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 67f\n"
+ "tbz x8, #0, 67f\n"
"ld1 { v24.b }[0], [x22]\n"
"67:" // Oddments: Load (3, 2): Bit 2: End
- "usubl v24.8h, v24.8b, v22.8b\n"
- "ldr x21, [x7, #0xb0]\n"
- "smlal v11.4s, v24.4h, v5.4h\n"
- "add x21, x21, x4\n"
- "smlal2 v17.4s, v24.8h, v5.8h\n"
- "smlal v18.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "tbz x3, #2, 69f\n"
+ "usubl v24.8h, v24.8b, v12.8b\n"
+ "ldr x21, [x12, #0xb0]\n"
+ "smlal v22.4s, v24.4h, v5.4h\n"
+ "smlal2 v21.4s, v24.8h, v5.8h\n"
+ "smlal v23.4s, v24.4h, v3.4h\n"
+ "smlal2 v18.4s, v24.8h, v3.8h\n"
+ "add x21, x21, x15\n"
+ "tbz x8, #2, 69f\n"
"ld1 { v26.s }[0], [x21], #0x4\n"
- "tbz x3, #1, 68f\n"
+ "tbz x8, #1, 68f\n"
"ld1 { v26.h }[2], [x21], #0x2\n"
- "tbz x3, #0, 71f\n"
+ "tbz x8, #0, 71f\n"
"ld1 { v26.b }[6], [x21]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x3, #0, 71f\n"
+ "tbz x8, #0, 71f\n"
"ld1 { v26.b }[4], [x21]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x3, #1, 70f\n"
+ "tbz x8, #1, 70f\n"
"ld1 { v26.h }[0], [x21], #0x2\n"
- "tbz x3, #0, 71f\n"
+ "tbz x8, #0, 71f\n"
"ld1 { v26.b }[2], [x21]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 71f\n"
+ "tbz x8, #0, 71f\n"
"ld1 { v26.b }[0], [x21]\n"
"71:" // Oddments: Load (4, 3): Bit 2: End
- "usubl v26.8h, v26.8b, v22.8b\n"
- "ldr x20, [x7, #0xb8]\n"
- "smlal v18.4s, v26.4h, v7.4h\n"
- "add x20, x20, x4\n"
- "smlal2 v21.4s, v26.8h, v7.8h\n"
- "tbz x3, #2, 73f\n"
+ "usubl v26.8h, v26.8b, v12.8b\n"
+ "ldr x20, [x12, #0xb8]\n"
+ "smlal v23.4s, v26.4h, v7.4h\n"
+ "smlal2 v18.4s, v26.8h, v7.8h\n"
+ "add x20, x20, x15\n"
+ "tbz x8, #2, 73f\n"
"ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x3, #1, 72f\n"
+ "tbz x8, #1, 72f\n"
"ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x3, #0, 75f\n"
+ "tbz x8, #0, 75f\n"
"ld1 { v25.b }[6], [x20]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x3, #0, 75f\n"
+ "tbz x8, #0, 75f\n"
"ld1 { v25.b }[4], [x20]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x3, #1, 74f\n"
+ "tbz x8, #1, 74f\n"
"ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x3, #0, 75f\n"
+ "tbz x8, #0, 75f\n"
"ld1 { v25.b }[2], [x20]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 75f\n"
+ "tbz x8, #0, 75f\n"
"ld1 { v25.b }[0], [x20]\n"
"75:" // Oddments: Load (4, 2): Bit 2: End
- "usubl v25.8h, v25.8b, v22.8b\n"
- "ldr x19, [x7, #0xc0]\n"
- "smlal v11.4s, v25.4h, v8.4h\n"
- "add x19, x19, x4\n"
- "smlal2 v17.4s, v25.8h, v8.8h\n"
- "smlal v18.4s, v25.4h, v6.4h\n"
- "smlal2 v21.4s, v25.8h, v6.8h\n"
- "tbz x3, #2, 77f\n"
+ "usubl v25.8h, v25.8b, v12.8b\n"
+ "ldr x19, [x12, #0xc0]\n"
+ "smlal v22.4s, v25.4h, v8.4h\n"
+ "smlal2 v21.4s, v25.8h, v8.8h\n"
+ "smlal v23.4s, v25.4h, v6.4h\n"
+ "smlal2 v18.4s, v25.8h, v6.8h\n"
+ "add x19, x19, x15\n"
+ "tbz x8, #2, 77f\n"
"ld1 { v29.s }[0], [x19], #0x4\n"
- "tbz x3, #1, 76f\n"
+ "tbz x8, #1, 76f\n"
"ld1 { v29.h }[2], [x19], #0x2\n"
- "tbz x3, #0, 79f\n"
+ "tbz x8, #0, 79f\n"
"ld1 { v29.b }[6], [x19]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x3, #0, 79f\n"
+ "tbz x8, #0, 79f\n"
"ld1 { v29.b }[4], [x19]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x3, #1, 78f\n"
+ "tbz x8, #1, 78f\n"
"ld1 { v29.h }[0], [x19], #0x2\n"
- "tbz x3, #0, 79f\n"
+ "tbz x8, #0, 79f\n"
"ld1 { v29.b }[2], [x19]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 79f\n"
+ "tbz x8, #0, 79f\n"
"ld1 { v29.b }[0], [x19]\n"
"79:" // Oddments: Load (4, 4): Bit 2: End
- "usubl v29.8h, v29.8b, v22.8b\n"
- "smlal v18.4s, v29.4h, v8.4h\n"
- "smlal2 v21.4s, v29.8h, v8.8h\n"
- "tbz x3, #2, 81f\n"
- "ld1 { v31.4s }, [x8], #0x10\n"
- "ld1 { v30.4s }, [x16], #0x10\n"
- "tbz x3, #1, 80f\n"
- "ld1 { v23.d }[0], [x8], #0x8\n"
- "ld1 { v9.d }[0], [x16], #0x8\n"
- "tbz x3, #0, 83f\n"
- "ld1 { v23.s }[2], [x8]\n"
- "ld1 { v9.s }[2], [x16]\n"
+ "usubl v29.8h, v29.8b, v12.8b\n"
+ "smlal v23.4s, v29.4h, v8.4h\n"
+ "smlal2 v18.4s, v29.8h, v8.8h\n"
+ "tbz x8, #2, 81f\n"
+ "ld1 { v19.4s }, [x13], #0x10\n"
+ "ld1 { v0.4s }, [x11], #0x10\n"
+ "tbz x8, #1, 80f\n"
+ "ld1 { v4.d }[0], [x13], #0x8\n"
+ "ld1 { v31.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v4.s }[2], [x13]\n"
+ "ld1 { v31.s }[2], [x11]\n"
"b 83f\n"
"80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x3, #0, 83f\n"
- "ld1 { v23.s }[0], [x8]\n"
- "ld1 { v9.s }[0], [x16]\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v4.s }[0], [x13]\n"
+ "ld1 { v31.s }[0], [x11]\n"
"b 83f\n"
"81:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x3, #1, 82f\n"
- "ld1 { v31.d }[0], [x8], #0x8\n"
- "ld1 { v30.d }[0], [x16], #0x8\n"
- "tbz x3, #0, 83f\n"
- "ld1 { v31.s }[2], [x8]\n"
- "ld1 { v30.s }[2], [x16]\n"
+ "tbz x8, #1, 82f\n"
+ "ld1 { v19.d }[0], [x13], #0x8\n"
+ "ld1 { v0.d }[0], [x11], #0x8\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v19.s }[2], [x13]\n"
+ "ld1 { v0.s }[2], [x11]\n"
"b 83f\n"
"82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 83f\n"
- "ld1 { v31.s }[0], [x8]\n"
- "ld1 { v30.s }[0], [x16]\n"
+ "tbz x8, #0, 83f\n"
+ "ld1 { v19.s }[0], [x13]\n"
+ "ld1 { v0.s }[0], [x11]\n"
"83:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v13.4s, v13.4s, v31.4s\n"
- "add x15, x15, x6\n"
- "sqrdmulh v10.4s, v10.4s, v23.4s\n"
- "add x14, x14, x6\n"
- "sqrdmulh v19.4s, v19.4s, v31.4s\n"
- "add x13, x13, x6\n"
- "sqrdmulh v20.4s, v20.4s, v23.4s\n"
- "add x12, x12, x6\n"
- "sqrdmulh v11.4s, v11.4s, v31.4s\n"
- "and v27.16b, v13.16b, v30.16b\n"
- "and v7.16b, v10.16b, v9.16b\n"
- "and v6.16b, v19.16b, v30.16b\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "sshr v7.4s, v7.4s, #0x1f\n"
- "sshr v6.4s, v6.4s, #0x1f\n"
- "sqadd v13.4s, v13.4s, v27.4s\n"
- "sqadd v10.4s, v10.4s, v7.4s\n"
- "sqadd v19.4s, v19.4s, v6.4s\n"
- "and v3.16b, v20.16b, v9.16b\n"
- "srshl v13.4s, v13.4s, v30.4s\n"
- "srshl v10.4s, v10.4s, v9.4s\n"
- "srshl v19.4s, v19.4s, v30.4s\n"
- "sshr v3.4s, v3.4s, #0x1f\n"
- "add v13.4s, v13.4s, v14.4s\n"
- "add v10.4s, v10.4s, v14.4s\n"
- "add v19.4s, v19.4s, v14.4s\n"
- "smin v13.4s, v13.4s, v15.4s\n"
- "smin v10.4s, v10.4s, v15.4s\n"
- "smin v19.4s, v19.4s, v15.4s\n"
- "smax v13.4s, v13.4s, v16.4s\n"
- "smax v10.4s, v10.4s, v16.4s\n"
- "smax v19.4s, v19.4s, v16.4s\n"
- "sqadd v20.4s, v20.4s, v3.4s\n"
- "uzp1 v13.16b, v13.16b, v10.16b\n"
- "and v28.16b, v11.16b, v30.16b\n"
- "uzp1 v13.16b, v13.16b, v13.16b\n"
- "srshl v20.4s, v20.4s, v9.4s\n"
+ "sqdmulh v15.4s, v15.4s, v19.4s\n"
+ "sqdmulh v9.4s, v9.4s, v19.4s\n"
+ "add x10, x10, x14\n"
+ "add x9, x9, x14\n"
+ "sqdmulh v22.4s, v22.4s, v19.4s\n"
+ "sqdmulh v23.4s, v23.4s, v19.4s\n"
+ "add x28, x28, x14\n"
+ "add x27, x27, x14\n"
+ "and v30.16b, v15.16b, v0.16b\n"
+ "sqdmulh v10.4s, v10.4s, v4.4s\n"
+ "and v28.16b, v9.16b, v0.16b\n"
+ "sqdmulh v16.4s, v16.4s, v4.4s\n"
+ "and v29.16b, v22.16b, v0.16b\n"
+ "sqdmulh v21.4s, v21.4s, v4.4s\n"
+ "and v20.16b, v23.16b, v0.16b\n"
+ "sqdmulh v18.4s, v18.4s, v4.4s\n"
+ "sshr v30.4s, v30.4s, #0x1f\n"
+ "and v19.16b, v10.16b, v31.16b\n"
"sshr v28.4s, v28.4s, #0x1f\n"
- "sqrdmulh v17.4s, v17.4s, v23.4s\n"
- "sqrdmulh v18.4s, v18.4s, v31.4s\n"
- "add v20.4s, v20.4s, v14.4s\n"
- "sqadd v11.4s, v11.4s, v28.4s\n"
- "and v26.16b, v17.16b, v9.16b\n"
- "smin v20.4s, v20.4s, v15.4s\n"
- "and v8.16b, v18.16b, v30.16b\n"
- "srshl v11.4s, v11.4s, v30.4s\n"
- "smax v20.4s, v20.4s, v16.4s\n"
+ "and v4.16b, v16.16b, v31.16b\n"
+ "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v5.16b, v21.16b, v31.16b\n"
+ "sshr v20.4s, v20.4s, #0x1f\n"
+ "and v26.16b, v18.16b, v31.16b\n"
+ "sqadd v15.4s, v15.4s, v30.4s\n"
+ "sshr v19.4s, v19.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v28.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v29.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v20.4s\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "sshr v8.4s, v8.4s, #0x1f\n"
- "uzp1 v19.16b, v19.16b, v20.16b\n"
- "add v11.4s, v11.4s, v14.4s\n"
- "uzp1 v19.16b, v19.16b, v19.16b\n"
- "sqadd v17.4s, v17.4s, v26.4s\n"
- "smin v11.4s, v11.4s, v15.4s\n"
- "sqadd v18.4s, v18.4s, v8.4s\n"
- "sqrdmulh v21.4s, v21.4s, v23.4s\n"
- "smax v11.4s, v11.4s, v16.4s\n"
- "srshl v17.4s, v17.4s, v9.4s\n"
- "srshl v18.4s, v18.4s, v30.4s\n"
- "and v27.16b, v21.16b, v9.16b\n"
- "add v17.4s, v17.4s, v14.4s\n"
- "add v18.4s, v18.4s, v14.4s\n"
- "sshr v27.4s, v27.4s, #0x1f\n"
- "smin v17.4s, v17.4s, v15.4s\n"
- "smin v18.4s, v18.4s, v15.4s\n"
- "sqadd v21.4s, v21.4s, v27.4s\n"
- "smax v17.4s, v17.4s, v16.4s\n"
- "smax v18.4s, v18.4s, v16.4s\n"
- "srshl v21.4s, v21.4s, v9.4s\n"
- "uzp1 v11.16b, v11.16b, v17.16b\n"
- "uzp1 v11.16b, v11.16b, v11.16b\n"
- "add v21.4s, v21.4s, v14.4s\n"
- "smin v21.4s, v21.4s, v15.4s\n"
- "smax v21.4s, v21.4s, v16.4s\n"
- "uzp1 v18.16b, v18.16b, v21.16b\n"
- "uzp1 v18.16b, v18.16b, v18.16b\n"
- "tbz x3, #2, 85f\n"
- "st1 { v13.s }[0], [x15], #0x4\n"
- "st1 { v19.s }[0], [x14], #0x4\n"
- "st1 { v11.s }[0], [x13], #0x4\n"
- "st1 { v18.s }[0], [x12], #0x4\n"
- "tbz x3, #1, 84f\n"
- "st1 { v13.h }[2], [x15], #0x2\n"
- "st1 { v19.h }[2], [x14], #0x2\n"
- "st1 { v11.h }[2], [x13], #0x2\n"
- "st1 { v18.h }[2], [x12], #0x2\n"
- "tbz x3, #0, 87f\n"
- "st1 { v13.b }[6], [x15], #0x1\n"
- "st1 { v19.b }[6], [x14], #0x1\n"
- "st1 { v11.b }[6], [x13], #0x1\n"
- "st1 { v18.b }[6], [x12], #0x1\n"
+ "srshl v15.4s, v15.4s, v0.4s\n"
+ "sqadd v10.4s, v10.4s, v19.4s\n"
+ "srshl v9.4s, v9.4s, v0.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "sqadd v18.4s, v18.4s, v26.4s\n"
+ "srshl v10.4s, v10.4s, v31.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v16.4s, v16.4s, v31.4s\n"
+ "sqxtn v9.4h, v9.4s\n"
+ "srshl v21.4s, v21.4s, v31.4s\n"
+ "sqxtn v22.4h, v22.4s\n"
+ "srshl v18.4s, v18.4s, v31.4s\n"
+ "sqxtn v23.4h, v23.4s\n"
+ "sqxtn2 v15.8h, v10.4s\n"
+ "sqxtn2 v9.8h, v16.4s\n"
+ "sqxtn2 v22.8h, v21.4s\n"
+ "sqxtn2 v23.8h, v18.4s\n"
+ "sqadd v15.8h, v15.8h, v11.8h\n"
+ "sqadd v9.8h, v9.8h, v11.8h\n"
+ "sqadd v22.8h, v22.8h, v11.8h\n"
+ "sqadd v23.8h, v23.8h, v11.8h\n"
+ "smax v15.8h, v15.8h, v17.8h\n"
+ "smax v9.8h, v9.8h, v17.8h\n"
+ "smax v22.8h, v22.8h, v17.8h\n"
+ "smax v23.8h, v23.8h, v17.8h\n"
+ "smin v15.8h, v15.8h, v14.8h\n"
+ "smin v9.8h, v9.8h, v14.8h\n"
+ "smin v22.8h, v22.8h, v14.8h\n"
+ "smin v23.8h, v23.8h, v14.8h\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "uzp1 v9.16b, v9.16b, v9.16b\n"
+ "uzp1 v22.16b, v22.16b, v22.16b\n"
+ "uzp1 v23.16b, v23.16b, v23.16b\n"
+ "tbz x8, #2, 85f\n"
+ "st1 { v15.s }[0], [x10], #0x4\n"
+ "st1 { v9.s }[0], [x9], #0x4\n"
+ "st1 { v22.s }[0], [x28], #0x4\n"
+ "st1 { v23.s }[0], [x27], #0x4\n"
+ "tbz x8, #1, 84f\n"
+ "st1 { v15.h }[2], [x10], #0x2\n"
+ "st1 { v9.h }[2], [x9], #0x2\n"
+ "st1 { v22.h }[2], [x28], #0x2\n"
+ "st1 { v23.h }[2], [x27], #0x2\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[6], [x10], #0x1\n"
+ "st1 { v9.b }[6], [x9], #0x1\n"
+ "st1 { v22.b }[6], [x28], #0x1\n"
+ "st1 { v23.b }[6], [x27], #0x1\n"
"b 87f\n"
"84:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x3, #0, 87f\n"
- "st1 { v13.b }[4], [x15], #0x1\n"
- "st1 { v19.b }[4], [x14], #0x1\n"
- "st1 { v11.b }[4], [x13], #0x1\n"
- "st1 { v18.b }[4], [x12], #0x1\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[4], [x10], #0x1\n"
+ "st1 { v9.b }[4], [x9], #0x1\n"
+ "st1 { v22.b }[4], [x28], #0x1\n"
+ "st1 { v23.b }[4], [x27], #0x1\n"
"b 87f\n"
"85:" // Oddments: Bit 2: Unset
- "tbz x3, #1, 86f\n"
- "st1 { v13.h }[0], [x15], #0x2\n"
- "st1 { v19.h }[0], [x14], #0x2\n"
- "st1 { v11.h }[0], [x13], #0x2\n"
- "st1 { v18.h }[0], [x12], #0x2\n"
- "tbz x3, #0, 87f\n"
- "st1 { v13.b }[2], [x15], #0x1\n"
- "st1 { v19.b }[2], [x14], #0x1\n"
- "st1 { v11.b }[2], [x13], #0x1\n"
- "st1 { v18.b }[2], [x12], #0x1\n"
+ "tbz x8, #1, 86f\n"
+ "st1 { v15.h }[0], [x10], #0x2\n"
+ "st1 { v9.h }[0], [x9], #0x2\n"
+ "st1 { v22.h }[0], [x28], #0x2\n"
+ "st1 { v23.h }[0], [x27], #0x2\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[2], [x10], #0x1\n"
+ "st1 { v9.b }[2], [x9], #0x1\n"
+ "st1 { v22.b }[2], [x28], #0x1\n"
+ "st1 { v23.b }[2], [x27], #0x1\n"
"b 87f\n"
"86:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x3, #0, 87f\n"
- "st1 { v13.b }[0], [x15], #0x1\n"
- "st1 { v19.b }[0], [x14], #0x1\n"
- "st1 { v11.b }[0], [x13], #0x1\n"
- "st1 { v18.b }[0], [x12], #0x1\n"
+ "tbz x8, #0, 87f\n"
+ "st1 { v15.b }[0], [x10], #0x1\n"
+ "st1 { v9.b }[0], [x9], #0x1\n"
+ "st1 { v22.b }[0], [x28], #0x1\n"
+ "st1 { v23.b }[0], [x27], #0x1\n"
"87:" // Oddments: Bit 2: End
-
"88:" // End
-
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
);
}
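
The requantization tail in the kernel above follows a fixed pattern per output lane: a saturating doubling multiply-high against the per-channel multiplier (sqdmulh), a sign fixup plus rounding shift (and/sshr/sqadd/srshl), a saturating narrow to 16 bits (sqxtn/sqxtn2), and finally the output offset, clamping, and narrow to 8 bits (sqadd/smax/smin/uzp1). A minimal scalar model of one lane is sketched below, assuming a non-negative right-shift amount; the function and parameter names (requantize_lane, mul, rshift, c_offset, minval, maxval) are illustrative only and are not taken from the library.

#include <cstdint>
#include <algorithm>

// Scalar sketch of the per-lane requantization arithmetic performed by the
// vector tail above. All names here are hypothetical.
static inline uint8_t requantize_lane(int32_t acc, int32_t mul, int rshift,
                                      int16_t c_offset, int16_t minval, int16_t maxval)
{
    // sqdmulh: high 32 bits of the saturating doubling product.
    int32_t hi;
    if (acc == INT32_MIN && mul == INT32_MIN)
        hi = INT32_MAX;  // the one saturating case
    else
        hi = static_cast<int32_t>((2 * static_cast<int64_t>(acc) * mul) >> 32);

    // and/sshr/sqadd: nudge negative values down by one so that the
    // subsequent rounding shift resolves ties away from zero.
    if (hi < 0 && rshift > 0)
        hi--;

    // srshl by a negative amount: rounding arithmetic right shift.
    int32_t shifted = (rshift > 0) ? ((hi + (1 << (rshift - 1))) >> rshift) : hi;

    // sqxtn: saturate to 16 bits before the offset is applied.
    int32_t narrowed = std::clamp<int32_t>(shifted, INT16_MIN, INT16_MAX);

    // sqadd/smax/smin: add the output offset, clamp to the activation range.
    int32_t out = std::clamp<int32_t>(narrowed + c_offset, minval, maxval);

    // uzp1: keep the low byte of each 16-bit lane.
    return static_cast<uint8_t>(out);
}

One visible effect of the new sequence is that the offset add and clamp now run at 16-bit width on eight lanes per register (sqadd/smax/smin on .8h) instead of 32-bit width on four (add/smin/smax on .4s), roughly halving the instruction count of that stage.
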
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index d3d5000d4c..ea70b56349 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,37 +36,24 @@ namespace depthwise {
void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
-struct a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+class a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef int8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_5x5_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_5x5_mla::get_packed_size;
+ a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
- kern_type kernel = a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
- a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 97156137bf..291ffec0bb 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
struct Params
{
long unsigned int n_channels;
- const int8_t *weights;
+ const void *weights;
const int32_t *bias;
const arm_gemm::Requantize32 *requant;
const int32_t *const requant_muls;
@@ -57,7 +57,7 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
Params(
long unsigned int n_channels,
const uint8_t *const *inptrs_raw,
- const int8_t *const weights,
+ const void *const weights,
const int32_t *const bias,
const arm_gemm::Requantize32 &qp,
const int32_t *const requant_muls,
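A note on the hunk above: Params::weights is type-erased from const int8_t * to const void *. A plausible reading, offered as an inference rather than something the patch states, is that the pointer now designates an opaque packed-parameter buffer whose layout is owned by the packing code; the kernel simply loads the base address (ldr x23, [%x[params], %[offsetof_Params_weights]]) and indexes it with byte offsets (ldr d0, [x23, #0x0], ldr d1, [x23, #0x8], and so on in the assembly below). A toy sketch of that idiom, with a layout invented purely for illustration:

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy type-erased parameter block. The "25 taps x 8 channels of int8"
// layout is a guess made for this example; the authoritative layout
// belongs to the parameter-packing routines, not to this struct.
struct Params
{
  unsigned long n_channels;
  const void *weights;  // was: const int8_t *weights
};

void consume(const Params &params)
{
  // Re-interpret the opaque base pointer and walk it with explicit byte
  // offsets, mirroring how the assembly indexes [x23, #0x0], [x23, #0x8], ...
  const int8_t *w = static_cast<const int8_t *>(params.weights);
  std::printf("tap 0, channel 0: %d\n", static_cast<int>(w[0]));
  std::printf("tap 1, channel 0: %d\n", static_cast<int>(w[8]));
}

int main(void)
{
  std::vector<int8_t> packed(25 * 8, 3);  // dummy packed weights
  Params params{8, packed.data()};
  consume(params);
  return 0;
}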
@@ -111,2096 +111,2070 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
requant_muls, requant_shifts, outptrs);
__asm__ __volatile__(
- "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
- "mov x10, #0x0\n"
- "ldr x3, [%x[params], %[offsetof_Params_weights]]\n"
- "mov x1, #0x0\n"
- "ldr x22, [%x[params], %[offsetof_Params_requant]]\n"
- "add x25, %x[params], %[offsetof_Params_inptrs]\n"
- "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n"
- "lsr x19, x4, #0x3\n"
- "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n"
- "add x13, x22, %[offsetof_Requantize32_a_offset]\n"
- "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n"
- "add x20, x22, %[offsetof_Requantize32_b_offset]\n"
- "ld1r { v9.16b }, [x13]\n"
- "add x8, x22, %[offsetof_Requantize32_c_offset]\n"
- "ld1r { v14.16b }, [x20]\n"
- "add x20, x22, %[offsetof_Requantize32_minval]\n"
- "ld1r { v10.4s }, [x8]\n"
- "add x8, x22, %[offsetof_Requantize32_maxval]\n"
- "ld1r { v11.4s }, [x20]\n"
- "ld1r { v13.4s }, [x8]\n"
- "ldp x17, x16, [x21, #0x0]\n"
- "ldp x6, x8, [x21, #0x10]\n"
- "cbz x19, 3f\n"
- "subs x19, x19, #0x1\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x12, #0x0]\n"
- "mov v16.16b, v15.16b\n"
- "ldr q18, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
- "mov v7.16b, v15.16b\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v8.16b, v15.16b\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v21.16b, v18.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "mov v17.16b, v18.16b\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v5.16b, v18.16b\n"
- "ldr d4, [x3, #0x20]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant]]\n"
+ "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n"
+ "add x17, x10, %[offsetof_Requantize32_a_offset]\n"
+ "add x9, x10, %[offsetof_Requantize32_b_offset]\n"
+ "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n"
+ "add x4, x10, %[offsetof_Requantize32_c_offset]\n"
+ "add x14, x10, %[offsetof_Requantize32_minval]\n"
+ "ldr x23, [%x[params], %[offsetof_Params_weights]]\n"
+ "add x5, x10, %[offsetof_Requantize32_maxval]\n"
+ "ld1r { v9.16b }, [x17]\n"
+ "ld1r { v14.16b }, [x9]\n"
+ "lsr x3, x0, #0x3\n"
+ "ld1r { v18.8h }, [x4]\n"
+ "ld1r { v11.8h }, [x14]\n"
+ "mov x24, #0x0\n"
+ "mov x22, #0x0\n"
+ "ld1r { v13.8h }, [x5]\n"
+ "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n"
+ "add x20, %x[params], %[offsetof_Params_inptrs]\n"
+ "ldr x1, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+ "ldp x16, x8, [x25, #0x0]\n"
+ "ldp x4, x7, [x25, #0x10]\n"
+ "cbz x3, 3f\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x19, #0x0]\n"
+ "subs x3, x3, #0x1\n"
+ "mov v17.16b, v15.16b\n"
+ "ldr q16, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d0, [x23, #0x0]\n"
+ "ldr d1, [x23, #0x8]\n"
+ "ldr d2, [x23, #0x10]\n"
+ "mov v8.16b, v16.16b\n"
+ "mov v10.16b, v15.16b\n"
+ "ldr d3, [x23, #0x18]\n"
+ "ldr d4, [x23, #0x20]\n"
+ "mov v7.16b, v16.16b\n"
+ "mov v6.16b, v15.16b\n"
+ "ldp x28, x6, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "mov v5.16b, v16.16b\n"
"ssubl v0.8h, v0.8b, v14.8b\n"
- "ldp x28, x27, [x25, #0x0]\n"
+ "ldp x5, x2, [x20, #0x20]\n"
+ "ldp x27, x21, [x20, #0x30]\n"
"ssubl v1.8h, v1.8b, v14.8b\n"
- "ldp x26, x13, [x25, #0x10]\n"
"ssubl v2.8h, v2.8b, v14.8b\n"
+ "ldp x12, x19, [x20, #0x40]\n"
+ "ldr d31, [x28, x24]\n"
"ssubl v3.8h, v3.8b, v14.8b\n"
- "ldp x24, x23, [x25, #0x20]\n"
"ssubl v4.8h, v4.8b, v14.8b\n"
- "ldp x22, x21, [x25, #0x30]\n"
- "ldp x20, x0, [x25, #0x40]\n"
- "ldr d31, [x28, x10]\n"
+ "ldr d30, [x6, x24]\n"
+ "ldr d29, [x26, x24]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "ldr d30, [x27, x10]\n"
- "ldr d29, [x26, x10]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "ldr d28, [x13, x10]\n"
+ "ldr d28, [x25, x24]\n"
+ "ldr d27, [x5, x24]\n"
"usubl v29.8h, v29.8b, v9.8b\n"
- "ldr d27, [x24, x10]\n"
- "ldr d23, [x23, x10]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "ldr d25, [x22, x10]\n"
- "ldr d24, [x21, x10]\n"
+ "ldr d23, [x2, x24]\n"
+ "ldr d25, [x27, x24]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "ldr d26, [x20, x10]\n"
- "ldr d22, [x0, x10]\n"
+ "ldr d24, [x21, x24]\n"
+ "ldr d26, [x12, x24]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
"usubl v24.8h, v24.8b, v9.8b\n"
+ "ldr d22, [x19, x24]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
"usubl v22.8h, v22.8b, v9.8b\n"
"beq 2f\n"
"1:" // Loop
"smlal v15.4s, v31.4h, v0.4h\n"
- "ldr x20, [x25, #0x50]\n"
- "subs x19, x19, #0x1\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr x28, [x25, #0x58]\n"
- "smlal v16.4s, v30.4h, v0.4h\n"
- "ldr x0, [x25, #0x60]\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x10]\n"
- "smlal v7.4s, v29.4h, v0.4h\n"
- "ldr x7, [x25, #0x68]\n"
- "smlal2 v17.4s, v29.8h, v0.8h\n"
- "ldr x26, [x25, #0x70]\n"
- "smlal v8.4s, v28.4h, v0.4h\n"
- "ldr x23, [x25, #0x78]\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "ldr d0, [x3, #0x28]\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "ldr x19, [x20, #0x50]\n"
+ "ldr d31, [x19, x24]\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal v10.4s, v29.4h, v0.4h\n"
+ "ldr x15, [x20, #0x58]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v6.4s, v28.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x20, #0x60]\n"
+ "ldr x27, [x20, #0x68]\n"
+ "smlal2 v7.4s, v29.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "ldr x20, [x25, #0x80]\n"
- "smlal2 v18.4s, v30.8h, v1.8h\n"
- "ldr d30, [x28, x10]\n"
- "smlal v16.4s, v27.4h, v1.4h\n"
- "ldr x22, [x25, #0x88]\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "ldr x13, [x25, #0x90]\n"
- "smlal v7.4s, v28.4h, v1.4h\n"
- "ldr x21, [x25, #0x98]\n"
- "smlal2 v17.4s, v28.8h, v1.8h\n"
- "ldr x14, [x25, #0xa0]\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "ldr x11, [x25, #0xa8]\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "ldr d1, [x3, #0x30]\n"
+ "ldr x5, [x20, #0x70]\n"
+ "ldr x11, [x20, #0x78]\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x15, x24]\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "ldr d0, [x23, #0x28]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v27.8h, v1.8h\n"
+ "ldr x12, [x20, #0x80]\n"
+ "ldr x26, [x20, #0x88]\n"
+ "smlal2 v7.4s, v28.8h, v1.8h\n"
"smlal v15.4s, v27.4h, v2.4h\n"
- "ldr x24, [x25, #0xb0]\n"
- "smlal2 v18.4s, v27.8h, v2.8h\n"
- "ldr d27, [x0, x10]\n"
- "smlal v16.4s, v25.4h, v2.4h\n"
- "ldr x0, [x25, #0xb8]\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr x15, [x25, #0xc0]\n"
- "smlal v7.4s, v23.4h, v2.4h\n"
- "ldr x9, [x25, #0xc8]\n"
- "smlal2 v17.4s, v23.8h, v2.8h\n"
- "ldr x27, [x25, #0xd0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x28, [x25, #0xd8]\n"
+ "ldr x14, [x20, #0x90]\n"
+ "ldr x15, [x20, #0x98]\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x19, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal v10.4s, v23.4h, v2.4h\n"
+ "ldr d1, [x23, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "ldr x21, [x20, #0xa0]\n"
+ "ldr x2, [x20, #0xa8]\n"
+ "smlal2 v7.4s, v23.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "ldr q6, [x2, #0x0]\n"
- "smlal2 v18.4s, v25.8h, v3.8h\n"
- "ldr d25, [x7, x10]\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "ldr x12, [x25, #0xe0]\n"
+ "ldr x13, [x20, #0xb0]\n"
+ "ldr x9, [x20, #0xb8]\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
"smlal2 v5.4s, v31.8h, v2.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "smlal v16.4s, v24.4h, v3.4h\n"
- "ldr q19, [x5, #0x0]\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr q20, [x2, #0x10]\n"
- "add x2, x2, #0x20\n"
- "smlal v7.4s, v31.4h, v3.4h\n"
- "ldr q12, [x5, #0x10]\n"
- "add x5, x5, #0x20\n"
- "smlal2 v17.4s, v31.8h, v3.8h\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
+ "ldr d25, [x27, x24]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v10.4s, v31.4h, v3.4h\n"
+ "ldr d2, [x23, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "ldr x19, [x20, #0xc0]\n"
+ "ldr x28, [x20, #0xc8]\n"
+ "smlal2 v7.4s, v31.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v18.4s, v24.8h, v4.8h\n"
- "ldr d24, [x26, x10]\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
- "ldr x7, [x25, #0xe8]\n"
+ "ldr x6, [x20, #0xd0]\n"
+ "ldr x27, [x20, #0xd8]\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
"smlal2 v5.4s, v30.8h, v3.8h\n"
- "ldr d3, [x3, #0x40]\n"
+ "ldr d24, [x5, x24]\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal v10.4s, v30.4h, v4.4h\n"
+ "ldr d3, [x23, #0x40]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x11, x24]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v7.4s, v30.4h, v4.4h\n"
- "smlal2 v17.4s, v30.8h, v4.8h\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "ldr d27, [x23, x10]\n"
- "smlal v8.4s, v26.4h, v4.4h\n"
- "ldr x26, [x25, #0xf0]\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal2 v7.4s, v30.8h, v4.8h\n"
"smlal v15.4s, v29.4h, v0.4h\n"
- "smlal2 v18.4s, v29.8h, v0.8h\n"
- "smlal v16.4s, v28.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "smlal v7.4s, v22.4h, v0.4h\n"
- "smlal2 v17.4s, v22.8h, v0.8h\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
+ "ldr x11, [x20, #0xe0]\n"
+ "ldr x17, [x20, #0xe8]\n"
+ "smlal2 v16.4s, v29.8h, v0.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x23, #0x48]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal v10.4s, v22.4h, v0.4h\n"
+ "ldr x5, [x20, #0xf0]\n"
+ "ldr q12, [x10, #0x0]\n"
+ "smlal v6.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v28.8h, v0.8h\n"
+ "ldr q19, [x1, #0x0]\n"
+ "ldr q20, [x10, #0x10]\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
"smlal v15.4s, v28.4h, v1.4h\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x10]\n"
- "smlal v16.4s, v23.4h, v1.4h\n"
- "ldr x23, [x25, #0xf8]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "smlal v7.4s, v25.4h, v1.4h\n"
- "smlal2 v17.4s, v25.8h, v1.8h\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
+ "ldr q29, [x1, #0x10]\n"
+ "subs x3, x3, #0x1\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "ldr d28, [x26, x24]\n"
+ "ldr d0, [x23, #0x50]\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "smlal v10.4s, v25.4h, v1.4h\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "ldr x25, [x20, #0xf8]\n"
+ "smlal v6.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "add x10, x10, #0x20\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
"smlal v15.4s, v23.4h, v2.4h\n"
- "smlal2 v18.4s, v23.8h, v2.8h\n"
- "ldr d23, [x20, x10]\n"
- "smlal v16.4s, v31.4h, v2.4h\n"
- "ldr x22, [x25, #0x100]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v7.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
+ "add x1, x1, #0x20\n"
+ "smlal2 v16.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x12, x24]\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "smlal v10.4s, v24.4h, v2.4h\n"
+ "ldr d1, [x23, #0x58]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "ldr x26, [x20, #0x100]\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
"smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr d31, [x13, x10]\n"
- "smlal v16.4s, v30.4h, v3.4h\n"
- "ldr x20, [x25, #0x108]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "smlal v7.4s, v27.4h, v3.4h\n"
- "smlal2 v17.4s, v27.8h, v3.8h\n"
- "smlal v8.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v18.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x10]\n"
- "smlal v16.4s, v26.4h, v4.4h\n"
- "ldr x13, [x25, #0x110]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x10]\n"
- "smlal v7.4s, v23.4h, v4.4h\n"
- "ldr x21, [x25, #0x118]\n"
- "smlal2 v17.4s, v23.8h, v4.8h\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal2 v16.4s, v31.8h, v3.8h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "ldr d31, [x14, x24]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v15.4s, v22.4h, v0.4h\n"
- "smlal2 v18.4s, v22.8h, v0.8h\n"
- "ldr d22, [x0, x10]\n"
- "smlal v16.4s, v25.4h, v0.4h\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "smlal v7.4s, v31.4h, v0.4h\n"
- "smlal2 v17.4s, v31.8h, v0.8h\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "ldr d2, [x23, #0x60]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "ldr x12, [x20, #0x108]\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "smlal v15.4s, v30.4h, v4.4h\n"
+ "smlal2 v16.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x15, x24]\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal v10.4s, v23.4h, v4.4h\n"
+ "ldr d3, [x23, #0x68]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x21, x24]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
+ "smlal2 v7.4s, v23.8h, v4.8h\n"
+ "smlal v15.4s, v22.4h, v0.4h\n"
+ "ldr x14, [x20, #0x110]\n"
+ "ldr x21, [x20, #0x118]\n"
+ "smlal2 v16.4s, v22.8h, v0.8h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x23, #0x70]\n"
+ "ldr d22, [x9, x24]\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "smlal v10.4s, v31.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
"smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v18.4s, v25.8h, v1.8h\n"
- "ldr d25, [x11, x10]\n"
- "smlal v16.4s, v24.4h, v1.4h\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "smlal v7.4s, v30.4h, v1.4h\n"
- "smlal2 v17.4s, v30.8h, v1.8h\n"
- "smlal v8.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal2 v16.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x2, x24]\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal v10.4s, v30.4h, v1.4h\n"
+ "ldr d0, [x23, #0x78]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v18.4s, v24.8h, v2.8h\n"
- "ldr d24, [x24, x10]\n"
- "smlal v16.4s, v27.4h, v2.4h\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "smlal v7.4s, v26.4h, v2.4h\n"
- "smlal2 v17.4s, v26.8h, v2.8h\n"
- "smlal v8.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v18.4s, v27.8h, v3.8h\n"
- "ldr d27, [x15, x10]\n"
- "smlal v16.4s, v23.4h, v3.4h\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "smlal v7.4s, v25.4h, v3.4h\n"
- "smlal2 v17.4s, v25.8h, v3.8h\n"
+ "smlal2 v16.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x13, x24]\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "smlal v8.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v18.4s, v23.8h, v4.8h\n"
- "ldr d23, [x9, x10]\n"
- "smlal v16.4s, v28.4h, v4.4h\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d28, [x12, x10]\n"
- "smlal v7.4s, v24.4h, v4.4h\n"
- "smlal2 v17.4s, v24.8h, v4.8h\n"
- "smlal v8.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal v10.4s, v26.4h, v2.4h\n"
+ "ldr d1, [x23, #0x80]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "smlal v15.4s, v27.4h, v3.4h\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr d27, [x19, x24]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "ldr d2, [x23, #0x88]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v8.4s, v23.8h, v3.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal v15.4s, v23.4h, v4.4h\n"
+ "smlal2 v16.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x28, x24]\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
"usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "ldr d3, [x23, #0x90]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v22.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x11, x24]\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
"smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr d31, [x27, x10]\n"
- "smlal v16.4s, v30.4h, v0.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "smlal v7.4s, v27.4h, v0.4h\n"
- "smlal2 v17.4s, v27.8h, v0.8h\n"
- "smlal v8.4s, v23.4h, v0.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x6, x24]\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal v10.4s, v27.4h, v0.4h\n"
+ "ldr d4, [x23, #0x98]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v18.4s, v30.8h, v1.8h\n"
- "ldr d30, [x28, x10]\n"
- "smlal v16.4s, v26.4h, v1.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "smlal v7.4s, v23.4h, v1.4h\n"
- "smlal2 v17.4s, v23.8h, v1.8h\n"
- "smlal v8.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v18.4s, v26.8h, v2.8h\n"
- "ldr d26, [x7, x10]\n"
- "smlal v16.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v7.4s, v31.4h, v2.4h\n"
- "smlal2 v17.4s, v31.8h, v2.8h\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x27, x24]\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v18.4s, v25.8h, v3.8h\n"
- "ldr d25, [x26, x10]\n"
- "smlal v16.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "smlal v7.4s, v30.4h, v3.4h\n"
- "smlal2 v17.4s, v30.8h, v3.8h\n"
- "smlal v8.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "ldr d0, [x23, #0xa0]\n"
"ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v31.4h, v1.4h\n"
+ "smlal2 v8.4s, v26.8h, v1.8h\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "smlal v15.4s, v26.4h, v2.4h\n"
+ "smlal2 v16.4s, v26.8h, v2.8h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "ldr d26, [x17, x24]\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "ldr d1, [x23, #0xa8]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d25, [x5, x24]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "ldr d2, [x23, #0xb0]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v28.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v18.4s, v24.8h, v4.8h\n"
- "ldr d24, [x23, x10]\n"
- "smlal v16.4s, v22.4h, v4.4h\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "smlal v7.4s, v28.4h, v4.4h\n"
- "smlal2 v17.4s, v28.8h, v4.8h\n"
- "smlal v8.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
- "add x3, x3, #0xc8\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x25, x24]\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "ldr d3, [x23, #0xb8]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v7.4s, v28.8h, v4.8h\n"
"smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v18.4s, v27.8h, v0.8h\n"
- "ldr d27, [x22, x10]\n"
- "smlal v16.4s, v23.4h, v0.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "smlal2 v16.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x26, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v8.4s, v22.8h, v4.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x23, #0xc0]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "add x23, x23, #0xc8\n"
+ "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal2 v7.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x12, x24]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v7.4s, v25.4h, v0.4h\n"
- "smlal2 v17.4s, v25.8h, v0.8h\n"
- "ldr d25, [x20, x10]\n"
- "smlal v8.4s, v24.4h, v0.4h\n"
+ "smlal2 v8.4s, v23.8h, v0.8h\n"
"smlal2 v5.4s, v24.8h, v0.8h\n"
"smlal v15.4s, v23.4h, v1.4h\n"
- "smlal2 v18.4s, v23.8h, v1.8h\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "smlal v7.4s, v24.4h, v1.4h\n"
- "smlal2 v17.4s, v24.8h, v1.8h\n"
- "ldr d24, [x13, x10]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v27.4h, v1.4h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x14, x24]\n"
+ "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal2 v8.4s, v31.8h, v1.8h\n"
"smlal2 v5.4s, v27.8h, v1.8h\n"
"smlal v15.4s, v31.4h, v2.4h\n"
- "smlal2 v18.4s, v31.8h, v2.8h\n"
- "smlal v16.4s, v30.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "smlal v7.4s, v27.4h, v2.4h\n"
- "smlal2 v17.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x10]\n"
- "add x10, x10, #0x8\n"
- "smlal v8.4s, v25.4h, v2.4h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x24]\n"
+ "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v8.4s, v30.8h, v2.8h\n"
"smlal2 v5.4s, v25.8h, v2.8h\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
+ "add x24, x24, #0x8\n"
"smlal v15.4s, v30.4h, v3.4h\n"
- "smlal2 v18.4s, v30.8h, v3.8h\n"
- "smlal v16.4s, v28.4h, v3.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "smlal v7.4s, v25.4h, v3.4h\n"
- "smlal2 v17.4s, v25.8h, v3.8h\n"
- "smlal v8.4s, v24.4h, v3.4h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v16.4s, v30.8h, v3.8h\n"
+ "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
"smlal2 v5.4s, v24.8h, v3.8h\n"
"smlal v15.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "smlal v16.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "smlal v7.4s, v24.4h, v4.4h\n"
- "smlal2 v17.4s, v24.8h, v4.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "sqrdmulh v15.4s, v15.4s, v6.4s\n"
- "sqrdmulh v18.4s, v18.4s, v20.4s\n"
- "smlal v8.4s, v27.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "sqdmulh v15.4s, v15.4s, v12.4s\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
+ "sqdmulh v17.4s, v17.4s, v12.4s\n"
+ "smlal2 v16.4s, v28.8h, v4.8h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "sqdmulh v10.4s, v10.4s, v12.4s\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
"smlal2 v5.4s, v27.8h, v4.8h\n"
- "and v28.16b, v15.16b, v19.16b\n"
- "and v26.16b, v18.16b, v12.16b\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqdmulh v6.4s, v6.4s, v12.4s\n"
+ "and v23.16b, v15.16b, v19.16b\n"
+ "sqdmulh v16.4s, v16.4s, v20.4s\n"
+ "and v22.16b, v17.16b, v19.16b\n"
+ "sqdmulh v8.4s, v8.4s, v20.4s\n"
+ "and v21.16b, v10.16b, v19.16b\n"
+ "sqdmulh v7.4s, v7.4s, v20.4s\n"
+ "and v26.16b, v6.16b, v19.16b\n"
+ "sqdmulh v5.4s, v5.4s, v20.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v2.16b, v8.16b, v29.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v7.16b, v29.16b\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "sqrdmulh v21.4s, v21.4s, v20.4s\n"
- "sqadd v15.4s, v15.4s, v28.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "and v29.16b, v16.16b, v19.16b\n"
- "and v4.16b, v21.16b, v12.16b\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "srshl v18.4s, v18.4s, v12.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v25.16b, v5.16b, v29.16b\n"
+ "sqadd v15.4s, v15.4s, v23.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "add v15.4s, v15.4s, v10.4s\n"
- "add v18.4s, v18.4s, v10.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "smin v15.4s, v15.4s, v13.4s\n"
- "smin v18.4s, v18.4s, v13.4s\n"
- "sqadd v21.4s, v21.4s, v4.4s\n"
- "smax v15.4s, v15.4s, v11.4s\n"
- "smax v18.4s, v18.4s, v11.4s\n"
- "srshl v16.4s, v16.4s, v19.4s\n"
- "srshl v21.4s, v21.4s, v12.4s\n"
- "uzp1 v15.16b, v15.16b, v18.16b\n"
- "sqrdmulh v7.4s, v7.4s, v6.4s\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x17, x1]\n"
- "add v16.4s, v16.4s, v10.4s\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "and v25.16b, v7.16b, v19.16b\n"
- "sqrdmulh v17.4s, v17.4s, v20.4s\n"
- "smin v16.4s, v16.4s, v13.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
"sshr v25.4s, v25.4s, #0x1f\n"
- "smax v16.4s, v16.4s, v11.4s\n"
- "smax v21.4s, v21.4s, v11.4s\n"
- "sqadd v7.4s, v7.4s, v25.4s\n"
- "and v31.16b, v17.16b, v12.16b\n"
- "uzp1 v16.16b, v16.16b, v21.16b\n"
- "sqrdmulh v8.4s, v8.4s, v6.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d16, [x16, x1]\n"
- "srshl v7.4s, v7.4s, v19.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v24.16b, v8.16b, v19.16b\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sqadd v17.4s, v17.4s, v31.4s\n"
- "add v7.4s, v7.4s, v10.4s\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "and v1.16b, v5.16b, v12.16b\n"
- "smin v7.4s, v7.4s, v13.4s\n"
- "srshl v17.4s, v17.4s, v12.4s\n"
- "sqadd v8.4s, v8.4s, v24.4s\n"
- "smax v7.4s, v7.4s, v11.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "add v17.4s, v17.4s, v10.4s\n"
- "srshl v8.4s, v8.4s, v19.4s\n"
- "sqadd v5.4s, v5.4s, v1.4s\n"
- "smin v17.4s, v17.4s, v13.4s\n"
- "add v8.4s, v8.4s, v10.4s\n"
- "smax v17.4s, v17.4s, v11.4s\n"
- "srshl v5.4s, v5.4s, v12.4s\n"
- "smin v8.4s, v8.4s, v13.4s\n"
- "uzp1 v7.16b, v7.16b, v17.16b\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "str d7, [x6, x1]\n"
- "smax v8.4s, v8.4s, v11.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "smax v5.4s, v5.4s, v11.4s\n"
- "uzp1 v8.16b, v8.16b, v5.16b\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "str d8, [x8, x1]\n"
- "add x1, x1, #0x8\n"
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "ldr q15, [x12, #0x0]\n"
- "mov v16.16b, v15.16b\n"
- "ldr q18, [x12, #0x10]\n"
- "add x12, x12, #0x20\n"
- "mov v7.16b, v15.16b\n"
- "str x12, [%x[params], %[offsetof_Params_bias]]\n"
- "mov v8.16b, v15.16b\n"
- "ldr d0, [x3, #0x0]\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v21.16b, v18.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "mov v17.16b, v18.16b\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v5.16b, v18.16b\n"
- "ldr d4, [x3, #0x20]\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v17.4s, v17.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v2.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v7.4s, v7.4s, v3.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "sqadd v5.4s, v5.4s, v25.4s\n"
+ "srshl v16.4s, v16.4s, v29.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v7.4s, v7.4s, v29.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v15.8h, v16.4s\n"
+ "sqxtn2 v17.8h, v8.4s\n"
+ "sqxtn2 v10.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v15.8h, v15.8h, v18.8h\n"
+ "sqadd v17.8h, v17.8h, v18.8h\n"
+ "sqadd v10.8h, v10.8h, v18.8h\n"
+ "sqadd v6.8h, v6.8h, v18.8h\n"
+ "smax v15.8h, v15.8h, v11.8h\n"
+ "smax v17.8h, v17.8h, v11.8h\n"
+ "smax v10.8h, v10.8h, v11.8h\n"
+ "smax v6.8h, v6.8h, v11.8h\n"
+ "smin v15.8h, v15.8h, v13.8h\n"
+ "smin v17.8h, v17.8h, v13.8h\n"
+ "smin v10.8h, v10.8h, v13.8h\n"
+ "smin v6.8h, v6.8h, v13.8h\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d15, [x16, x22]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d17, [x8, x22]\n"
+ "str d10, [x4, x22]\n"
+ "str d6, [x7, x22]\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr q15, [x19, #0x0]\n"
+ "add x22, x22, #0x8\n"
+ "ldr q16, [x19, #0x10]\n"
+ "add x19, x19, #0x20\n"
+ "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "ldr d0, [x23, #0x0]\n"
+ "ldr d1, [x23, #0x8]\n"
+ "ldr d2, [x23, #0x10]\n"
+ "mov v17.16b, v15.16b\n"
+ "mov v8.16b, v16.16b\n"
+ "ldr d3, [x23, #0x18]\n"
+ "ldr d4, [x23, #0x20]\n"
+ "mov v10.16b, v15.16b\n"
+ "mov v7.16b, v16.16b\n"
+ "ldp x28, x6, [x20, #0x0]\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "mov v6.16b, v15.16b\n"
+ "mov v5.16b, v16.16b\n"
+ "ldp x5, x2, [x20, #0x20]\n"
+ "ldp x27, x21, [x20, #0x30]\n"
"ssubl v0.8h, v0.8b, v14.8b\n"
- "ldp x28, x27, [x25, #0x0]\n"
"ssubl v1.8h, v1.8b, v14.8b\n"
- "ldp x26, x13, [x25, #0x10]\n"
+ "ldp x12, x19, [x20, #0x40]\n"
+ "ldr d31, [x28, x24]\n"
"ssubl v2.8h, v2.8b, v14.8b\n"
"ssubl v3.8h, v3.8b, v14.8b\n"
- "ldp x24, x23, [x25, #0x20]\n"
+ "ldr d30, [x6, x24]\n"
+ "ldr d29, [x26, x24]\n"
"ssubl v4.8h, v4.8b, v14.8b\n"
- "ldp x22, x21, [x25, #0x30]\n"
- "ldp x20, x0, [x25, #0x40]\n"
- "ldr d31, [x28, x10]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "ldr d30, [x27, x10]\n"
- "ldr d29, [x26, x10]\n"
+ "ldr d28, [x25, x24]\n"
+ "ldr d27, [x5, x24]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "ldr d28, [x13, x10]\n"
"usubl v29.8h, v29.8b, v9.8b\n"
- "ldr d27, [x24, x10]\n"
- "ldr d23, [x23, x10]\n"
+ "ldr d23, [x2, x24]\n"
+ "ldr d25, [x27, x24]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "ldr d25, [x22, x10]\n"
- "ldr d24, [x21, x10]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
+ "ldr d24, [x21, x24]\n"
+ "ldr d26, [x12, x24]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "ldr d26, [x20, x10]\n"
- "ldr d22, [x0, x10]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
+ "ldr d22, [x19, x24]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
"usubl v26.8h, v26.8b, v9.8b\n"
"usubl v22.8h, v22.8b, v9.8b\n"
"bgt 1b\n"
"2:" // Tail
"smlal v15.4s, v31.4h, v0.4h\n"
- "ldr x20, [x25, #0x50]\n"
- "tst x4, #0x7\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr x28, [x25, #0x58]\n"
- "smlal v16.4s, v30.4h, v0.4h\n"
- "ldr x0, [x25, #0x60]\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "ldr d31, [x20, x10]\n"
- "smlal v7.4s, v29.4h, v0.4h\n"
- "ldr x7, [x25, #0x68]\n"
- "smlal2 v17.4s, v29.8h, v0.8h\n"
- "ldr x26, [x25, #0x70]\n"
- "smlal v8.4s, v28.4h, v0.4h\n"
- "ldr x23, [x25, #0x78]\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "ldr d0, [x3, #0x28]\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "ldr x19, [x20, #0x50]\n"
+ "ldr d31, [x19, x24]\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal v10.4s, v29.4h, v0.4h\n"
+ "ldr x15, [x20, #0x58]\n"
+ "usubl v31.8h, v31.8b, v9.8b\n"
+ "smlal v6.4s, v28.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "ldr x19, [x20, #0x60]\n"
+ "ldr x27, [x20, #0x68]\n"
+ "smlal2 v7.4s, v29.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "ldr x20, [x25, #0x80]\n"
- "smlal2 v18.4s, v30.8h, v1.8h\n"
- "ldr d30, [x28, x10]\n"
- "smlal v16.4s, v27.4h, v1.4h\n"
- "ldr x22, [x25, #0x88]\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "ldr x13, [x25, #0x90]\n"
- "smlal v7.4s, v28.4h, v1.4h\n"
- "ldr x21, [x25, #0x98]\n"
- "smlal2 v17.4s, v28.8h, v1.8h\n"
- "ldr x14, [x25, #0xa0]\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
- "ldr x11, [x25, #0xa8]\n"
- "smlal2 v5.4s, v23.8h, v1.8h\n"
- "ldr d1, [x3, #0x30]\n"
+ "ldr x5, [x20, #0x70]\n"
+ "ldr x11, [x20, #0x78]\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "ldr d30, [x15, x24]\n"
+ "usubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
+ "ldr d0, [x23, #0x28]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v27.8h, v1.8h\n"
+ "ldr x12, [x20, #0x80]\n"
+ "ldr x26, [x20, #0x88]\n"
+ "smlal2 v7.4s, v28.8h, v1.8h\n"
"smlal v15.4s, v27.4h, v2.4h\n"
- "ldr x24, [x25, #0xb0]\n"
- "smlal2 v18.4s, v27.8h, v2.8h\n"
- "ldr d27, [x0, x10]\n"
- "smlal v16.4s, v25.4h, v2.4h\n"
- "ldr x0, [x25, #0xb8]\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "ldr x15, [x25, #0xc0]\n"
- "smlal v7.4s, v23.4h, v2.4h\n"
- "ldr x9, [x25, #0xc8]\n"
- "smlal2 v17.4s, v23.8h, v2.8h\n"
- "ldr x27, [x25, #0xd0]\n"
- "usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x28, [x25, #0xd8]\n"
+ "ldr x14, [x20, #0x90]\n"
+ "ldr x15, [x20, #0x98]\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal2 v5.4s, v23.8h, v1.8h\n"
+ "ldr d27, [x19, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal v10.4s, v23.4h, v2.4h\n"
+ "ldr d1, [x23, #0x30]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "ldr x21, [x20, #0xa0]\n"
+ "ldr x2, [x20, #0xa8]\n"
+ "smlal2 v7.4s, v23.8h, v2.8h\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "ldr x12, [x25, #0xe0]\n"
- "smlal2 v18.4s, v25.8h, v3.8h\n"
- "ldr d25, [x7, x10]\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
- "ldr x7, [x25, #0xe8]\n"
+ "ldr x13, [x20, #0xb0]\n"
+ "ldr x9, [x20, #0xb8]\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
"smlal2 v5.4s, v31.8h, v2.8h\n"
- "ldr d2, [x3, #0x38]\n"
- "smlal v16.4s, v24.4h, v3.4h\n"
- "ldr q6, [x2, #0x0]\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "ldr q19, [x5, #0x0]\n"
- "smlal v7.4s, v31.4h, v3.4h\n"
- "ldr q20, [x2, #0x10]\n"
- "add x2, x2, #0x20\n"
- "smlal2 v17.4s, v31.8h, v3.8h\n"
- "ldr q12, [x5, #0x10]\n"
- "add x5, x5, #0x20\n"
- "usubl v30.8h, v30.8b, v9.8b\n"
+ "ldr d25, [x27, x24]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v10.4s, v31.4h, v3.4h\n"
+ "ldr d2, [x23, #0x38]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "ldr x19, [x20, #0xc0]\n"
+ "ldr x28, [x20, #0xc8]\n"
+ "smlal2 v7.4s, v31.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v18.4s, v24.8h, v4.8h\n"
- "ldr d24, [x26, x10]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ldr x26, [x25, #0xf0]\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
+ "ldr x6, [x20, #0xd0]\n"
+ "ldr x27, [x20, #0xd8]\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
"smlal2 v5.4s, v30.8h, v3.8h\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "ldr d27, [x23, x10]\n"
- "smlal v7.4s, v30.4h, v4.4h\n"
- "ldr x23, [x25, #0xf8]\n"
- "smlal2 v17.4s, v30.8h, v4.8h\n"
- "smlal v8.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0x48]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v15.4s, v29.4h, v0.4h\n"
- "smlal2 v18.4s, v29.8h, v0.8h\n"
- "smlal v16.4s, v28.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "smlal v7.4s, v22.4h, v0.4h\n"
- "smlal2 v17.4s, v22.8h, v0.8h\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v15.4s, v28.4h, v1.4h\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
- "ldr d28, [x22, x10]\n"
- "smlal v16.4s, v23.4h, v1.4h\n"
- "ldr x22, [x25, #0x100]\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "smlal v7.4s, v25.4h, v1.4h\n"
- "smlal2 v17.4s, v25.8h, v1.8h\n"
+ "ldr d24, [x5, x24]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v15.4s, v23.4h, v2.4h\n"
- "smlal2 v18.4s, v23.8h, v2.8h\n"
- "ldr d23, [x20, x10]\n"
- "smlal v16.4s, v31.4h, v2.4h\n"
- "ldr x20, [x25, #0x108]\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v7.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
- "ldr d2, [x3, #0x60]\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal v10.4s, v30.4h, v4.4h\n"
+ "ldr d3, [x23, #0x40]\n"
"ssubl v3.8h, v3.8b, v14.8b\n"
- "usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v27.8h, v4.8h\n"
+ "ldr d27, [x11, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v7.4s, v30.8h, v4.8h\n"
+ "smlal v15.4s, v29.4h, v0.4h\n"
+ "ldr x11, [x20, #0xe0]\n"
+ "ldr x17, [x20, #0xe8]\n"
+ "smlal2 v16.4s, v29.8h, v0.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x23, #0x48]\n"
"ssubl v4.8h, v4.8b, v14.8b\n"
- "smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "ldr d31, [x13, x10]\n"
- "smlal v16.4s, v30.4h, v3.4h\n"
- "ldr x13, [x25, #0x110]\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "smlal v7.4s, v27.4h, v3.4h\n"
- "smlal2 v17.4s, v27.8h, v3.8h\n"
- "smlal v8.4s, v23.4h, v3.4h\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v18.4s, v30.8h, v4.8h\n"
- "ldr d30, [x21, x10]\n"
- "smlal v16.4s, v26.4h, v4.4h\n"
- "ldr x21, [x25, #0x118]\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "ldr d26, [x14, x10]\n"
- "smlal v7.4s, v23.4h, v4.4h\n"
- "smlal2 v17.4s, v23.8h, v4.8h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal v10.4s, v22.4h, v0.4h\n"
+ "ldr x5, [x20, #0xf0]\n"
+ "ldr x25, [x20, #0xf8]\n"
+ "smlal v6.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v28.8h, v0.8h\n"
+ "ldr q12, [x10, #0x0]\n"
+ "ldr q19, [x1, #0x0]\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "smlal v15.4s, v28.4h, v1.4h\n"
+ "ldr q20, [x10, #0x10]\n"
+ "ldr q29, [x1, #0x10]\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "ldr d28, [x26, x24]\n"
+ "ldr d0, [x23, #0x50]\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "smlal v10.4s, v25.4h, v1.4h\n"
"usubl v28.8h, v28.8b, v9.8b\n"
+ "ldr x26, [x20, #0x100]\n"
+ "smlal v6.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
"ssubl v0.8h, v0.8b, v14.8b\n"
+ "tst x0, #0x7\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "smlal v15.4s, v23.4h, v2.4h\n"
+ "add x10, x10, #0x20\n"
+ "add x1, x1, #0x20\n"
+ "smlal2 v16.4s, v23.8h, v2.8h\n"
+ "ldr d23, [x12, x24]\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "smlal v10.4s, v24.4h, v2.4h\n"
+ "ldr d1, [x23, #0x58]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "ldr x12, [x20, #0x108]\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "smlal v15.4s, v31.4h, v3.4h\n"
+ "smlal2 v16.4s, v31.8h, v3.8h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "ldr d31, [x14, x24]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v15.4s, v22.4h, v0.4h\n"
- "smlal2 v18.4s, v22.8h, v0.8h\n"
- "ldr d22, [x0, x10]\n"
- "smlal v16.4s, v25.4h, v0.4h\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "smlal v7.4s, v31.4h, v0.4h\n"
- "smlal2 v17.4s, v31.8h, v0.8h\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "ldr d2, [x23, #0x60]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "ldr x14, [x20, #0x110]\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "smlal v15.4s, v30.4h, v4.4h\n"
+ "smlal2 v16.4s, v30.8h, v4.8h\n"
+ "ldr d30, [x15, x24]\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal v10.4s, v23.4h, v4.4h\n"
+ "ldr d3, [x23, #0x68]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "ldr d26, [x21, x24]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v0.4h\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
- "ldr d0, [x3, #0x78]\n"
+ "smlal2 v7.4s, v23.8h, v4.8h\n"
+ "smlal v15.4s, v22.4h, v0.4h\n"
+ "ldr x21, [x20, #0x118]\n"
+ "smlal2 v16.4s, v22.8h, v0.8h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "ldr d4, [x23, #0x70]\n"
+ "ldr d22, [x9, x24]\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "smlal v10.4s, v31.4h, v0.4h\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "usubl v22.8h, v22.8b, v9.8b\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
"smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v18.4s, v25.8h, v1.8h\n"
- "ldr d25, [x11, x10]\n"
- "smlal v16.4s, v24.4h, v1.4h\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "smlal v7.4s, v30.4h, v1.4h\n"
- "smlal2 v17.4s, v30.8h, v1.8h\n"
- "smlal v8.4s, v26.4h, v1.4h\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
- "ldr d1, [x3, #0x80]\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal2 v16.4s, v25.8h, v1.8h\n"
+ "ldr d25, [x2, x24]\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal v10.4s, v30.4h, v1.4h\n"
+ "ldr d0, [x23, #0x78]\n"
+ "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v18.4s, v24.8h, v2.8h\n"
- "ldr d24, [x24, x10]\n"
- "smlal v16.4s, v27.4h, v2.4h\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "smlal v7.4s, v26.4h, v2.4h\n"
- "smlal2 v17.4s, v26.8h, v2.8h\n"
- "smlal v8.4s, v25.4h, v2.4h\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v18.4s, v27.8h, v3.8h\n"
- "ldr d27, [x15, x10]\n"
- "smlal v16.4s, v23.4h, v3.4h\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "smlal v7.4s, v25.4h, v3.4h\n"
- "smlal2 v17.4s, v25.8h, v3.8h\n"
+ "smlal2 v16.4s, v24.8h, v2.8h\n"
+ "ldr d24, [x13, x24]\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "usubl v22.8h, v22.8b, v9.8b\n"
- "smlal v8.4s, v24.4h, v3.4h\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v18.4s, v23.8h, v4.8h\n"
- "ldr d23, [x9, x10]\n"
- "smlal v16.4s, v28.4h, v4.4h\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "ldr d28, [x12, x10]\n"
- "smlal v7.4s, v24.4h, v4.4h\n"
- "smlal2 v17.4s, v24.8h, v4.8h\n"
- "smlal v8.4s, v22.4h, v4.4h\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
- "ldr d4, [x3, #0x98]\n"
- "ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal v10.4s, v26.4h, v2.4h\n"
+ "ldr d1, [x23, #0x80]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "smlal v15.4s, v27.4h, v3.4h\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "ldr d27, [x19, x24]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "ldr d2, [x23, #0x88]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v8.4s, v23.8h, v3.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "smlal v15.4s, v23.4h, v4.4h\n"
+ "smlal2 v16.4s, v23.8h, v4.8h\n"
+ "ldr d23, [x28, x24]\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
"usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "ldr d3, [x23, #0x90]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v22.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "ldr d28, [x11, x24]\n"
+ "usubl v28.8h, v28.8b, v9.8b\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
"smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "ldr d31, [x27, x10]\n"
- "smlal v16.4s, v30.4h, v0.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "smlal v7.4s, v27.4h, v0.4h\n"
- "smlal2 v17.4s, v27.8h, v0.8h\n"
- "smlal v8.4s, v23.4h, v0.4h\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
- "ldr d0, [x3, #0xa0]\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "ldr d31, [x6, x24]\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal v10.4s, v27.4h, v0.4h\n"
+ "ldr d4, [x23, #0x98]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v18.4s, v30.8h, v1.8h\n"
- "ldr d30, [x28, x10]\n"
- "smlal v16.4s, v26.4h, v1.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "smlal v7.4s, v23.4h, v1.4h\n"
- "smlal2 v17.4s, v23.8h, v1.8h\n"
- "smlal v8.4s, v31.4h, v1.4h\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v18.4s, v26.8h, v2.8h\n"
- "ldr d26, [x7, x10]\n"
- "smlal v16.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v7.4s, v31.4h, v2.4h\n"
- "smlal2 v17.4s, v31.8h, v2.8h\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "ldr d30, [x27, x24]\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "usubl v28.8h, v28.8b, v9.8b\n"
- "smlal v8.4s, v30.4h, v2.4h\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v18.4s, v25.8h, v3.8h\n"
- "ldr d25, [x26, x10]\n"
- "smlal v16.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "smlal v7.4s, v30.4h, v3.4h\n"
- "smlal2 v17.4s, v30.8h, v3.8h\n"
- "smlal v8.4s, v28.4h, v3.4h\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
- "ldr d3, [x3, #0xb8]\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
- "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "ldr d0, [x23, #0xa0]\n"
"ssubl v0.8h, v0.8b, v14.8b\n"
+ "smlal v6.4s, v31.4h, v1.4h\n"
+ "smlal2 v8.4s, v26.8h, v1.8h\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "smlal v15.4s, v26.4h, v2.4h\n"
+ "smlal2 v16.4s, v26.8h, v2.8h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "ldr d26, [x17, x24]\n"
+ "usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "ldr d1, [x23, #0xa8]\n"
+ "ssubl v1.8h, v1.8b, v14.8b\n"
+ "smlal v6.4s, v30.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "ldr d25, [x5, x24]\n"
+ "usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "ldr d2, [x23, #0xb0]\n"
+ "ssubl v2.8h, v2.8b, v14.8b\n"
+ "smlal v6.4s, v28.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v18.4s, v24.8h, v4.8h\n"
- "ldr d24, [x23, x10]\n"
- "smlal v16.4s, v22.4h, v4.4h\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "smlal v7.4s, v28.4h, v4.4h\n"
- "smlal2 v17.4s, v28.8h, v4.8h\n"
- "smlal v8.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
- "ldr d4, [x3, #0xc0]\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "ldr d24, [x25, x24]\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "ldr d3, [x23, #0xb8]\n"
+ "ssubl v3.8h, v3.8b, v14.8b\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v7.4s, v28.8h, v4.8h\n"
"smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v18.4s, v27.8h, v0.8h\n"
- "ldr d27, [x22, x10]\n"
- "smlal v16.4s, v23.4h, v0.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
+ "smlal2 v16.4s, v27.8h, v0.8h\n"
+ "ldr d27, [x26, x24]\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v8.4s, v22.8h, v4.8h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d4, [x23, #0xc0]\n"
+ "ssubl v4.8h, v4.8b, v14.8b\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal2 v7.4s, v25.8h, v0.8h\n"
+ "ldr d25, [x12, x24]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v1.8h, v1.8b, v14.8b\n"
- "smlal v7.4s, v25.4h, v0.4h\n"
- "smlal2 v17.4s, v25.8h, v0.8h\n"
- "ldr d25, [x20, x10]\n"
- "smlal v8.4s, v24.4h, v0.4h\n"
+ "smlal2 v8.4s, v23.8h, v0.8h\n"
"smlal2 v5.4s, v24.8h, v0.8h\n"
"smlal v15.4s, v23.4h, v1.4h\n"
- "smlal2 v18.4s, v23.8h, v1.8h\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "smlal v7.4s, v24.4h, v1.4h\n"
- "smlal2 v17.4s, v24.8h, v1.8h\n"
- "ldr d24, [x13, x10]\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "ssubl v2.8h, v2.8b, v14.8b\n"
- "usubl v25.8h, v25.8b, v9.8b\n"
- "smlal v8.4s, v27.4h, v1.4h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "ldr d24, [x14, x24]\n"
+ "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal2 v8.4s, v31.8h, v1.8h\n"
"smlal2 v5.4s, v27.8h, v1.8h\n"
"smlal v15.4s, v31.4h, v2.4h\n"
- "smlal2 v18.4s, v31.8h, v2.8h\n"
- "smlal v16.4s, v30.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "smlal v7.4s, v27.4h, v2.4h\n"
- "smlal2 v17.4s, v27.8h, v2.8h\n"
- "ldr d27, [x21, x10]\n"
- "add x10, x10, #0x8\n"
- "smlal v8.4s, v25.4h, v2.4h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "ldr d27, [x21, x24]\n"
+ "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal2 v8.4s, v30.8h, v2.8h\n"
"smlal2 v5.4s, v25.8h, v2.8h\n"
- "ssubl v3.8h, v3.8b, v14.8b\n"
- "usubl v24.8h, v24.8b, v9.8b\n"
- "ssubl v4.8h, v4.8b, v14.8b\n"
+ "add x24, x24, #0x8\n"
"smlal v15.4s, v30.4h, v3.4h\n"
- "smlal2 v18.4s, v30.8h, v3.8h\n"
- "smlal v16.4s, v28.4h, v3.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "smlal v7.4s, v25.4h, v3.4h\n"
- "smlal2 v17.4s, v25.8h, v3.8h\n"
- "smlal v8.4s, v24.4h, v3.4h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v16.4s, v30.8h, v3.8h\n"
+ "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
"smlal2 v5.4s, v24.8h, v3.8h\n"
"smlal v15.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "smlal v16.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "smlal v7.4s, v24.4h, v4.4h\n"
- "smlal2 v17.4s, v24.8h, v4.8h\n"
- "usubl v27.8h, v27.8b, v9.8b\n"
- "sqrdmulh v15.4s, v15.4s, v6.4s\n"
- "sqrdmulh v18.4s, v18.4s, v20.4s\n"
- "smlal v8.4s, v27.4h, v4.4h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "sqdmulh v15.4s, v15.4s, v12.4s\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
+ "sqdmulh v17.4s, v17.4s, v12.4s\n"
+ "smlal2 v16.4s, v28.8h, v4.8h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "sqdmulh v10.4s, v10.4s, v12.4s\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
"smlal2 v5.4s, v27.8h, v4.8h\n"
- "and v28.16b, v15.16b, v19.16b\n"
- "and v26.16b, v18.16b, v12.16b\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqdmulh v6.4s, v6.4s, v12.4s\n"
+ "and v23.16b, v15.16b, v19.16b\n"
+ "sqdmulh v16.4s, v16.4s, v20.4s\n"
+ "and v22.16b, v17.16b, v19.16b\n"
+ "sqdmulh v8.4s, v8.4s, v20.4s\n"
+ "and v21.16b, v10.16b, v19.16b\n"
+ "sqdmulh v7.4s, v7.4s, v20.4s\n"
+ "and v26.16b, v6.16b, v19.16b\n"
+ "sqdmulh v5.4s, v5.4s, v20.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v2.16b, v8.16b, v29.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v7.16b, v29.16b\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "sqrdmulh v21.4s, v21.4s, v20.4s\n"
- "sqadd v15.4s, v15.4s, v28.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "and v29.16b, v16.16b, v19.16b\n"
- "and v4.16b, v21.16b, v12.16b\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "srshl v18.4s, v18.4s, v12.4s\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
+ "and v25.16b, v5.16b, v29.16b\n"
+ "sqadd v15.4s, v15.4s, v23.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "add v15.4s, v15.4s, v10.4s\n"
- "add v18.4s, v18.4s, v10.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "smin v15.4s, v15.4s, v13.4s\n"
- "smin v18.4s, v18.4s, v13.4s\n"
- "sqadd v21.4s, v21.4s, v4.4s\n"
- "smax v15.4s, v15.4s, v11.4s\n"
- "smax v18.4s, v18.4s, v11.4s\n"
- "srshl v16.4s, v16.4s, v19.4s\n"
- "srshl v21.4s, v21.4s, v12.4s\n"
- "uzp1 v15.16b, v15.16b, v18.16b\n"
- "sqrdmulh v7.4s, v7.4s, v6.4s\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "str d15, [x17, x1]\n"
- "add v16.4s, v16.4s, v10.4s\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "and v25.16b, v7.16b, v19.16b\n"
- "sqrdmulh v17.4s, v17.4s, v20.4s\n"
- "smin v16.4s, v16.4s, v13.4s\n"
- "smin v21.4s, v21.4s, v13.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
"sshr v25.4s, v25.4s, #0x1f\n"
- "smax v16.4s, v16.4s, v11.4s\n"
- "smax v21.4s, v21.4s, v11.4s\n"
- "sqadd v7.4s, v7.4s, v25.4s\n"
- "and v31.16b, v17.16b, v12.16b\n"
- "uzp1 v16.16b, v16.16b, v21.16b\n"
- "sqrdmulh v8.4s, v8.4s, v6.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "str d16, [x16, x1]\n"
- "srshl v7.4s, v7.4s, v19.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "and v24.16b, v8.16b, v19.16b\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "sqadd v17.4s, v17.4s, v31.4s\n"
- "add v7.4s, v7.4s, v10.4s\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "and v1.16b, v5.16b, v12.16b\n"
- "smin v7.4s, v7.4s, v13.4s\n"
- "srshl v17.4s, v17.4s, v12.4s\n"
- "sqadd v8.4s, v8.4s, v24.4s\n"
- "smax v7.4s, v7.4s, v11.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "add v17.4s, v17.4s, v10.4s\n"
- "srshl v8.4s, v8.4s, v19.4s\n"
- "sqadd v5.4s, v5.4s, v1.4s\n"
- "smin v17.4s, v17.4s, v13.4s\n"
- "add v8.4s, v8.4s, v10.4s\n"
- "smax v17.4s, v17.4s, v11.4s\n"
- "srshl v5.4s, v5.4s, v12.4s\n"
- "smin v8.4s, v8.4s, v13.4s\n"
- "uzp1 v7.16b, v7.16b, v17.16b\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "str d7, [x6, x1]\n"
- "smax v8.4s, v8.4s, v11.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "smax v5.4s, v5.4s, v11.4s\n"
- "uzp1 v8.16b, v8.16b, v5.16b\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "str d8, [x8, x1]\n"
- "add x1, x1, #0x8\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v17.4s, v17.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v2.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v7.4s, v7.4s, v3.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "sqadd v5.4s, v5.4s, v25.4s\n"
+ "srshl v16.4s, v16.4s, v29.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v7.4s, v7.4s, v29.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v15.8h, v16.4s\n"
+ "sqxtn2 v17.8h, v8.4s\n"
+ "sqxtn2 v10.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v15.8h, v15.8h, v18.8h\n"
+ "sqadd v17.8h, v17.8h, v18.8h\n"
+ "sqadd v10.8h, v10.8h, v18.8h\n"
+ "sqadd v6.8h, v6.8h, v18.8h\n"
+ "smax v15.8h, v15.8h, v11.8h\n"
+ "smax v17.8h, v17.8h, v11.8h\n"
+ "smax v10.8h, v10.8h, v11.8h\n"
+ "smax v6.8h, v6.8h, v11.8h\n"
+ "smin v15.8h, v15.8h, v13.8h\n"
+ "smin v17.8h, v17.8h, v13.8h\n"
+ "smin v10.8h, v10.8h, v13.8h\n"
+ "smin v6.8h, v6.8h, v13.8h\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "str d15, [x16, x22]\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "str d17, [x8, x22]\n"
+ "str d10, [x4, x22]\n"
+ "str d6, [x7, x22]\n"
+ "add x22, x22, #0x8\n"
"beq 124f\n"
- "add x3, x3, #0xc8\n"
+ "add x23, x23, #0xc8\n"
"3:" // Oddments
- "ldr x12, [%x[params], %[offsetof_Params_bias]]\n"
- "tbz x4, #2, 5f\n"
- "ld1 { v15.4s }, [x12], #0x10\n"
- "tbz x4, #1, 4f\n"
- "ld1 { v18.d }[0], [x12], #0x8\n"
- "tbz x4, #0, 7f\n"
- "ld1 { v18.s }[2], [x12]\n"
+ "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+ "tbz x0, #2, 5f\n"
+ "ld1 { v15.4s }, [x19], #0x10\n"
+ "tbz x0, #1, 4f\n"
+ "ld1 { v16.d }[0], [x19], #0x8\n"
+ "tbz x0, #0, 7f\n"
+ "ld1 { v16.s }[2], [x19]\n"
"b 7f\n"
"4:" // Oddments: Load bias: Bit 2: Bit 1: Unset
- "tbz x4, #0, 7f\n"
- "ld1 { v18.s }[0], [x12]\n"
+ "tbz x0, #0, 7f\n"
+ "ld1 { v16.s }[0], [x19]\n"
"b 7f\n"
"5:" // Oddments: Load bias: Bit 2: Unset
- "tbz x4, #1, 6f\n"
- "ld1 { v15.d }[0], [x12], #0x8\n"
- "tbz x4, #0, 7f\n"
- "ld1 { v15.s }[2], [x12]\n"
+ "tbz x0, #1, 6f\n"
+ "ld1 { v15.d }[0], [x19], #0x8\n"
+ "tbz x0, #0, 7f\n"
+ "ld1 { v15.s }[2], [x19]\n"
"b 7f\n"
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 7f\n"
- "ld1 { v15.s }[0], [x12]\n"
+ "tbz x0, #0, 7f\n"
+ "ld1 { v15.s }[0], [x19]\n"
"7:" // Oddments: Load bias: Bit 2: End
- "mov v16.16b, v15.16b\n"
- "ldr d0, [x3, #0x0]\n"
- "mov v21.16b, v18.16b\n"
- "ldr d1, [x3, #0x8]\n"
- "mov v7.16b, v15.16b\n"
- "ldr d2, [x3, #0x10]\n"
- "mov v17.16b, v18.16b\n"
- "ldr d3, [x3, #0x18]\n"
- "mov v8.16b, v15.16b\n"
- "ldr d4, [x3, #0x20]\n"
- "mov v5.16b, v18.16b\n"
- "ldp x28, x27, [x25, #0x0]\n"
+ "ldr d0, [x23, #0x0]\n"
+ "ldr d1, [x23, #0x8]\n"
+ "mov v17.16b, v15.16b\n"
+ "mov v8.16b, v16.16b\n"
+ "ldr d2, [x23, #0x10]\n"
+ "ldr d3, [x23, #0x18]\n"
+ "mov v10.16b, v15.16b\n"
+ "mov v7.16b, v16.16b\n"
+ "ldr d4, [x23, #0x20]\n"
+ "ldp x28, x6, [x20, #0x0]\n"
+ "mov v6.16b, v15.16b\n"
+ "mov v5.16b, v16.16b\n"
+ "ldp x26, x25, [x20, #0x10]\n"
+ "ldp x5, x2, [x20, #0x20]\n"
"ssubl v0.8h, v0.8b, v14.8b\n"
- "ldp x26, x13, [x25, #0x10]\n"
"ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldp x27, x21, [x20, #0x30]\n"
+ "ldp x12, x19, [x20, #0x40]\n"
"ssubl v2.8h, v2.8b, v14.8b\n"
- "ldp x24, x23, [x25, #0x20]\n"
"ssubl v3.8h, v3.8b, v14.8b\n"
"ssubl v4.8h, v4.8b, v14.8b\n"
- "ldp x22, x21, [x25, #0x30]\n"
- "ldp x20, x0, [x25, #0x40]\n"
- "add x28, x28, x10\n"
- "add x27, x27, x10\n"
- "add x26, x26, x10\n"
- "add x13, x13, x10\n"
- "add x24, x24, x10\n"
- "add x23, x23, x10\n"
- "add x22, x22, x10\n"
- "add x21, x21, x10\n"
- "add x20, x20, x10\n"
- "add x0, x0, x10\n"
- "tbz x4, #2, 9f\n"
+ "add x28, x28, x24\n"
+ "add x6, x6, x24\n"
+ "add x26, x26, x24\n"
+ "add x25, x25, x24\n"
+ "add x5, x5, x24\n"
+ "add x2, x2, x24\n"
+ "add x27, x27, x24\n"
+ "add x21, x21, x24\n"
+ "add x12, x12, x24\n"
+ "add x19, x19, x24\n"
+ "tbz x0, #2, 9f\n"
"ld1 { v31.s }[0], [x28], #0x4\n"
- "ld1 { v30.s }[0], [x27], #0x4\n"
+ "ld1 { v30.s }[0], [x6], #0x4\n"
"ld1 { v29.s }[0], [x26], #0x4\n"
- "ld1 { v28.s }[0], [x13], #0x4\n"
- "ld1 { v27.s }[0], [x24], #0x4\n"
- "ld1 { v23.s }[0], [x23], #0x4\n"
- "ld1 { v25.s }[0], [x22], #0x4\n"
+ "ld1 { v28.s }[0], [x25], #0x4\n"
+ "ld1 { v27.s }[0], [x5], #0x4\n"
+ "ld1 { v23.s }[0], [x2], #0x4\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
"ld1 { v24.s }[0], [x21], #0x4\n"
- "ld1 { v26.s }[0], [x20], #0x4\n"
- "ld1 { v22.s }[0], [x0], #0x4\n"
- "tbz x4, #1, 8f\n"
+ "ld1 { v26.s }[0], [x12], #0x4\n"
+ "ld1 { v22.s }[0], [x19], #0x4\n"
+ "tbz x0, #1, 8f\n"
"ld1 { v31.h }[2], [x28], #0x2\n"
- "ld1 { v30.h }[2], [x27], #0x2\n"
+ "ld1 { v30.h }[2], [x6], #0x2\n"
"ld1 { v29.h }[2], [x26], #0x2\n"
- "ld1 { v28.h }[2], [x13], #0x2\n"
- "ld1 { v27.h }[2], [x24], #0x2\n"
- "ld1 { v23.h }[2], [x23], #0x2\n"
- "ld1 { v25.h }[2], [x22], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v27.h }[2], [x5], #0x2\n"
+ "ld1 { v23.h }[2], [x2], #0x2\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
"ld1 { v24.h }[2], [x21], #0x2\n"
- "ld1 { v26.h }[2], [x20], #0x2\n"
- "ld1 { v22.h }[2], [x0], #0x2\n"
- "tbz x4, #0, 11f\n"
+ "ld1 { v26.h }[2], [x12], #0x2\n"
+ "ld1 { v22.h }[2], [x19], #0x2\n"
+ "tbz x0, #0, 11f\n"
"ld1 { v31.b }[6], [x28]\n"
- "ld1 { v30.b }[6], [x27]\n"
+ "ld1 { v30.b }[6], [x6]\n"
"ld1 { v29.b }[6], [x26]\n"
- "ld1 { v28.b }[6], [x13]\n"
- "ld1 { v27.b }[6], [x24]\n"
- "ld1 { v23.b }[6], [x23]\n"
- "ld1 { v25.b }[6], [x22]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v27.b }[6], [x5]\n"
+ "ld1 { v23.b }[6], [x2]\n"
+ "ld1 { v25.b }[6], [x27]\n"
"ld1 { v24.b }[6], [x21]\n"
- "ld1 { v26.b }[6], [x20]\n"
- "ld1 { v22.b }[6], [x0]\n"
+ "ld1 { v26.b }[6], [x12]\n"
+ "ld1 { v22.b }[6], [x19]\n"
"b 11f\n"
"8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset
- "tbz x4, #0, 11f\n"
+ "tbz x0, #0, 11f\n"
"ld1 { v31.b }[4], [x28]\n"
- "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v30.b }[4], [x6]\n"
"ld1 { v29.b }[4], [x26]\n"
- "ld1 { v28.b }[4], [x13]\n"
- "ld1 { v27.b }[4], [x24]\n"
- "ld1 { v23.b }[4], [x23]\n"
- "ld1 { v25.b }[4], [x22]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v27.b }[4], [x5]\n"
+ "ld1 { v23.b }[4], [x2]\n"
+ "ld1 { v25.b }[4], [x27]\n"
"ld1 { v24.b }[4], [x21]\n"
- "ld1 { v26.b }[4], [x20]\n"
- "ld1 { v22.b }[4], [x0]\n"
+ "ld1 { v26.b }[4], [x12]\n"
+ "ld1 { v22.b }[4], [x19]\n"
"b 11f\n"
"9:" // Oddments: Initial loads: Bit 2: Unset
- "tbz x4, #1, 10f\n"
+ "tbz x0, #1, 10f\n"
"ld1 { v31.h }[0], [x28], #0x2\n"
- "ld1 { v30.h }[0], [x27], #0x2\n"
+ "ld1 { v30.h }[0], [x6], #0x2\n"
"ld1 { v29.h }[0], [x26], #0x2\n"
- "ld1 { v28.h }[0], [x13], #0x2\n"
- "ld1 { v27.h }[0], [x24], #0x2\n"
- "ld1 { v23.h }[0], [x23], #0x2\n"
- "ld1 { v25.h }[0], [x22], #0x2\n"
+ "ld1 { v28.h }[0], [x25], #0x2\n"
+ "ld1 { v27.h }[0], [x5], #0x2\n"
+ "ld1 { v23.h }[0], [x2], #0x2\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
"ld1 { v24.h }[0], [x21], #0x2\n"
- "ld1 { v26.h }[0], [x20], #0x2\n"
- "ld1 { v22.h }[0], [x0], #0x2\n"
- "tbz x4, #0, 11f\n"
+ "ld1 { v26.h }[0], [x12], #0x2\n"
+ "ld1 { v22.h }[0], [x19], #0x2\n"
+ "tbz x0, #0, 11f\n"
"ld1 { v31.b }[2], [x28]\n"
- "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v30.b }[2], [x6]\n"
"ld1 { v29.b }[2], [x26]\n"
- "ld1 { v28.b }[2], [x13]\n"
- "ld1 { v27.b }[2], [x24]\n"
- "ld1 { v23.b }[2], [x23]\n"
- "ld1 { v25.b }[2], [x22]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v27.b }[2], [x5]\n"
+ "ld1 { v23.b }[2], [x2]\n"
+ "ld1 { v25.b }[2], [x27]\n"
"ld1 { v24.b }[2], [x21]\n"
- "ld1 { v26.b }[2], [x20]\n"
- "ld1 { v22.b }[2], [x0]\n"
+ "ld1 { v26.b }[2], [x12]\n"
+ "ld1 { v22.b }[2], [x19]\n"
"b 11f\n"
"10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 11f\n"
+ "tbz x0, #0, 11f\n"
"ld1 { v31.b }[0], [x28]\n"
- "ld1 { v30.b }[0], [x27]\n"
+ "ld1 { v30.b }[0], [x6]\n"
"ld1 { v29.b }[0], [x26]\n"
- "ld1 { v28.b }[0], [x13]\n"
- "ld1 { v27.b }[0], [x24]\n"
- "ld1 { v23.b }[0], [x23]\n"
- "ld1 { v25.b }[0], [x22]\n"
+ "ld1 { v28.b }[0], [x25]\n"
+ "ld1 { v27.b }[0], [x5]\n"
+ "ld1 { v23.b }[0], [x2]\n"
+ "ld1 { v25.b }[0], [x27]\n"
"ld1 { v24.b }[0], [x21]\n"
- "ld1 { v26.b }[0], [x20]\n"
- "ld1 { v22.b }[0], [x0]\n"
+ "ld1 { v26.b }[0], [x12]\n"
+ "ld1 { v22.b }[0], [x19]\n"
"11:" // Oddments: Initial loads: Bit 2: End
"usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x20, [x25, #0x50]\n"
- "add x20, x20, x10\n"
"usubl v30.8h, v30.8b, v9.8b\n"
+ "smlal v15.4s, v31.4h, v0.4h\n"
+ "ldr x19, [x20, #0x50]\n"
"usubl v29.8h, v29.8b, v9.8b\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "smlal v10.4s, v29.4h, v0.4h\n"
"usubl v28.8h, v28.8b, v9.8b\n"
+ "add x19, x19, x24\n"
+ "smlal2 v7.4s, v29.8h, v0.8h\n"
"usubl v27.8h, v27.8b, v9.8b\n"
+ "smlal v6.4s, v28.4h, v0.4h\n"
+ "smlal2 v5.4s, v28.8h, v0.8h\n"
+ "smlal v15.4s, v30.4h, v1.4h\n"
"usubl v23.8h, v23.8b, v9.8b\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "smlal v17.4s, v27.4h, v1.4h\n"
"usubl v25.8h, v25.8b, v9.8b\n"
+ "smlal2 v8.4s, v27.8h, v1.8h\n"
+ "smlal v10.4s, v28.4h, v1.4h\n"
"usubl v24.8h, v24.8b, v9.8b\n"
+ "smlal2 v7.4s, v28.8h, v1.8h\n"
"usubl v26.8h, v26.8b, v9.8b\n"
+ "smlal v6.4s, v23.4h, v1.4h\n"
"usubl v22.8h, v22.8b, v9.8b\n"
- "smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "smlal v16.4s, v30.4h, v0.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "smlal v7.4s, v29.4h, v0.4h\n"
- "smlal2 v17.4s, v29.8h, v0.8h\n"
- "smlal v8.4s, v28.4h, v0.4h\n"
- "smlal2 v5.4s, v28.8h, v0.8h\n"
- "smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v18.4s, v30.8h, v1.8h\n"
- "smlal v16.4s, v27.4h, v1.4h\n"
- "smlal2 v21.4s, v27.8h, v1.8h\n"
- "smlal v7.4s, v28.4h, v1.4h\n"
- "smlal2 v17.4s, v28.8h, v1.8h\n"
- "smlal v8.4s, v23.4h, v1.4h\n"
"smlal2 v5.4s, v23.8h, v1.8h\n"
"smlal v15.4s, v27.4h, v2.4h\n"
- "smlal2 v18.4s, v27.8h, v2.8h\n"
- "smlal v16.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v7.4s, v23.4h, v2.4h\n"
- "smlal2 v17.4s, v23.8h, v2.8h\n"
- "tbz x4, #2, 13f\n"
- "ld1 { v31.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 12f\n"
- "ld1 { v31.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[6], [x20]\n"
+ "smlal2 v16.4s, v27.8h, v2.8h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "smlal v10.4s, v23.4h, v2.4h\n"
+ "smlal2 v7.4s, v23.8h, v2.8h\n"
+ "tbz x0, #2, 13f\n"
+ "ld1 { v31.s }[0], [x19], #0x4\n"
+ "tbz x0, #1, 12f\n"
+ "ld1 { v31.h }[2], [x19], #0x2\n"
+ "tbz x0, #0, 15f\n"
+ "ld1 { v31.b }[6], [x19]\n"
"b 15f\n"
"12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[4], [x20]\n"
+ "tbz x0, #0, 15f\n"
+ "ld1 { v31.b }[4], [x19]\n"
"b 15f\n"
"13:" // Oddments: Load (1, 3): Bit 2: Unset
- "tbz x4, #1, 14f\n"
- "ld1 { v31.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[2], [x20]\n"
+ "tbz x0, #1, 14f\n"
+ "ld1 { v31.h }[0], [x19], #0x2\n"
+ "tbz x0, #0, 15f\n"
+ "ld1 { v31.b }[2], [x19]\n"
"b 15f\n"
"14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 15f\n"
- "ld1 { v31.b }[0], [x20]\n"
+ "tbz x0, #0, 15f\n"
+ "ld1 { v31.b }[0], [x19]\n"
"15:" // Oddments: Load (1, 3): Bit 2: End
"usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x28, [x25, #0x58]\n"
- "smlal v15.4s, v25.4h, v3.4h\n"
- "add x28, x28, x10\n"
- "smlal v8.4s, v31.4h, v2.4h\n"
+ "ldr x15, [x20, #0x58]\n"
+ "smlal v6.4s, v31.4h, v2.4h\n"
"smlal2 v5.4s, v31.8h, v2.8h\n"
- "smlal2 v18.4s, v25.8h, v3.8h\n"
- "smlal v16.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "smlal v7.4s, v31.4h, v3.4h\n"
- "smlal2 v17.4s, v31.8h, v3.8h\n"
- "tbz x4, #2, 17f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "tbz x4, #1, 16f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "smlal v15.4s, v25.4h, v3.4h\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "add x15, x15, x24\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "smlal v10.4s, v31.4h, v3.4h\n"
+ "smlal2 v7.4s, v31.8h, v3.8h\n"
+ "tbz x0, #2, 17f\n"
+ "ld1 { v30.s }[0], [x15], #0x4\n"
+ "tbz x0, #1, 16f\n"
+ "ld1 { v30.h }[2], [x15], #0x2\n"
+ "tbz x0, #0, 19f\n"
+ "ld1 { v30.b }[6], [x15]\n"
"b 19f\n"
"16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "tbz x0, #0, 19f\n"
+ "ld1 { v30.b }[4], [x15]\n"
"b 19f\n"
"17:" // Oddments: Load (1, 4): Bit 2: Unset
- "tbz x4, #1, 18f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "tbz x0, #1, 18f\n"
+ "ld1 { v30.h }[0], [x15], #0x2\n"
+ "tbz x0, #0, 19f\n"
+ "ld1 { v30.b }[2], [x15]\n"
"b 19f\n"
"18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 19f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "tbz x0, #0, 19f\n"
+ "ld1 { v30.b }[0], [x15]\n"
"19:" // Oddments: Load (1, 4): Bit 2: End
"usubl v30.8h, v30.8b, v9.8b\n"
- "ldr x0, [x25, #0x60]\n"
- "smlal v15.4s, v24.4h, v4.4h\n"
- "add x0, x0, x10\n"
- "smlal v8.4s, v30.4h, v3.4h\n"
+ "ldr x19, [x20, #0x60]\n"
+ "smlal v6.4s, v30.4h, v3.4h\n"
"smlal2 v5.4s, v30.8h, v3.8h\n"
- "smlal2 v18.4s, v24.8h, v4.8h\n"
- "tbz x4, #2, 21f\n"
- "ld1 { v27.s }[0], [x0], #0x4\n"
- "tbz x4, #1, 20f\n"
- "ld1 { v27.h }[2], [x0], #0x2\n"
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[6], [x0]\n"
+ "smlal v15.4s, v24.4h, v4.4h\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "add x19, x19, x24\n"
+ "tbz x0, #2, 21f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x0, #1, 20f\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x0, #0, 23f\n"
+ "ld1 { v27.b }[6], [x19]\n"
"b 23f\n"
"20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[4], [x0]\n"
+ "tbz x0, #0, 23f\n"
+ "ld1 { v27.b }[4], [x19]\n"
"b 23f\n"
"21:" // Oddments: Load (0, 5): Bit 2: Unset
- "tbz x4, #1, 22f\n"
- "ld1 { v27.h }[0], [x0], #0x2\n"
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[2], [x0]\n"
+ "tbz x0, #1, 22f\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x0, #0, 23f\n"
+ "ld1 { v27.b }[2], [x19]\n"
"b 23f\n"
"22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 23f\n"
- "ld1 { v27.b }[0], [x0]\n"
+ "tbz x0, #0, 23f\n"
+ "ld1 { v27.b }[0], [x19]\n"
"23:" // Oddments: Load (0, 5): Bit 2: End
"usubl v27.8h, v27.8b, v9.8b\n"
- "ldr d0, [x3, #0x28]\n"
- "smlal v7.4s, v30.4h, v4.4h\n"
- "ldr x7, [x25, #0x68]\n"
- "add x7, x7, x10\n"
- "smlal v16.4s, v27.4h, v4.4h\n"
- "smlal2 v21.4s, v27.8h, v4.8h\n"
- "smlal2 v17.4s, v30.8h, v4.8h\n"
- "smlal v8.4s, v26.4h, v4.4h\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "ldr d0, [x23, #0x28]\n"
+ "smlal v17.4s, v27.4h, v4.4h\n"
+ "smlal2 v8.4s, v27.8h, v4.8h\n"
+ "smlal v10.4s, v30.4h, v4.4h\n"
+ "smlal2 v7.4s, v30.8h, v4.8h\n"
"ssubl v0.8h, v0.8b, v14.8b\n"
+ "ldr x27, [x20, #0x68]\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "add x27, x27, x24\n"
"smlal v15.4s, v29.4h, v0.4h\n"
- "smlal2 v18.4s, v29.8h, v0.8h\n"
- "smlal v16.4s, v28.4h, v0.4h\n"
- "smlal2 v21.4s, v28.8h, v0.8h\n"
- "smlal v7.4s, v22.4h, v0.4h\n"
- "smlal2 v17.4s, v22.8h, v0.8h\n"
- "tbz x4, #2, 25f\n"
- "ld1 { v25.s }[0], [x7], #0x4\n"
- "tbz x4, #1, 24f\n"
- "ld1 { v25.h }[2], [x7], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[6], [x7]\n"
+ "smlal2 v16.4s, v29.8h, v0.8h\n"
+ "smlal v17.4s, v28.4h, v0.4h\n"
+ "smlal2 v8.4s, v28.8h, v0.8h\n"
+ "smlal v10.4s, v22.4h, v0.4h\n"
+ "smlal2 v7.4s, v22.8h, v0.8h\n"
+ "tbz x0, #2, 25f\n"
+ "ld1 { v25.s }[0], [x27], #0x4\n"
+ "tbz x0, #1, 24f\n"
+ "ld1 { v25.h }[2], [x27], #0x2\n"
+ "tbz x0, #0, 27f\n"
+ "ld1 { v25.b }[6], [x27]\n"
"b 27f\n"
"24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[4], [x7]\n"
+ "tbz x0, #0, 27f\n"
+ "ld1 { v25.b }[4], [x27]\n"
"b 27f\n"
"25:" // Oddments: Load (2, 1): Bit 2: Unset
- "tbz x4, #1, 26f\n"
- "ld1 { v25.h }[0], [x7], #0x2\n"
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[2], [x7]\n"
+ "tbz x0, #1, 26f\n"
+ "ld1 { v25.h }[0], [x27], #0x2\n"
+ "tbz x0, #0, 27f\n"
+ "ld1 { v25.b }[2], [x27]\n"
"b 27f\n"
"26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 27f\n"
- "ld1 { v25.b }[0], [x7]\n"
+ "tbz x0, #0, 27f\n"
+ "ld1 { v25.b }[0], [x27]\n"
"27:" // Oddments: Load (2, 1): Bit 2: End
+ "ldr d1, [x23, #0x30]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "ldr d1, [x3, #0x30]\n"
- "smlal v8.4s, v25.4h, v0.4h\n"
- "ldr x26, [x25, #0x70]\n"
- "add x26, x26, x10\n"
- "smlal2 v5.4s, v25.8h, v0.8h\n"
"ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldr x5, [x20, #0x70]\n"
+ "smlal v6.4s, v25.4h, v0.4h\n"
+ "smlal2 v5.4s, v25.8h, v0.8h\n"
+ "add x5, x5, x24\n"
"smlal v15.4s, v28.4h, v1.4h\n"
- "smlal2 v18.4s, v28.8h, v1.8h\n"
- "smlal v16.4s, v23.4h, v1.4h\n"
- "smlal2 v21.4s, v23.8h, v1.8h\n"
- "smlal v7.4s, v25.4h, v1.4h\n"
- "smlal2 v17.4s, v25.8h, v1.8h\n"
- "tbz x4, #2, 29f\n"
- "ld1 { v24.s }[0], [x26], #0x4\n"
- "tbz x4, #1, 28f\n"
- "ld1 { v24.h }[2], [x26], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[6], [x26]\n"
+ "smlal2 v16.4s, v28.8h, v1.8h\n"
+ "smlal v17.4s, v23.4h, v1.4h\n"
+ "smlal2 v8.4s, v23.8h, v1.8h\n"
+ "smlal v10.4s, v25.4h, v1.4h\n"
+ "smlal2 v7.4s, v25.8h, v1.8h\n"
+ "tbz x0, #2, 29f\n"
+ "ld1 { v24.s }[0], [x5], #0x4\n"
+ "tbz x0, #1, 28f\n"
+ "ld1 { v24.h }[2], [x5], #0x2\n"
+ "tbz x0, #0, 31f\n"
+ "ld1 { v24.b }[6], [x5]\n"
"b 31f\n"
"28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[4], [x26]\n"
+ "tbz x0, #0, 31f\n"
+ "ld1 { v24.b }[4], [x5]\n"
"b 31f\n"
"29:" // Oddments: Load (2, 2): Bit 2: Unset
- "tbz x4, #1, 30f\n"
- "ld1 { v24.h }[0], [x26], #0x2\n"
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[2], [x26]\n"
+ "tbz x0, #1, 30f\n"
+ "ld1 { v24.h }[0], [x5], #0x2\n"
+ "tbz x0, #0, 31f\n"
+ "ld1 { v24.b }[2], [x5]\n"
"b 31f\n"
"30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 31f\n"
- "ld1 { v24.b }[0], [x26]\n"
+ "tbz x0, #0, 31f\n"
+ "ld1 { v24.b }[0], [x5]\n"
"31:" // Oddments: Load (2, 2): Bit 2: End
+ "ldr d2, [x23, #0x38]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ldr d2, [x3, #0x38]\n"
- "smlal v8.4s, v24.4h, v1.4h\n"
- "ldr x23, [x25, #0x78]\n"
- "add x23, x23, x10\n"
- "smlal2 v5.4s, v24.8h, v1.8h\n"
"ssubl v2.8h, v2.8b, v14.8b\n"
+ "ldr x11, [x20, #0x78]\n"
+ "smlal v6.4s, v24.4h, v1.4h\n"
+ "smlal2 v5.4s, v24.8h, v1.8h\n"
+ "add x11, x11, x24\n"
"smlal v15.4s, v23.4h, v2.4h\n"
- "smlal2 v18.4s, v23.8h, v2.8h\n"
- "smlal v16.4s, v31.4h, v2.4h\n"
- "smlal2 v21.4s, v31.8h, v2.8h\n"
- "smlal v7.4s, v24.4h, v2.4h\n"
- "smlal2 v17.4s, v24.8h, v2.8h\n"
- "tbz x4, #2, 33f\n"
- "ld1 { v27.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 32f\n"
- "ld1 { v27.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[6], [x23]\n"
+ "smlal2 v16.4s, v23.8h, v2.8h\n"
+ "smlal v17.4s, v31.4h, v2.4h\n"
+ "smlal2 v8.4s, v31.8h, v2.8h\n"
+ "smlal v10.4s, v24.4h, v2.4h\n"
+ "smlal2 v7.4s, v24.8h, v2.8h\n"
+ "tbz x0, #2, 33f\n"
+ "ld1 { v27.s }[0], [x11], #0x4\n"
+ "tbz x0, #1, 32f\n"
+ "ld1 { v27.h }[2], [x11], #0x2\n"
+ "tbz x0, #0, 35f\n"
+ "ld1 { v27.b }[6], [x11]\n"
"b 35f\n"
"32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[4], [x23]\n"
+ "tbz x0, #0, 35f\n"
+ "ld1 { v27.b }[4], [x11]\n"
"b 35f\n"
"33:" // Oddments: Load (2, 3): Bit 2: Unset
- "tbz x4, #1, 34f\n"
- "ld1 { v27.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[2], [x23]\n"
+ "tbz x0, #1, 34f\n"
+ "ld1 { v27.h }[0], [x11], #0x2\n"
+ "tbz x0, #0, 35f\n"
+ "ld1 { v27.b }[2], [x11]\n"
"b 35f\n"
"34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 35f\n"
- "ld1 { v27.b }[0], [x23]\n"
+ "tbz x0, #0, 35f\n"
+ "ld1 { v27.b }[0], [x11]\n"
"35:" // Oddments: Load (2, 3): Bit 2: End
+ "ldr d3, [x23, #0x40]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "ldr d3, [x3, #0x40]\n"
- "smlal v8.4s, v27.4h, v2.4h\n"
- "ldr x20, [x25, #0x80]\n"
- "add x20, x20, x10\n"
- "smlal2 v5.4s, v27.8h, v2.8h\n"
"ssubl v3.8h, v3.8b, v14.8b\n"
+ "ldr x12, [x20, #0x80]\n"
+ "smlal v6.4s, v27.4h, v2.4h\n"
+ "smlal2 v5.4s, v27.8h, v2.8h\n"
+ "add x12, x12, x24\n"
"smlal v15.4s, v31.4h, v3.4h\n"
- "smlal2 v18.4s, v31.8h, v3.8h\n"
- "smlal v16.4s, v30.4h, v3.4h\n"
- "smlal2 v21.4s, v30.8h, v3.8h\n"
- "smlal v7.4s, v27.4h, v3.4h\n"
- "smlal2 v17.4s, v27.8h, v3.8h\n"
- "tbz x4, #2, 37f\n"
- "ld1 { v23.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 36f\n"
- "ld1 { v23.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[6], [x20]\n"
+ "smlal2 v16.4s, v31.8h, v3.8h\n"
+ "smlal v17.4s, v30.4h, v3.4h\n"
+ "smlal2 v8.4s, v30.8h, v3.8h\n"
+ "smlal v10.4s, v27.4h, v3.4h\n"
+ "smlal2 v7.4s, v27.8h, v3.8h\n"
+ "tbz x0, #2, 37f\n"
+ "ld1 { v23.s }[0], [x12], #0x4\n"
+ "tbz x0, #1, 36f\n"
+ "ld1 { v23.h }[2], [x12], #0x2\n"
+ "tbz x0, #0, 39f\n"
+ "ld1 { v23.b }[6], [x12]\n"
"b 39f\n"
"36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[4], [x20]\n"
+ "tbz x0, #0, 39f\n"
+ "ld1 { v23.b }[4], [x12]\n"
"b 39f\n"
"37:" // Oddments: Load (2, 4): Bit 2: Unset
- "tbz x4, #1, 38f\n"
- "ld1 { v23.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[2], [x20]\n"
+ "tbz x0, #1, 38f\n"
+ "ld1 { v23.h }[0], [x12], #0x2\n"
+ "tbz x0, #0, 39f\n"
+ "ld1 { v23.b }[2], [x12]\n"
"b 39f\n"
"38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 39f\n"
- "ld1 { v23.b }[0], [x20]\n"
+ "tbz x0, #0, 39f\n"
+ "ld1 { v23.b }[0], [x12]\n"
"39:" // Oddments: Load (2, 4): Bit 2: End
+ "ldr d4, [x23, #0x48]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "ldr d4, [x3, #0x48]\n"
- "smlal v8.4s, v23.4h, v3.4h\n"
- "ldr x22, [x25, #0x88]\n"
- "add x22, x22, x10\n"
- "smlal2 v5.4s, v23.8h, v3.8h\n"
"ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x26, [x20, #0x88]\n"
+ "smlal v6.4s, v23.4h, v3.4h\n"
+ "smlal2 v5.4s, v23.8h, v3.8h\n"
+ "add x26, x26, x24\n"
"smlal v15.4s, v30.4h, v4.4h\n"
- "smlal2 v18.4s, v30.8h, v4.8h\n"
- "smlal v16.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "smlal v7.4s, v23.4h, v4.4h\n"
- "smlal2 v17.4s, v23.8h, v4.8h\n"
- "tbz x4, #2, 41f\n"
- "ld1 { v28.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 40f\n"
- "ld1 { v28.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[6], [x22]\n"
+ "smlal2 v16.4s, v30.8h, v4.8h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "smlal v10.4s, v23.4h, v4.4h\n"
+ "smlal2 v7.4s, v23.8h, v4.8h\n"
+ "tbz x0, #2, 41f\n"
+ "ld1 { v28.s }[0], [x26], #0x4\n"
+ "tbz x0, #1, 40f\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "tbz x0, #0, 43f\n"
+ "ld1 { v28.b }[6], [x26]\n"
"b 43f\n"
"40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[4], [x22]\n"
+ "tbz x0, #0, 43f\n"
+ "ld1 { v28.b }[4], [x26]\n"
"b 43f\n"
"41:" // Oddments: Load (2, 5): Bit 2: Unset
- "tbz x4, #1, 42f\n"
- "ld1 { v28.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[2], [x22]\n"
+ "tbz x0, #1, 42f\n"
+ "ld1 { v28.h }[0], [x26], #0x2\n"
+ "tbz x0, #0, 43f\n"
+ "ld1 { v28.b }[2], [x26]\n"
"b 43f\n"
"42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 43f\n"
- "ld1 { v28.b }[0], [x22]\n"
+ "tbz x0, #0, 43f\n"
+ "ld1 { v28.b }[0], [x26]\n"
"43:" // Oddments: Load (2, 5): Bit 2: End
+ "ldr d0, [x23, #0x50]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "ldr d0, [x3, #0x50]\n"
- "smlal v8.4s, v28.4h, v4.4h\n"
- "ldr x13, [x25, #0x90]\n"
- "add x13, x13, x10\n"
- "smlal2 v5.4s, v28.8h, v4.8h\n"
"ssubl v0.8h, v0.8b, v14.8b\n"
+ "ldr x14, [x20, #0x90]\n"
+ "smlal v6.4s, v28.4h, v4.4h\n"
+ "smlal2 v5.4s, v28.8h, v4.8h\n"
+ "add x14, x14, x24\n"
"smlal v15.4s, v22.4h, v0.4h\n"
- "smlal2 v18.4s, v22.8h, v0.8h\n"
- "smlal v16.4s, v25.4h, v0.4h\n"
- "smlal2 v21.4s, v25.8h, v0.8h\n"
- "tbz x4, #2, 45f\n"
- "ld1 { v31.s }[0], [x13], #0x4\n"
- "tbz x4, #1, 44f\n"
- "ld1 { v31.h }[2], [x13], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[6], [x13]\n"
+ "smlal2 v16.4s, v22.8h, v0.8h\n"
+ "smlal v17.4s, v25.4h, v0.4h\n"
+ "smlal2 v8.4s, v25.8h, v0.8h\n"
+ "tbz x0, #2, 45f\n"
+ "ld1 { v31.s }[0], [x14], #0x4\n"
+ "tbz x0, #1, 44f\n"
+ "ld1 { v31.h }[2], [x14], #0x2\n"
+ "tbz x0, #0, 47f\n"
+ "ld1 { v31.b }[6], [x14]\n"
"b 47f\n"
"44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[4], [x13]\n"
+ "tbz x0, #0, 47f\n"
+ "ld1 { v31.b }[4], [x14]\n"
"b 47f\n"
"45:" // Oddments: Load (3, 0): Bit 2: Unset
- "tbz x4, #1, 46f\n"
- "ld1 { v31.h }[0], [x13], #0x2\n"
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[2], [x13]\n"
+ "tbz x0, #1, 46f\n"
+ "ld1 { v31.h }[0], [x14], #0x2\n"
+ "tbz x0, #0, 47f\n"
+ "ld1 { v31.b }[2], [x14]\n"
"b 47f\n"
"46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 47f\n"
- "ld1 { v31.b }[0], [x13]\n"
+ "tbz x0, #0, 47f\n"
+ "ld1 { v31.b }[0], [x14]\n"
"47:" // Oddments: Load (3, 0): Bit 2: End
"usubl v31.8h, v31.8b, v9.8b\n"
- "ldr x21, [x25, #0x98]\n"
- "smlal v7.4s, v31.4h, v0.4h\n"
- "add x21, x21, x10\n"
- "smlal2 v17.4s, v31.8h, v0.8h\n"
- "tbz x4, #2, 49f\n"
- "ld1 { v30.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 48f\n"
- "ld1 { v30.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[6], [x21]\n"
+ "ldr x15, [x20, #0x98]\n"
+ "smlal v10.4s, v31.4h, v0.4h\n"
+ "smlal2 v7.4s, v31.8h, v0.8h\n"
+ "add x15, x15, x24\n"
+ "tbz x0, #2, 49f\n"
+ "ld1 { v30.s }[0], [x15], #0x4\n"
+ "tbz x0, #1, 48f\n"
+ "ld1 { v30.h }[2], [x15], #0x2\n"
+ "tbz x0, #0, 51f\n"
+ "ld1 { v30.b }[6], [x15]\n"
"b 51f\n"
"48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[4], [x21]\n"
+ "tbz x0, #0, 51f\n"
+ "ld1 { v30.b }[4], [x15]\n"
"b 51f\n"
"49:" // Oddments: Load (3, 1): Bit 2: Unset
- "tbz x4, #1, 50f\n"
- "ld1 { v30.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[2], [x21]\n"
+ "tbz x0, #1, 50f\n"
+ "ld1 { v30.h }[0], [x15], #0x2\n"
+ "tbz x0, #0, 51f\n"
+ "ld1 { v30.b }[2], [x15]\n"
"b 51f\n"
"50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 51f\n"
- "ld1 { v30.b }[0], [x21]\n"
+ "tbz x0, #0, 51f\n"
+ "ld1 { v30.b }[0], [x15]\n"
"51:" // Oddments: Load (3, 1): Bit 2: End
+ "ldr d1, [x23, #0x58]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "ldr d1, [x3, #0x58]\n"
- "smlal v8.4s, v30.4h, v0.4h\n"
- "ldr x14, [x25, #0xa0]\n"
- "add x14, x14, x10\n"
- "smlal2 v5.4s, v30.8h, v0.8h\n"
"ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldr x21, [x20, #0xa0]\n"
+ "smlal v6.4s, v30.4h, v0.4h\n"
+ "smlal2 v5.4s, v30.8h, v0.8h\n"
+ "add x21, x21, x24\n"
"smlal v15.4s, v25.4h, v1.4h\n"
- "smlal2 v18.4s, v25.8h, v1.8h\n"
- "smlal v16.4s, v24.4h, v1.4h\n"
- "smlal2 v21.4s, v24.8h, v1.8h\n"
- "smlal v7.4s, v30.4h, v1.4h\n"
- "smlal2 v17.4s, v30.8h, v1.8h\n"
- "tbz x4, #2, 53f\n"
- "ld1 { v26.s }[0], [x14], #0x4\n"
- "tbz x4, #1, 52f\n"
- "ld1 { v26.h }[2], [x14], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[6], [x14]\n"
+ "smlal2 v16.4s, v25.8h, v1.8h\n"
+ "smlal v17.4s, v24.4h, v1.4h\n"
+ "smlal2 v8.4s, v24.8h, v1.8h\n"
+ "smlal v10.4s, v30.4h, v1.4h\n"
+ "smlal2 v7.4s, v30.8h, v1.8h\n"
+ "tbz x0, #2, 53f\n"
+ "ld1 { v26.s }[0], [x21], #0x4\n"
+ "tbz x0, #1, 52f\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "tbz x0, #0, 55f\n"
+ "ld1 { v26.b }[6], [x21]\n"
"b 55f\n"
"52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[4], [x14]\n"
+ "tbz x0, #0, 55f\n"
+ "ld1 { v26.b }[4], [x21]\n"
"b 55f\n"
"53:" // Oddments: Load (3, 2): Bit 2: Unset
- "tbz x4, #1, 54f\n"
- "ld1 { v26.h }[0], [x14], #0x2\n"
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[2], [x14]\n"
+ "tbz x0, #1, 54f\n"
+ "ld1 { v26.h }[0], [x21], #0x2\n"
+ "tbz x0, #0, 55f\n"
+ "ld1 { v26.b }[2], [x21]\n"
"b 55f\n"
"54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 55f\n"
- "ld1 { v26.b }[0], [x14]\n"
+ "tbz x0, #0, 55f\n"
+ "ld1 { v26.b }[0], [x21]\n"
"55:" // Oddments: Load (3, 2): Bit 2: End
+ "ldr d2, [x23, #0x60]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "ldr d2, [x3, #0x60]\n"
- "smlal v8.4s, v26.4h, v1.4h\n"
- "ldr x11, [x25, #0xa8]\n"
- "add x11, x11, x10\n"
- "smlal2 v5.4s, v26.8h, v1.8h\n"
"ssubl v2.8h, v2.8b, v14.8b\n"
+ "ldr x2, [x20, #0xa8]\n"
+ "smlal v6.4s, v26.4h, v1.4h\n"
+ "smlal2 v5.4s, v26.8h, v1.8h\n"
+ "add x2, x2, x24\n"
"smlal v15.4s, v24.4h, v2.4h\n"
- "smlal2 v18.4s, v24.8h, v2.8h\n"
- "smlal v16.4s, v27.4h, v2.4h\n"
- "smlal2 v21.4s, v27.8h, v2.8h\n"
- "smlal v7.4s, v26.4h, v2.4h\n"
- "smlal2 v17.4s, v26.8h, v2.8h\n"
- "tbz x4, #2, 57f\n"
- "ld1 { v25.s }[0], [x11], #0x4\n"
- "tbz x4, #1, 56f\n"
- "ld1 { v25.h }[2], [x11], #0x2\n"
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[6], [x11]\n"
+ "smlal2 v16.4s, v24.8h, v2.8h\n"
+ "smlal v17.4s, v27.4h, v2.4h\n"
+ "smlal2 v8.4s, v27.8h, v2.8h\n"
+ "smlal v10.4s, v26.4h, v2.4h\n"
+ "smlal2 v7.4s, v26.8h, v2.8h\n"
+ "tbz x0, #2, 57f\n"
+ "ld1 { v25.s }[0], [x2], #0x4\n"
+ "tbz x0, #1, 56f\n"
+ "ld1 { v25.h }[2], [x2], #0x2\n"
+ "tbz x0, #0, 59f\n"
+ "ld1 { v25.b }[6], [x2]\n"
"b 59f\n"
"56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[4], [x11]\n"
+ "tbz x0, #0, 59f\n"
+ "ld1 { v25.b }[4], [x2]\n"
"b 59f\n"
"57:" // Oddments: Load (3, 3): Bit 2: Unset
- "tbz x4, #1, 58f\n"
- "ld1 { v25.h }[0], [x11], #0x2\n"
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[2], [x11]\n"
+ "tbz x0, #1, 58f\n"
+ "ld1 { v25.h }[0], [x2], #0x2\n"
+ "tbz x0, #0, 59f\n"
+ "ld1 { v25.b }[2], [x2]\n"
"b 59f\n"
"58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 59f\n"
- "ld1 { v25.b }[0], [x11]\n"
+ "tbz x0, #0, 59f\n"
+ "ld1 { v25.b }[0], [x2]\n"
"59:" // Oddments: Load (3, 3): Bit 2: End
+ "ldr d3, [x23, #0x68]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "ldr d3, [x3, #0x68]\n"
- "smlal v8.4s, v25.4h, v2.4h\n"
- "ldr x24, [x25, #0xb0]\n"
- "add x24, x24, x10\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
"ssubl v3.8h, v3.8b, v14.8b\n"
+ "ldr x13, [x20, #0xb0]\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "add x13, x13, x24\n"
"smlal v15.4s, v27.4h, v3.4h\n"
- "smlal2 v18.4s, v27.8h, v3.8h\n"
- "smlal v16.4s, v23.4h, v3.4h\n"
- "smlal2 v21.4s, v23.8h, v3.8h\n"
- "smlal v7.4s, v25.4h, v3.4h\n"
- "smlal2 v17.4s, v25.8h, v3.8h\n"
- "tbz x4, #2, 61f\n"
- "ld1 { v24.s }[0], [x24], #0x4\n"
- "tbz x4, #1, 60f\n"
- "ld1 { v24.h }[2], [x24], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[6], [x24]\n"
+ "smlal2 v16.4s, v27.8h, v3.8h\n"
+ "smlal v17.4s, v23.4h, v3.4h\n"
+ "smlal2 v8.4s, v23.8h, v3.8h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "tbz x0, #2, 61f\n"
+ "ld1 { v24.s }[0], [x13], #0x4\n"
+ "tbz x0, #1, 60f\n"
+ "ld1 { v24.h }[2], [x13], #0x2\n"
+ "tbz x0, #0, 63f\n"
+ "ld1 { v24.b }[6], [x13]\n"
"b 63f\n"
"60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[4], [x24]\n"
+ "tbz x0, #0, 63f\n"
+ "ld1 { v24.b }[4], [x13]\n"
"b 63f\n"
"61:" // Oddments: Load (3, 4): Bit 2: Unset
- "tbz x4, #1, 62f\n"
- "ld1 { v24.h }[0], [x24], #0x2\n"
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[2], [x24]\n"
+ "tbz x0, #1, 62f\n"
+ "ld1 { v24.h }[0], [x13], #0x2\n"
+ "tbz x0, #0, 63f\n"
+ "ld1 { v24.b }[2], [x13]\n"
"b 63f\n"
"62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 63f\n"
- "ld1 { v24.b }[0], [x24]\n"
+ "tbz x0, #0, 63f\n"
+ "ld1 { v24.b }[0], [x13]\n"
"63:" // Oddments: Load (3, 4): Bit 2: End
+ "ldr d4, [x23, #0x70]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ldr d4, [x3, #0x70]\n"
- "smlal v8.4s, v24.4h, v3.4h\n"
- "ldr x0, [x25, #0xb8]\n"
- "add x0, x0, x10\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
"ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x9, [x20, #0xb8]\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "add x9, x9, x24\n"
"smlal v15.4s, v23.4h, v4.4h\n"
- "smlal2 v18.4s, v23.8h, v4.8h\n"
- "smlal v16.4s, v28.4h, v4.4h\n"
- "smlal2 v21.4s, v28.8h, v4.8h\n"
- "smlal v7.4s, v24.4h, v4.4h\n"
- "smlal2 v17.4s, v24.8h, v4.8h\n"
- "tbz x4, #2, 65f\n"
- "ld1 { v22.s }[0], [x0], #0x4\n"
- "tbz x4, #1, 64f\n"
- "ld1 { v22.h }[2], [x0], #0x2\n"
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[6], [x0]\n"
+ "smlal2 v16.4s, v23.8h, v4.8h\n"
+ "smlal v17.4s, v28.4h, v4.4h\n"
+ "smlal2 v8.4s, v28.8h, v4.8h\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "tbz x0, #2, 65f\n"
+ "ld1 { v22.s }[0], [x9], #0x4\n"
+ "tbz x0, #1, 64f\n"
+ "ld1 { v22.h }[2], [x9], #0x2\n"
+ "tbz x0, #0, 67f\n"
+ "ld1 { v22.b }[6], [x9]\n"
"b 67f\n"
"64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[4], [x0]\n"
+ "tbz x0, #0, 67f\n"
+ "ld1 { v22.b }[4], [x9]\n"
"b 67f\n"
"65:" // Oddments: Load (3, 5): Bit 2: Unset
- "tbz x4, #1, 66f\n"
- "ld1 { v22.h }[0], [x0], #0x2\n"
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[2], [x0]\n"
+ "tbz x0, #1, 66f\n"
+ "ld1 { v22.h }[0], [x9], #0x2\n"
+ "tbz x0, #0, 67f\n"
+ "ld1 { v22.b }[2], [x9]\n"
"b 67f\n"
"66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 67f\n"
- "ld1 { v22.b }[0], [x0]\n"
+ "tbz x0, #0, 67f\n"
+ "ld1 { v22.b }[0], [x9]\n"
"67:" // Oddments: Load (3, 5): Bit 2: End
+ "ldr d0, [x23, #0x78]\n"
"usubl v22.8h, v22.8b, v9.8b\n"
- "ldr d0, [x3, #0x78]\n"
- "smlal v8.4s, v22.4h, v4.4h\n"
- "ldr x15, [x25, #0xc0]\n"
- "add x15, x15, x10\n"
- "smlal2 v5.4s, v22.8h, v4.8h\n"
"ssubl v0.8h, v0.8b, v14.8b\n"
+ "ldr x19, [x20, #0xc0]\n"
+ "smlal v6.4s, v22.4h, v4.4h\n"
+ "smlal2 v5.4s, v22.8h, v4.8h\n"
+ "add x19, x19, x24\n"
"smlal v15.4s, v31.4h, v0.4h\n"
- "smlal2 v18.4s, v31.8h, v0.8h\n"
- "smlal v16.4s, v30.4h, v0.4h\n"
- "smlal2 v21.4s, v30.8h, v0.8h\n"
- "tbz x4, #2, 69f\n"
- "ld1 { v27.s }[0], [x15], #0x4\n"
- "tbz x4, #1, 68f\n"
- "ld1 { v27.h }[2], [x15], #0x2\n"
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[6], [x15]\n"
+ "smlal2 v16.4s, v31.8h, v0.8h\n"
+ "smlal v17.4s, v30.4h, v0.4h\n"
+ "smlal2 v8.4s, v30.8h, v0.8h\n"
+ "tbz x0, #2, 69f\n"
+ "ld1 { v27.s }[0], [x19], #0x4\n"
+ "tbz x0, #1, 68f\n"
+ "ld1 { v27.h }[2], [x19], #0x2\n"
+ "tbz x0, #0, 71f\n"
+ "ld1 { v27.b }[6], [x19]\n"
"b 71f\n"
"68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[4], [x15]\n"
+ "tbz x0, #0, 71f\n"
+ "ld1 { v27.b }[4], [x19]\n"
"b 71f\n"
"69:" // Oddments: Load (4, 0): Bit 2: Unset
- "tbz x4, #1, 70f\n"
- "ld1 { v27.h }[0], [x15], #0x2\n"
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[2], [x15]\n"
+ "tbz x0, #1, 70f\n"
+ "ld1 { v27.h }[0], [x19], #0x2\n"
+ "tbz x0, #0, 71f\n"
+ "ld1 { v27.b }[2], [x19]\n"
"b 71f\n"
"70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 71f\n"
- "ld1 { v27.b }[0], [x15]\n"
+ "tbz x0, #0, 71f\n"
+ "ld1 { v27.b }[0], [x19]\n"
"71:" // Oddments: Load (4, 0): Bit 2: End
"usubl v27.8h, v27.8b, v9.8b\n"
- "ldr x9, [x25, #0xc8]\n"
- "smlal v7.4s, v27.4h, v0.4h\n"
- "add x9, x9, x10\n"
- "smlal2 v17.4s, v27.8h, v0.8h\n"
- "tbz x4, #2, 73f\n"
- "ld1 { v23.s }[0], [x9], #0x4\n"
- "tbz x4, #1, 72f\n"
- "ld1 { v23.h }[2], [x9], #0x2\n"
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[6], [x9]\n"
+ "ldr x28, [x20, #0xc8]\n"
+ "smlal v10.4s, v27.4h, v0.4h\n"
+ "smlal2 v7.4s, v27.8h, v0.8h\n"
+ "add x28, x28, x24\n"
+ "tbz x0, #2, 73f\n"
+ "ld1 { v23.s }[0], [x28], #0x4\n"
+ "tbz x0, #1, 72f\n"
+ "ld1 { v23.h }[2], [x28], #0x2\n"
+ "tbz x0, #0, 75f\n"
+ "ld1 { v23.b }[6], [x28]\n"
"b 75f\n"
"72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[4], [x9]\n"
+ "tbz x0, #0, 75f\n"
+ "ld1 { v23.b }[4], [x28]\n"
"b 75f\n"
"73:" // Oddments: Load (4, 1): Bit 2: Unset
- "tbz x4, #1, 74f\n"
- "ld1 { v23.h }[0], [x9], #0x2\n"
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[2], [x9]\n"
+ "tbz x0, #1, 74f\n"
+ "ld1 { v23.h }[0], [x28], #0x2\n"
+ "tbz x0, #0, 75f\n"
+ "ld1 { v23.b }[2], [x28]\n"
"b 75f\n"
"74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 75f\n"
- "ld1 { v23.b }[0], [x9]\n"
+ "tbz x0, #0, 75f\n"
+ "ld1 { v23.b }[0], [x28]\n"
"75:" // Oddments: Load (4, 1): Bit 2: End
+ "ldr d1, [x23, #0x80]\n"
"usubl v23.8h, v23.8b, v9.8b\n"
- "ldr d1, [x3, #0x80]\n"
- "smlal v8.4s, v23.4h, v0.4h\n"
- "ldr x27, [x25, #0xd0]\n"
- "add x27, x27, x10\n"
- "smlal2 v5.4s, v23.8h, v0.8h\n"
"ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldr x6, [x20, #0xd0]\n"
+ "smlal v6.4s, v23.4h, v0.4h\n"
+ "smlal2 v5.4s, v23.8h, v0.8h\n"
+ "add x6, x6, x24\n"
"smlal v15.4s, v30.4h, v1.4h\n"
- "smlal2 v18.4s, v30.8h, v1.8h\n"
- "smlal v16.4s, v26.4h, v1.4h\n"
- "smlal2 v21.4s, v26.8h, v1.8h\n"
- "smlal v7.4s, v23.4h, v1.4h\n"
- "smlal2 v17.4s, v23.8h, v1.8h\n"
- "tbz x4, #2, 77f\n"
- "ld1 { v31.s }[0], [x27], #0x4\n"
- "tbz x4, #1, 76f\n"
- "ld1 { v31.h }[2], [x27], #0x2\n"
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[6], [x27]\n"
+ "smlal2 v16.4s, v30.8h, v1.8h\n"
+ "smlal v17.4s, v26.4h, v1.4h\n"
+ "smlal2 v8.4s, v26.8h, v1.8h\n"
+ "smlal v10.4s, v23.4h, v1.4h\n"
+ "smlal2 v7.4s, v23.8h, v1.8h\n"
+ "tbz x0, #2, 77f\n"
+ "ld1 { v31.s }[0], [x6], #0x4\n"
+ "tbz x0, #1, 76f\n"
+ "ld1 { v31.h }[2], [x6], #0x2\n"
+ "tbz x0, #0, 79f\n"
+ "ld1 { v31.b }[6], [x6]\n"
"b 79f\n"
"76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[4], [x27]\n"
+ "tbz x0, #0, 79f\n"
+ "ld1 { v31.b }[4], [x6]\n"
"b 79f\n"
"77:" // Oddments: Load (4, 2): Bit 2: Unset
- "tbz x4, #1, 78f\n"
- "ld1 { v31.h }[0], [x27], #0x2\n"
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[2], [x27]\n"
+ "tbz x0, #1, 78f\n"
+ "ld1 { v31.h }[0], [x6], #0x2\n"
+ "tbz x0, #0, 79f\n"
+ "ld1 { v31.b }[2], [x6]\n"
"b 79f\n"
"78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 79f\n"
- "ld1 { v31.b }[0], [x27]\n"
+ "tbz x0, #0, 79f\n"
+ "ld1 { v31.b }[0], [x6]\n"
"79:" // Oddments: Load (4, 2): Bit 2: End
+ "ldr d2, [x23, #0x88]\n"
"usubl v31.8h, v31.8b, v9.8b\n"
- "ldr d2, [x3, #0x88]\n"
- "smlal v8.4s, v31.4h, v1.4h\n"
- "ldr x28, [x25, #0xd8]\n"
- "add x28, x28, x10\n"
- "smlal2 v5.4s, v31.8h, v1.8h\n"
"ssubl v2.8h, v2.8b, v14.8b\n"
+ "ldr x27, [x20, #0xd8]\n"
+ "smlal v6.4s, v31.4h, v1.4h\n"
+ "smlal2 v5.4s, v31.8h, v1.8h\n"
+ "add x27, x27, x24\n"
"smlal v15.4s, v26.4h, v2.4h\n"
- "smlal2 v18.4s, v26.8h, v2.8h\n"
- "smlal v16.4s, v25.4h, v2.4h\n"
- "smlal2 v21.4s, v25.8h, v2.8h\n"
- "smlal v7.4s, v31.4h, v2.4h\n"
- "smlal2 v17.4s, v31.8h, v2.8h\n"
- "tbz x4, #2, 81f\n"
- "ld1 { v30.s }[0], [x28], #0x4\n"
- "tbz x4, #1, 80f\n"
- "ld1 { v30.h }[2], [x28], #0x2\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[6], [x28]\n"
+ "smlal2 v16.4s, v26.8h, v2.8h\n"
+ "smlal v17.4s, v25.4h, v2.4h\n"
+ "smlal2 v8.4s, v25.8h, v2.8h\n"
+ "smlal v10.4s, v31.4h, v2.4h\n"
+ "smlal2 v7.4s, v31.8h, v2.8h\n"
+ "tbz x0, #2, 81f\n"
+ "ld1 { v30.s }[0], [x27], #0x4\n"
+ "tbz x0, #1, 80f\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n"
+ "tbz x0, #0, 83f\n"
+ "ld1 { v30.b }[6], [x27]\n"
"b 83f\n"
"80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[4], [x28]\n"
+ "tbz x0, #0, 83f\n"
+ "ld1 { v30.b }[4], [x27]\n"
"b 83f\n"
"81:" // Oddments: Load (4, 3): Bit 2: Unset
- "tbz x4, #1, 82f\n"
- "ld1 { v30.h }[0], [x28], #0x2\n"
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[2], [x28]\n"
+ "tbz x0, #1, 82f\n"
+ "ld1 { v30.h }[0], [x27], #0x2\n"
+ "tbz x0, #0, 83f\n"
+ "ld1 { v30.b }[2], [x27]\n"
"b 83f\n"
"82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 83f\n"
- "ld1 { v30.b }[0], [x28]\n"
+ "tbz x0, #0, 83f\n"
+ "ld1 { v30.b }[0], [x27]\n"
"83:" // Oddments: Load (4, 3): Bit 2: End
+ "ldr d3, [x23, #0x90]\n"
"usubl v30.8h, v30.8b, v9.8b\n"
- "ldr d3, [x3, #0x90]\n"
- "smlal v8.4s, v30.4h, v2.4h\n"
- "ldr x12, [x25, #0xe0]\n"
- "add x12, x12, x10\n"
- "smlal2 v5.4s, v30.8h, v2.8h\n"
"ssubl v3.8h, v3.8b, v14.8b\n"
+ "ldr x11, [x20, #0xe0]\n"
+ "smlal v6.4s, v30.4h, v2.4h\n"
+ "smlal2 v5.4s, v30.8h, v2.8h\n"
+ "add x11, x11, x24\n"
"smlal v15.4s, v25.4h, v3.4h\n"
- "smlal2 v18.4s, v25.8h, v3.8h\n"
- "smlal v16.4s, v24.4h, v3.4h\n"
- "smlal2 v21.4s, v24.8h, v3.8h\n"
- "smlal v7.4s, v30.4h, v3.4h\n"
- "smlal2 v17.4s, v30.8h, v3.8h\n"
- "tbz x4, #2, 85f\n"
- "ld1 { v28.s }[0], [x12], #0x4\n"
- "tbz x4, #1, 84f\n"
- "ld1 { v28.h }[2], [x12], #0x2\n"
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[6], [x12]\n"
+ "smlal2 v16.4s, v25.8h, v3.8h\n"
+ "smlal v17.4s, v24.4h, v3.4h\n"
+ "smlal2 v8.4s, v24.8h, v3.8h\n"
+ "smlal v10.4s, v30.4h, v3.4h\n"
+ "smlal2 v7.4s, v30.8h, v3.8h\n"
+ "tbz x0, #2, 85f\n"
+ "ld1 { v28.s }[0], [x11], #0x4\n"
+ "tbz x0, #1, 84f\n"
+ "ld1 { v28.h }[2], [x11], #0x2\n"
+ "tbz x0, #0, 87f\n"
+ "ld1 { v28.b }[6], [x11]\n"
"b 87f\n"
"84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[4], [x12]\n"
+ "tbz x0, #0, 87f\n"
+ "ld1 { v28.b }[4], [x11]\n"
"b 87f\n"
"85:" // Oddments: Load (4, 4): Bit 2: Unset
- "tbz x4, #1, 86f\n"
- "ld1 { v28.h }[0], [x12], #0x2\n"
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[2], [x12]\n"
+ "tbz x0, #1, 86f\n"
+ "ld1 { v28.h }[0], [x11], #0x2\n"
+ "tbz x0, #0, 87f\n"
+ "ld1 { v28.b }[2], [x11]\n"
"b 87f\n"
"86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 87f\n"
- "ld1 { v28.b }[0], [x12]\n"
+ "tbz x0, #0, 87f\n"
+ "ld1 { v28.b }[0], [x11]\n"
"87:" // Oddments: Load (4, 4): Bit 2: End
+ "ldr d4, [x23, #0x98]\n"
"usubl v28.8h, v28.8b, v9.8b\n"
- "ldr d4, [x3, #0x98]\n"
- "smlal v8.4s, v28.4h, v3.4h\n"
- "ldr x7, [x25, #0xe8]\n"
- "add x7, x7, x10\n"
- "smlal2 v5.4s, v28.8h, v3.8h\n"
"ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x17, [x20, #0xe8]\n"
+ "smlal v6.4s, v28.4h, v3.4h\n"
+ "smlal2 v5.4s, v28.8h, v3.8h\n"
+ "add x17, x17, x24\n"
"smlal v15.4s, v24.4h, v4.4h\n"
- "smlal2 v18.4s, v24.8h, v4.8h\n"
- "smlal v16.4s, v22.4h, v4.4h\n"
- "smlal2 v21.4s, v22.8h, v4.8h\n"
- "smlal v7.4s, v28.4h, v4.4h\n"
- "smlal2 v17.4s, v28.8h, v4.8h\n"
- "tbz x4, #2, 89f\n"
- "ld1 { v26.s }[0], [x7], #0x4\n"
- "tbz x4, #1, 88f\n"
- "ld1 { v26.h }[2], [x7], #0x2\n"
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[6], [x7]\n"
+ "smlal2 v16.4s, v24.8h, v4.8h\n"
+ "smlal v17.4s, v22.4h, v4.4h\n"
+ "smlal2 v8.4s, v22.8h, v4.8h\n"
+ "smlal v10.4s, v28.4h, v4.4h\n"
+ "smlal2 v7.4s, v28.8h, v4.8h\n"
+ "tbz x0, #2, 89f\n"
+ "ld1 { v26.s }[0], [x17], #0x4\n"
+ "tbz x0, #1, 88f\n"
+ "ld1 { v26.h }[2], [x17], #0x2\n"
+ "tbz x0, #0, 91f\n"
+ "ld1 { v26.b }[6], [x17]\n"
"b 91f\n"
"88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[4], [x7]\n"
+ "tbz x0, #0, 91f\n"
+ "ld1 { v26.b }[4], [x17]\n"
"b 91f\n"
"89:" // Oddments: Load (4, 5): Bit 2: Unset
- "tbz x4, #1, 90f\n"
- "ld1 { v26.h }[0], [x7], #0x2\n"
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[2], [x7]\n"
+ "tbz x0, #1, 90f\n"
+ "ld1 { v26.h }[0], [x17], #0x2\n"
+ "tbz x0, #0, 91f\n"
+ "ld1 { v26.b }[2], [x17]\n"
"b 91f\n"
"90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 91f\n"
- "ld1 { v26.b }[0], [x7]\n"
+ "tbz x0, #0, 91f\n"
+ "ld1 { v26.b }[0], [x17]\n"
"91:" // Oddments: Load (4, 5): Bit 2: End
+ "ldr d0, [x23, #0xa0]\n"
"usubl v26.8h, v26.8b, v9.8b\n"
- "ldr d0, [x3, #0xa0]\n"
- "smlal v8.4s, v26.4h, v4.4h\n"
- "ldr x26, [x25, #0xf0]\n"
- "add x26, x26, x10\n"
- "smlal2 v5.4s, v26.8h, v4.8h\n"
"ssubl v0.8h, v0.8b, v14.8b\n"
+ "ldr x5, [x20, #0xf0]\n"
+ "smlal v6.4s, v26.4h, v4.4h\n"
+ "smlal2 v5.4s, v26.8h, v4.8h\n"
+ "add x5, x5, x24\n"
"smlal v15.4s, v27.4h, v0.4h\n"
- "smlal2 v18.4s, v27.8h, v0.8h\n"
- "smlal v16.4s, v23.4h, v0.4h\n"
- "smlal2 v21.4s, v23.8h, v0.8h\n"
- "tbz x4, #2, 93f\n"
- "ld1 { v25.s }[0], [x26], #0x4\n"
- "tbz x4, #1, 92f\n"
- "ld1 { v25.h }[2], [x26], #0x2\n"
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[6], [x26]\n"
+ "smlal2 v16.4s, v27.8h, v0.8h\n"
+ "smlal v17.4s, v23.4h, v0.4h\n"
+ "smlal2 v8.4s, v23.8h, v0.8h\n"
+ "tbz x0, #2, 93f\n"
+ "ld1 { v25.s }[0], [x5], #0x4\n"
+ "tbz x0, #1, 92f\n"
+ "ld1 { v25.h }[2], [x5], #0x2\n"
+ "tbz x0, #0, 95f\n"
+ "ld1 { v25.b }[6], [x5]\n"
"b 95f\n"
"92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[4], [x26]\n"
+ "tbz x0, #0, 95f\n"
+ "ld1 { v25.b }[4], [x5]\n"
"b 95f\n"
"93:" // Oddments: Load (5, 0): Bit 2: Unset
- "tbz x4, #1, 94f\n"
- "ld1 { v25.h }[0], [x26], #0x2\n"
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[2], [x26]\n"
+ "tbz x0, #1, 94f\n"
+ "ld1 { v25.h }[0], [x5], #0x2\n"
+ "tbz x0, #0, 95f\n"
+ "ld1 { v25.b }[2], [x5]\n"
"b 95f\n"
"94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 95f\n"
- "ld1 { v25.b }[0], [x26]\n"
+ "tbz x0, #0, 95f\n"
+ "ld1 { v25.b }[0], [x5]\n"
"95:" // Oddments: Load (5, 0): Bit 2: End
"usubl v25.8h, v25.8b, v9.8b\n"
- "ldr x23, [x25, #0xf8]\n"
- "smlal v7.4s, v25.4h, v0.4h\n"
- "add x23, x23, x10\n"
- "smlal2 v17.4s, v25.8h, v0.8h\n"
- "tbz x4, #2, 97f\n"
- "ld1 { v24.s }[0], [x23], #0x4\n"
- "tbz x4, #1, 96f\n"
- "ld1 { v24.h }[2], [x23], #0x2\n"
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[6], [x23]\n"
+ "ldr x25, [x20, #0xf8]\n"
+ "smlal v10.4s, v25.4h, v0.4h\n"
+ "smlal2 v7.4s, v25.8h, v0.8h\n"
+ "add x25, x25, x24\n"
+ "tbz x0, #2, 97f\n"
+ "ld1 { v24.s }[0], [x25], #0x4\n"
+ "tbz x0, #1, 96f\n"
+ "ld1 { v24.h }[2], [x25], #0x2\n"
+ "tbz x0, #0, 99f\n"
+ "ld1 { v24.b }[6], [x25]\n"
"b 99f\n"
"96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[4], [x23]\n"
+ "tbz x0, #0, 99f\n"
+ "ld1 { v24.b }[4], [x25]\n"
"b 99f\n"
"97:" // Oddments: Load (5, 1): Bit 2: Unset
- "tbz x4, #1, 98f\n"
- "ld1 { v24.h }[0], [x23], #0x2\n"
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[2], [x23]\n"
+ "tbz x0, #1, 98f\n"
+ "ld1 { v24.h }[0], [x25], #0x2\n"
+ "tbz x0, #0, 99f\n"
+ "ld1 { v24.b }[2], [x25]\n"
"b 99f\n"
"98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 99f\n"
- "ld1 { v24.b }[0], [x23]\n"
+ "tbz x0, #0, 99f\n"
+ "ld1 { v24.b }[0], [x25]\n"
"99:" // Oddments: Load (5, 1): Bit 2: End
+ "ldr d1, [x23, #0xa8]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ldr d1, [x3, #0xa8]\n"
- "smlal v8.4s, v24.4h, v0.4h\n"
- "ldr x22, [x25, #0x100]\n"
- "add x22, x22, x10\n"
- "smlal2 v5.4s, v24.8h, v0.8h\n"
"ssubl v1.8h, v1.8b, v14.8b\n"
+ "ldr x26, [x20, #0x100]\n"
+ "smlal v6.4s, v24.4h, v0.4h\n"
+ "smlal2 v5.4s, v24.8h, v0.8h\n"
+ "add x26, x26, x24\n"
"smlal v15.4s, v23.4h, v1.4h\n"
- "smlal2 v18.4s, v23.8h, v1.8h\n"
- "smlal v16.4s, v31.4h, v1.4h\n"
- "smlal2 v21.4s, v31.8h, v1.8h\n"
- "smlal v7.4s, v24.4h, v1.4h\n"
- "smlal2 v17.4s, v24.8h, v1.8h\n"
- "tbz x4, #2, 101f\n"
- "ld1 { v27.s }[0], [x22], #0x4\n"
- "tbz x4, #1, 100f\n"
- "ld1 { v27.h }[2], [x22], #0x2\n"
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[6], [x22]\n"
+ "smlal2 v16.4s, v23.8h, v1.8h\n"
+ "smlal v17.4s, v31.4h, v1.4h\n"
+ "smlal2 v8.4s, v31.8h, v1.8h\n"
+ "smlal v10.4s, v24.4h, v1.4h\n"
+ "smlal2 v7.4s, v24.8h, v1.8h\n"
+ "tbz x0, #2, 101f\n"
+ "ld1 { v27.s }[0], [x26], #0x4\n"
+ "tbz x0, #1, 100f\n"
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "tbz x0, #0, 103f\n"
+ "ld1 { v27.b }[6], [x26]\n"
"b 103f\n"
"100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[4], [x22]\n"
+ "tbz x0, #0, 103f\n"
+ "ld1 { v27.b }[4], [x26]\n"
"b 103f\n"
"101:" // Oddments: Load (5, 2): Bit 2: Unset
- "tbz x4, #1, 102f\n"
- "ld1 { v27.h }[0], [x22], #0x2\n"
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[2], [x22]\n"
+ "tbz x0, #1, 102f\n"
+ "ld1 { v27.h }[0], [x26], #0x2\n"
+ "tbz x0, #0, 103f\n"
+ "ld1 { v27.b }[2], [x26]\n"
"b 103f\n"
"102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 103f\n"
- "ld1 { v27.b }[0], [x22]\n"
+ "tbz x0, #0, 103f\n"
+ "ld1 { v27.b }[0], [x26]\n"
"103:" // Oddments: Load (5, 2): Bit 2: End
+ "ldr d2, [x23, #0xb0]\n"
"usubl v27.8h, v27.8b, v9.8b\n"
- "ldr d2, [x3, #0xb0]\n"
- "smlal v8.4s, v27.4h, v1.4h\n"
- "ldr x20, [x25, #0x108]\n"
- "add x20, x20, x10\n"
- "smlal2 v5.4s, v27.8h, v1.8h\n"
"ssubl v2.8h, v2.8b, v14.8b\n"
+ "ldr x12, [x20, #0x108]\n"
+ "smlal v6.4s, v27.4h, v1.4h\n"
+ "smlal2 v5.4s, v27.8h, v1.8h\n"
+ "add x12, x12, x24\n"
"smlal v15.4s, v31.4h, v2.4h\n"
- "smlal2 v18.4s, v31.8h, v2.8h\n"
- "smlal v16.4s, v30.4h, v2.4h\n"
- "smlal2 v21.4s, v30.8h, v2.8h\n"
- "smlal v7.4s, v27.4h, v2.4h\n"
- "smlal2 v17.4s, v27.8h, v2.8h\n"
- "tbz x4, #2, 105f\n"
- "ld1 { v25.s }[0], [x20], #0x4\n"
- "tbz x4, #1, 104f\n"
- "ld1 { v25.h }[2], [x20], #0x2\n"
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[6], [x20]\n"
+ "smlal2 v16.4s, v31.8h, v2.8h\n"
+ "smlal v17.4s, v30.4h, v2.4h\n"
+ "smlal2 v8.4s, v30.8h, v2.8h\n"
+ "smlal v10.4s, v27.4h, v2.4h\n"
+ "smlal2 v7.4s, v27.8h, v2.8h\n"
+ "tbz x0, #2, 105f\n"
+ "ld1 { v25.s }[0], [x12], #0x4\n"
+ "tbz x0, #1, 104f\n"
+ "ld1 { v25.h }[2], [x12], #0x2\n"
+ "tbz x0, #0, 107f\n"
+ "ld1 { v25.b }[6], [x12]\n"
"b 107f\n"
"104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[4], [x20]\n"
+ "tbz x0, #0, 107f\n"
+ "ld1 { v25.b }[4], [x12]\n"
"b 107f\n"
"105:" // Oddments: Load (5, 3): Bit 2: Unset
- "tbz x4, #1, 106f\n"
- "ld1 { v25.h }[0], [x20], #0x2\n"
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[2], [x20]\n"
+ "tbz x0, #1, 106f\n"
+ "ld1 { v25.h }[0], [x12], #0x2\n"
+ "tbz x0, #0, 107f\n"
+ "ld1 { v25.b }[2], [x12]\n"
"b 107f\n"
"106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 107f\n"
- "ld1 { v25.b }[0], [x20]\n"
+ "tbz x0, #0, 107f\n"
+ "ld1 { v25.b }[0], [x12]\n"
"107:" // Oddments: Load (5, 3): Bit 2: End
+ "ldr d3, [x23, #0xb8]\n"
"usubl v25.8h, v25.8b, v9.8b\n"
- "ldr d3, [x3, #0xb8]\n"
- "smlal v8.4s, v25.4h, v2.4h\n"
- "ldr x13, [x25, #0x110]\n"
- "add x13, x13, x10\n"
- "smlal2 v5.4s, v25.8h, v2.8h\n"
"ssubl v3.8h, v3.8b, v14.8b\n"
+ "ldr x14, [x20, #0x110]\n"
+ "smlal v6.4s, v25.4h, v2.4h\n"
+ "smlal2 v5.4s, v25.8h, v2.8h\n"
+ "add x14, x14, x24\n"
"smlal v15.4s, v30.4h, v3.4h\n"
- "smlal2 v18.4s, v30.8h, v3.8h\n"
- "smlal v16.4s, v28.4h, v3.4h\n"
- "smlal2 v21.4s, v28.8h, v3.8h\n"
- "smlal v7.4s, v25.4h, v3.4h\n"
- "smlal2 v17.4s, v25.8h, v3.8h\n"
- "tbz x4, #2, 109f\n"
- "ld1 { v24.s }[0], [x13], #0x4\n"
- "tbz x4, #1, 108f\n"
- "ld1 { v24.h }[2], [x13], #0x2\n"
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[6], [x13]\n"
+ "smlal2 v16.4s, v30.8h, v3.8h\n"
+ "smlal v17.4s, v28.4h, v3.4h\n"
+ "smlal2 v8.4s, v28.8h, v3.8h\n"
+ "smlal v10.4s, v25.4h, v3.4h\n"
+ "smlal2 v7.4s, v25.8h, v3.8h\n"
+ "tbz x0, #2, 109f\n"
+ "ld1 { v24.s }[0], [x14], #0x4\n"
+ "tbz x0, #1, 108f\n"
+ "ld1 { v24.h }[2], [x14], #0x2\n"
+ "tbz x0, #0, 111f\n"
+ "ld1 { v24.b }[6], [x14]\n"
"b 111f\n"
"108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[4], [x13]\n"
+ "tbz x0, #0, 111f\n"
+ "ld1 { v24.b }[4], [x14]\n"
"b 111f\n"
"109:" // Oddments: Load (5, 4): Bit 2: Unset
- "tbz x4, #1, 110f\n"
- "ld1 { v24.h }[0], [x13], #0x2\n"
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[2], [x13]\n"
+ "tbz x0, #1, 110f\n"
+ "ld1 { v24.h }[0], [x14], #0x2\n"
+ "tbz x0, #0, 111f\n"
+ "ld1 { v24.b }[2], [x14]\n"
"b 111f\n"
"110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 111f\n"
- "ld1 { v24.b }[0], [x13]\n"
+ "tbz x0, #0, 111f\n"
+ "ld1 { v24.b }[0], [x14]\n"
"111:" // Oddments: Load (5, 4): Bit 2: End
+ "ldr d4, [x23, #0xc0]\n"
"usubl v24.8h, v24.8b, v9.8b\n"
- "ldr d4, [x3, #0xc0]\n"
- "smlal v8.4s, v24.4h, v3.4h\n"
- "ldr x21, [x25, #0x118]\n"
- "add x21, x21, x10\n"
- "smlal2 v5.4s, v24.8h, v3.8h\n"
"ssubl v4.8h, v4.8b, v14.8b\n"
+ "ldr x21, [x20, #0x118]\n"
+ "smlal v6.4s, v24.4h, v3.4h\n"
+ "smlal2 v5.4s, v24.8h, v3.8h\n"
+ "add x21, x21, x24\n"
"smlal v15.4s, v28.4h, v4.4h\n"
- "smlal2 v18.4s, v28.8h, v4.8h\n"
- "smlal v16.4s, v26.4h, v4.4h\n"
- "smlal2 v21.4s, v26.8h, v4.8h\n"
- "smlal v7.4s, v24.4h, v4.4h\n"
- "smlal2 v17.4s, v24.8h, v4.8h\n"
- "tbz x4, #2, 113f\n"
+ "smlal2 v16.4s, v28.8h, v4.8h\n"
+ "smlal v17.4s, v26.4h, v4.4h\n"
+ "smlal2 v8.4s, v26.8h, v4.8h\n"
+ "smlal v10.4s, v24.4h, v4.4h\n"
+ "smlal2 v7.4s, v24.8h, v4.8h\n"
+ "tbz x0, #2, 113f\n"
"ld1 { v27.s }[0], [x21], #0x4\n"
- "tbz x4, #1, 112f\n"
+ "tbz x0, #1, 112f\n"
"ld1 { v27.h }[2], [x21], #0x2\n"
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[6], [x21]\n"
"b 115f\n"
"112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[4], [x21]\n"
"b 115f\n"
"113:" // Oddments: Load (5, 5): Bit 2: Unset
- "tbz x4, #1, 114f\n"
+ "tbz x0, #1, 114f\n"
"ld1 { v27.h }[0], [x21], #0x2\n"
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[2], [x21]\n"
"b 115f\n"
"114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[0], [x21]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v27.4h, v4.4h\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
"smlal2 v5.4s, v27.8h, v4.8h\n"
- "tbz x4, #2, 117f\n"
- "ld1 { v6.4s }, [x2], #0x10\n"
- "ld1 { v19.4s }, [x5], #0x10\n"
- "tbz x4, #1, 116f\n"
- "ld1 { v20.d }[0], [x2], #0x8\n"
- "ld1 { v12.d }[0], [x5], #0x8\n"
- "tbz x4, #0, 119f\n"
- "ld1 { v20.s }[2], [x2]\n"
- "ld1 { v12.s }[2], [x5]\n"
+ "tbz x0, #2, 117f\n"
+ "ld1 { v12.4s }, [x10], #0x10\n"
+ "ld1 { v19.4s }, [x1], #0x10\n"
+ "tbz x0, #1, 116f\n"
+ "ld1 { v20.d }[0], [x10], #0x8\n"
+ "ld1 { v29.d }[0], [x1], #0x8\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v20.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x1]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x4, #0, 119f\n"
- "ld1 { v20.s }[0], [x2]\n"
- "ld1 { v12.s }[0], [x5]\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v20.s }[0], [x10]\n"
+ "ld1 { v29.s }[0], [x1]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x4, #1, 118f\n"
- "ld1 { v6.d }[0], [x2], #0x8\n"
- "ld1 { v19.d }[0], [x5], #0x8\n"
- "tbz x4, #0, 119f\n"
- "ld1 { v6.s }[2], [x2]\n"
- "ld1 { v19.s }[2], [x5]\n"
+ "tbz x0, #1, 118f\n"
+ "ld1 { v12.d }[0], [x10], #0x8\n"
+ "ld1 { v19.d }[0], [x1], #0x8\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v12.s }[2], [x10]\n"
+ "ld1 { v19.s }[2], [x1]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 119f\n"
- "ld1 { v6.s }[0], [x2]\n"
- "ld1 { v19.s }[0], [x5]\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v12.s }[0], [x10]\n"
+ "ld1 { v19.s }[0], [x1]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v6.4s\n"
- "add x17, x17, x1\n"
- "sqrdmulh v18.4s, v18.4s, v20.4s\n"
- "add x16, x16, x1\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "add x6, x6, x1\n"
- "sqrdmulh v21.4s, v21.4s, v20.4s\n"
- "add x8, x8, x1\n"
- "sqrdmulh v7.4s, v7.4s, v6.4s\n"
- "and v28.16b, v15.16b, v19.16b\n"
- "and v26.16b, v18.16b, v12.16b\n"
- "and v29.16b, v16.16b, v19.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqdmulh v15.4s, v15.4s, v12.4s\n"
+ "sqdmulh v17.4s, v17.4s, v12.4s\n"
+ "add x16, x16, x22\n"
+ "add x8, x8, x22\n"
+ "sqdmulh v10.4s, v10.4s, v12.4s\n"
+ "sqdmulh v6.4s, v6.4s, v12.4s\n"
+ "add x4, x4, x22\n"
+ "add x7, x7, x22\n"
+ "and v23.16b, v15.16b, v19.16b\n"
+ "sqdmulh v16.4s, v16.4s, v20.4s\n"
+ "and v22.16b, v17.16b, v19.16b\n"
+ "sqdmulh v8.4s, v8.4s, v20.4s\n"
+ "and v21.16b, v10.16b, v19.16b\n"
+ "sqdmulh v7.4s, v7.4s, v20.4s\n"
+ "and v26.16b, v6.16b, v19.16b\n"
+ "sqdmulh v5.4s, v5.4s, v20.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v2.16b, v8.16b, v29.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v7.16b, v29.16b\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v28.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "and v4.16b, v21.16b, v12.16b\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "srshl v18.4s, v18.4s, v12.4s\n"
- "srshl v16.4s, v16.4s, v19.4s\n"
+ "and v25.16b, v5.16b, v29.16b\n"
+ "sqadd v15.4s, v15.4s, v23.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "add v15.4s, v15.4s, v10.4s\n"
- "add v18.4s, v18.4s, v10.4s\n"
- "add v16.4s, v16.4s, v10.4s\n"
- "smin v15.4s, v15.4s, v13.4s\n"
- "smin v18.4s, v18.4s, v13.4s\n"
- "smin v16.4s, v16.4s, v13.4s\n"
- "smax v15.4s, v15.4s, v11.4s\n"
- "smax v18.4s, v18.4s, v11.4s\n"
- "smax v16.4s, v16.4s, v11.4s\n"
- "sqadd v21.4s, v21.4s, v4.4s\n"
- "uzp1 v15.16b, v15.16b, v18.16b\n"
- "and v25.16b, v7.16b, v19.16b\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "srshl v21.4s, v21.4s, v12.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
"sshr v25.4s, v25.4s, #0x1f\n"
- "sqrdmulh v17.4s, v17.4s, v20.4s\n"
- "sqrdmulh v8.4s, v8.4s, v6.4s\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "sqadd v7.4s, v7.4s, v25.4s\n"
- "and v31.16b, v17.16b, v12.16b\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "and v24.16b, v8.16b, v19.16b\n"
- "srshl v7.4s, v7.4s, v19.4s\n"
- "smax v21.4s, v21.4s, v11.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "uzp1 v16.16b, v16.16b, v21.16b\n"
- "add v7.4s, v7.4s, v10.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "sqadd v17.4s, v17.4s, v31.4s\n"
- "smin v7.4s, v7.4s, v13.4s\n"
- "sqadd v8.4s, v8.4s, v24.4s\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "smax v7.4s, v7.4s, v11.4s\n"
- "srshl v17.4s, v17.4s, v12.4s\n"
- "srshl v8.4s, v8.4s, v19.4s\n"
- "and v1.16b, v5.16b, v12.16b\n"
- "add v17.4s, v17.4s, v10.4s\n"
- "add v8.4s, v8.4s, v10.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "smin v17.4s, v17.4s, v13.4s\n"
- "smin v8.4s, v8.4s, v13.4s\n"
- "sqadd v5.4s, v5.4s, v1.4s\n"
- "smax v17.4s, v17.4s, v11.4s\n"
- "smax v8.4s, v8.4s, v11.4s\n"
- "srshl v5.4s, v5.4s, v12.4s\n"
- "uzp1 v7.16b, v7.16b, v17.16b\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "smax v5.4s, v5.4s, v11.4s\n"
- "uzp1 v8.16b, v8.16b, v5.16b\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "tbz x4, #2, 121f\n"
- "st1 { v15.s }[0], [x17], #0x4\n"
- "st1 { v16.s }[0], [x16], #0x4\n"
- "st1 { v7.s }[0], [x6], #0x4\n"
- "st1 { v8.s }[0], [x8], #0x4\n"
- "tbz x4, #1, 120f\n"
- "st1 { v15.h }[2], [x17], #0x2\n"
- "st1 { v16.h }[2], [x16], #0x2\n"
- "st1 { v7.h }[2], [x6], #0x2\n"
- "st1 { v8.h }[2], [x8], #0x2\n"
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[6], [x17], #0x1\n"
- "st1 { v16.b }[6], [x16], #0x1\n"
- "st1 { v7.b }[6], [x6], #0x1\n"
- "st1 { v8.b }[6], [x8], #0x1\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v17.4s, v17.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v2.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v7.4s, v7.4s, v3.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "sqadd v5.4s, v5.4s, v25.4s\n"
+ "srshl v16.4s, v16.4s, v29.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v7.4s, v7.4s, v29.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v15.8h, v16.4s\n"
+ "sqxtn2 v17.8h, v8.4s\n"
+ "sqxtn2 v10.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v15.8h, v15.8h, v18.8h\n"
+ "sqadd v17.8h, v17.8h, v18.8h\n"
+ "sqadd v10.8h, v10.8h, v18.8h\n"
+ "sqadd v6.8h, v6.8h, v18.8h\n"
+ "smax v15.8h, v15.8h, v11.8h\n"
+ "smax v17.8h, v17.8h, v11.8h\n"
+ "smax v10.8h, v10.8h, v11.8h\n"
+ "smax v6.8h, v6.8h, v11.8h\n"
+ "smin v15.8h, v15.8h, v13.8h\n"
+ "smin v17.8h, v17.8h, v13.8h\n"
+ "smin v10.8h, v10.8h, v13.8h\n"
+ "smin v6.8h, v6.8h, v13.8h\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "tbz x0, #2, 121f\n"
+ "st1 { v15.s }[0], [x16], #0x4\n"
+ "st1 { v17.s }[0], [x8], #0x4\n"
+ "st1 { v10.s }[0], [x4], #0x4\n"
+ "st1 { v6.s }[0], [x7], #0x4\n"
+ "tbz x0, #1, 120f\n"
+ "st1 { v15.h }[2], [x16], #0x2\n"
+ "st1 { v17.h }[2], [x8], #0x2\n"
+ "st1 { v10.h }[2], [x4], #0x2\n"
+ "st1 { v6.h }[2], [x7], #0x2\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[6], [x16], #0x1\n"
+ "st1 { v17.b }[6], [x8], #0x1\n"
+ "st1 { v10.b }[6], [x4], #0x1\n"
+ "st1 { v6.b }[6], [x7], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[4], [x17], #0x1\n"
- "st1 { v16.b }[4], [x16], #0x1\n"
- "st1 { v7.b }[4], [x6], #0x1\n"
- "st1 { v8.b }[4], [x8], #0x1\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[4], [x16], #0x1\n"
+ "st1 { v17.b }[4], [x8], #0x1\n"
+ "st1 { v10.b }[4], [x4], #0x1\n"
+ "st1 { v6.b }[4], [x7], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
- "tbz x4, #1, 122f\n"
- "st1 { v15.h }[0], [x17], #0x2\n"
- "st1 { v16.h }[0], [x16], #0x2\n"
- "st1 { v7.h }[0], [x6], #0x2\n"
- "st1 { v8.h }[0], [x8], #0x2\n"
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[2], [x17], #0x1\n"
- "st1 { v16.b }[2], [x16], #0x1\n"
- "st1 { v7.b }[2], [x6], #0x1\n"
- "st1 { v8.b }[2], [x8], #0x1\n"
+ "tbz x0, #1, 122f\n"
+ "st1 { v15.h }[0], [x16], #0x2\n"
+ "st1 { v17.h }[0], [x8], #0x2\n"
+ "st1 { v10.h }[0], [x4], #0x2\n"
+ "st1 { v6.h }[0], [x7], #0x2\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[2], [x16], #0x1\n"
+ "st1 { v17.b }[2], [x8], #0x1\n"
+ "st1 { v10.b }[2], [x4], #0x1\n"
+ "st1 { v6.b }[2], [x7], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[0], [x17], #0x1\n"
- "st1 { v16.b }[0], [x16], #0x1\n"
- "st1 { v7.b }[0], [x6], #0x1\n"
- "st1 { v8.b }[0], [x8], #0x1\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[0], [x16], #0x1\n"
+ "st1 { v17.b }[0], [x8], #0x1\n"
+ "st1 { v10.b }[0], [x4], #0x1\n"
+ "st1 { v6.b }[0], [x7], #0x1\n"
"123:" // Oddments: Bit 2: End
-
"124:" // End
-
:
: [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
index 2bfeac0556..6bdcca115c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,28 +28,23 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
-struct a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst
+class a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef int8_t weight_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- constexpr static unsigned int n_output_points = 9;
+ KernelType kernel = a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl;
- kern_type kernel = a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl;
+ public:
+ a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>(9, arm_gemm::VLType::None) {}
- a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+ KernelType get_kernel() const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+#endif // defined(__aarch64__)
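
This header shows the template applied to every kernel below: the old plain struct of typedefs (bias_type, input_type, and so on) and static constants becomes a class deriving from a common strategy base, with the constants (here 9 output points and VLType::None) passed to the base constructor and the kernel function pointer exposed through a virtual get_kernel() override instead of a public field. The following sketch shows that shape with a hypothetical, simplified base standing in for GenericDepthfirstKernelStrategy; the real base also carries the arm_gemm::Requantize32 argument in the kernel signature.

#include <cstdint>

namespace sketch {

enum class VLType { None, SVE };

// Simplified stand-in for GenericDepthfirstKernelStrategy.
template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
class GenericKernelStrategy
{
    unsigned int m_n_output_points;
    VLType m_vl_type;
public:
    using KernelType = void (*)(const TInput *const *, TOutput *const *,
                                const void *, unsigned int, unsigned int);
    GenericKernelStrategy(unsigned int n_output_points, VLType vl_type)
        : m_n_output_points(n_output_points), m_vl_type(vl_type) {}
    virtual ~GenericKernelStrategy() = default;
    unsigned int get_n_output_points() const { return m_n_output_points; }
    VLType get_vl_type() const { return m_vl_type; }
    virtual KernelType get_kernel() const = 0;  // replaces the public 'kernel' field
};

// Dummy kernel body so the sketch is self-contained.
void example_kernel_impl(const uint8_t *const *, uint8_t *const *,
                         const void *, unsigned int, unsigned int) {}

class example_strategy : public GenericKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
    KernelType kernel = example_kernel_impl;
public:
    example_strategy()
        : GenericKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>(9, VLType::None) {}
    KernelType get_kernel() const override { return kernel; }
};

} // namespace sketch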
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 802030573e..394df363da 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,31 +28,25 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
-struct a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+struct a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef int8_t weight_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- constexpr static unsigned int output_rows(void) { return 2; };
- constexpr static unsigned int output_cols(void) { return 8; };
-
- constexpr static unsigned int output_col_regs(void) { return 2; };
-
- kern_type kernel = a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
-
- a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+ a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 1cfea9dcbd..1c1fb25e1f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
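
The same mechanical rewrite repeats for every per-tile strategy from here on: the type-erased indirect_kernel/direct_kernel wrappers (and their reinterpret_casts) move into the typed DepthwiseDepthfirstStrategy base, leaving only two typed getters, and the per-class input_rows/input_cols constants and get_* boilerplate disappear because the base can derive them from the (output_size, kernel_size, stride) constructor arguments. The guards also change from "#if __aarch64__" to "#if defined(__aarch64__)", which behaves the same when the macro is defined as 1 but is robust (and -Wundef clean) when it is absent. The sketch below shows why the deleted input-tile constants were redundant, under the assumption of square tiles and strides.

// Input tile extent implied by the constructor arguments the base now takes.
constexpr unsigned int input_extent(unsigned int output, unsigned int kernel,
                                    unsigned int stride)
{
    return (output - 1) * stride + kernel;
}

static_assert(input_extent(2, 3, 1) == 4, "3x3/s1 -> 2x2 output reads a 4x4 tile");
static_assert(input_extent(3, 3, 1) == 5, "3x3/s1 -> 3x3 output reads a 5x5 tile");
static_assert(input_extent(2, 3, 2) == 5, "3x3/s2 -> 2x2 output reads a 5x5 tile");
static_assert(input_extent(2, 5, 1) == 6, "5x5/s1 -> 2x2 output reads a 6x6 tile");

These values match the input_rows/input_cols constants deleted in each of the following hunks.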
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index af8af1840a..d49b14eeaf 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 3;
constexpr static unsigned int output_cols = 3;
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+ sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(3, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 60234c8a86..ac6ae284fd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 4;
constexpr static unsigned int output_cols = 4;
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
-
- sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+ sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(4, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 5968309927..82173ee71f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+ sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 2) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 4a9bd33a1e..f5d4189a47 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
@@ -59,63 +56,16 @@ class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
-
- sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 5, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index e07e6314f8..d7b1de2062 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
private:
- typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
- indirect_kern_type m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
- direct_kern_type m_direct_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
public:
- typedef float return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const float *const *>(input_ptrs),
- reinterpret_cast<float *const *>(outptrs),
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const float *>(inptr), ld_input_row, ld_input_col,
- static_cast<float *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index eb9de9fa01..41ad193364 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
private:
- typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
- indirect_kern_type m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
- direct_kern_type m_direct_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
public:
- typedef float return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 3;
constexpr static unsigned int output_cols = 3;
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+ sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<float, float, float, float>(3, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const float *const *>(input_ptrs),
- reinterpret_cast<float *const *>(outptrs),
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const float *>(inptr), ld_input_row, ld_input_col,
- static_cast<float *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index d7be9b122e..6073b2ba7d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
private:
- typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
- indirect_kern_type m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
- direct_kern_type m_direct_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
public:
- typedef float return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 4;
constexpr static unsigned int output_cols = 4;
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
-
- sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+ sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<float, float, float, float>(4, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const float *const *>(input_ptrs),
- reinterpret_cast<float *const *>(outptrs),
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const float *>(inptr), ld_input_row, ld_input_col,
- static_cast<float *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 28d44d0b5f..17ac74e223 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
private:
- typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
- indirect_kern_type m_indirect_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
- direct_kern_type m_direct_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
public:
- typedef float return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+ sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 2) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const float *const *>(input_ptrs),
- reinterpret_cast<float *const *>(outptrs),
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const float *>(inptr), ld_input_row, ld_input_col,
- static_cast<float *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 751874ffbf..2449c96637 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
private:
- typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
- indirect_kern_type m_indirect_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
- direct_kern_type m_direct_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
public:
- typedef float return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = float;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
@@ -59,63 +56,16 @@ class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
-
- sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 5, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const float *const *>(input_ptrs),
- reinterpret_cast<float *const *>(outptrs),
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const float *>(inptr), ld_input_row, ld_input_col,
- static_cast<float *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const float *>(activation_min),
- *static_cast<const float *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
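
The recurring refactor in these headers replaces a dozen per-class getter overrides (get_kernel_rows, get_output_cols, get_input_rows, ...) with a single parameterised base-class constructor such as Parent(2, 5, 1). A minimal sketch of the pattern with hypothetical, simplified names; the assumption that the base derives the input tile as kernel + (output - 1) * stride matches the constants deleted here (input_rows 6 = 5 + (2 - 1) * 1):

    // Sketch only: simplified stand-in for DepthwiseDepthfirstStrategy.
    class StrategyBase
    {
        unsigned int m_output_rows, m_kernel_rows, m_stride_rows;

    public:
        StrategyBase(unsigned int output_rows, unsigned int kernel_rows, unsigned int stride_rows)
            : m_output_rows(output_rows), m_kernel_rows(kernel_rows), m_stride_rows(stride_rows) {}
        virtual ~StrategyBase() = default;

        unsigned int get_output_rows() const { return m_output_rows; }
        unsigned int get_kernel_rows() const { return m_kernel_rows; }
        unsigned int get_stride_rows() const { return m_stride_rows; }

        // The input tile follows from the other three values, so the
        // per-kernel input_rows/input_cols constants can be deleted.
        unsigned int get_input_rows() const
        {
            return m_kernel_rows + (m_output_rows - 1) * m_stride_rows;
        }
    };

    // A concrete kernel now only states its numbers, mirroring Parent(2, 5, 1):
    struct Example5x5S1 : StrategyBase
    {
        Example5x5S1() : StrategyBase(2, 5, 1) {}
    };
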
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
index bd071d370c..62faca97a9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,22 +35,14 @@ namespace depthwise {
void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
-struct sve_fp32_nhwc_generic_output9_mla_depthfirst
+class sve_fp32_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<float, float, float, float>
{
- typedef float bias_type;
- typedef float input_type;
- typedef float weight_type;
- typedef float return_type;
+ KernelType kernel = sve_fp32_nhwc_generic_output9_mla_depthfirst_impl;
- typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+ public:
+ sve_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<float, float, float, float>(9, arm_gemm::VLType::SVE) {}
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- constexpr static unsigned int n_output_points = 9;
-
- kern_type kernel = sve_fp32_nhwc_generic_output9_mla_depthfirst_impl;
-
- sve_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+ KernelType get_kernel() const override { return kernel; }
};
} // namespace depthwise
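
A second recurring simplification, visible in the generic kernels above: the per-struct typedefs (bias_type, input_type, kern_type, ...) give way to a templated base class that carries the types, and the kernel pointer is exposed through a virtual get_kernel() override rather than a bare public member with no interface. A sketch with hypothetical names (KernelFn, KernelStrategyBase and ExampleStrategy are illustrations, not library types):

    // The base fixes the function-pointer type once ...
    using KernelFn = void (*)(const float *const *, float *const *,
                              const void *, const void *,
                              unsigned int, unsigned int, float, float);

    struct KernelStrategyBase
    {
        virtual ~KernelStrategyBase() = default;
        virtual KernelFn get_kernel() const = 0;
    };

    // ... and a concrete strategy just hands back its implementation.
    void example_impl(const float *const *, float *const *, const void *,
                      const void *, unsigned int, unsigned int, float, float) {}

    struct ExampleStrategy : KernelStrategyBase
    {
        KernelFn kernel = example_impl;
        KernelFn get_kernel() const override { return kernel; }
    };
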
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
index 563f0fc59f..8640343747 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,33 +35,24 @@ namespace depthwise {
void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
-struct sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst
+struct sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
{
- typedef float bias_type;
- typedef float input_type;
- typedef float weight_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
+ using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- constexpr static unsigned int output_rows = 3;
- constexpr static unsigned int output_cols = 3;
-
- constexpr static unsigned int input_rows = 7;
- constexpr static unsigned int input_cols = 7;
- constexpr static unsigned int input_col_quads = 2;
+ sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *)
+ : Parent(3, 3, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
- kern_type kernel = sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
- sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
index e9378c2a12..a4ee87cce2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,33 +35,24 @@ namespace depthwise {
void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
-struct sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst
+struct sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
{
- typedef float bias_type;
- typedef float input_type;
- typedef float weight_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
+ using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 4;
-
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 8;
- constexpr static unsigned int input_col_quads = 2;
+ sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
- kern_type kernel = sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
- sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
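
The deleted input_rows / input_cols constants in these multiplier strategies are not lost information: they follow from the kernel size, the stride, and the output tile now passed to the Parent constructor, via input = (output - 1) * stride + kernel. A quick compile-time check of that derivation against the values removed above (tile() is an illustrative helper, not library code):

    constexpr unsigned int tile(unsigned int output, unsigned int stride, unsigned int kernel)
    {
        return (output - 1) * stride + kernel;
    }

    // 3x3 kernel, stride 2, 3x3 output tile -> 7x7 input (constants removed above)
    static_assert(tile(3, 2, 3) == 7, "input rows/cols for the 3x3/s2 kernel");

    // 5x5 kernel, stride 1, 2x4 output tile -> 6x8 input
    static_assert(tile(2, 1, 5) == 6, "input_rows for the 5x5/s1 kernel");
    static_assert(tile(4, 1, 5) == 8, "input_cols for the 5x5/s1 kernel");
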
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 6849e562bc..e1f0b50d89 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,35 +28,25 @@
#pragma once
-#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
-struct sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+struct sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>
{
- typedef float bias_type;
- typedef float input_type;
- typedef float weight_type;
- typedef float return_type;
-
- typedef void (*kern_type)(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- constexpr static unsigned int output_rows(void) { return 2; };
- constexpr static unsigned int output_cols(void) { return 8; };
-
- constexpr static unsigned int output_col_regs(void) { return 2; };
-
- kern_type kernel = sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
-
- sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+ using Parent = GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>;
+ sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::SVE)
+ {
+ }
+ Parent::KernelType kernel = sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 39974fde88..4e2ee43374 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,39 +34,40 @@
namespace arm_conv {
namespace depthwise {
-void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
-struct sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+class sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_dot::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_dot::get_packed_size;
-
- kern_type kernel = sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
-
- sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+ sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_sve_s8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_sve_s8q_3x3_dot::pack_parameters(
+ args.input_channels, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
};
} // namespace depthwise
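
For the quantized dot-product kernels, the interleave helpers stop being constexpr static function pointers (pack_parameters / get_packed_size) and become virtual methods on the strategy, so callers can size and fill the packed-parameter buffer without knowing the concrete kernel type. A simplified sketch of the shape of that interface (Args, PackingStrategy and the interleave_stub namespace are illustrations standing in for the library's types):

    #include <cstddef>
    #include <cstdint>

    struct Args { unsigned int input_channels; };

    namespace interleave_stub  // stands in for e.g. interleave_sve_s8q_3x3_dot
    {
        inline size_t get_packed_size(const Args &) { return 0; /* stub */ }
        inline void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *) {}
    }

    struct PackingStrategy
    {
        virtual ~PackingStrategy() = default;
        virtual size_t get_storage_size(const Args &) const = 0;
        virtual void pack_parameters(const Args &, void *buffer,
                                     const void *biases, const void *weights) const = 0;
    };

    struct DotStrategy : PackingStrategy
    {
        size_t get_storage_size(const Args &args) const override
        {
            return interleave_stub::get_packed_size(args);
        }

        void pack_parameters(const Args &args, void *buffer,
                             const void *biases, const void *weights) const override
        {
            // The virtual boundary keeps the casts in one place, next to
            // the helper that knows the packed layout.
            interleave_stub::pack_parameters(args.input_channels, buffer,
                                             static_cast<const int32_t *>(biases),
                                             static_cast<const int8_t *>(weights));
        }
    };
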
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 8e9e5f4aeb..800803770a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,15 @@
namespace arm_conv {
namespace depthwise {
-void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *params,
+ const int32_t *, // Bias, should be wrapped into the parameters
+ const arm_gemm::Requantize32& qp,
+ const int32_t *, const int32_t *, // Requant parameters, also wrapped
+ int8_t *const *const outptrs
+)
{
__asm__ __volatile__(
"ldp x11, x10, [%x[inptrs], #0x0]\n"
@@ -446,7 +454,7 @@ void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *cons
"b.any 1b\n"
"addvl SP, SP, #8\n"
: [params] "+&r" (params)
- : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
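
The impl signature change above is part of the same move: bias and per-channel requantisation arrays now travel inside the packed params blob, but the new signature keeps (unnamed) slots for them so that every quantized depthwise kernel shares one function-pointer type. A sketch of that convention with simplified stand-in types (Requantize32Stub and packed_kernel are illustrations, not library code):

    #include <cstdint>

    struct Requantize32Stub { int32_t minval, maxval; };  // stand-in for arm_gemm::Requantize32

    // One pointer type for every quantized kernel, packed or not:
    using QuantKernelFn = void (*)(unsigned int,                      // n_channels
                                   const int8_t *const *,             // input pointers
                                   const int8_t *,                    // packed parameters
                                   const int32_t *,                   // bias
                                   const Requantize32Stub &,          // quantisation constants
                                   const int32_t *, const int32_t *,  // per-channel requant data
                                   int8_t *const *);                  // output pointers

    // A kernel whose bias and requant data were interleaved into `params`
    // simply leaves those parameters unnamed, exactly as in the diff above.
    void packed_kernel(unsigned int, const int8_t *const *, const int8_t *params,
                       const int32_t *,                    // bias: wrapped into params
                       const Requantize32Stub &,
                       const int32_t *, const int32_t *,   // requant: also wrapped
                       int8_t *const *)
    {
        (void)params;  // the real kernel walks the packed blob here
    }

    QuantKernelFn fn = packed_kernel;  // fits the shared dispatch type
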
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index f788829572..3e9765165c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,47 +29,35 @@
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
-struct sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+class sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
- constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size;
+ sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
- kern_type kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
- sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
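
get_accumulator_depth_vl() is new in these mla strategies; returning 2 presumably tells the scheduling layer that the kernel keeps two vector-lengths of accumulators live per pass, which would feed into channel blocking. That reading is an assumption, and the fragment below only illustrates how such a hint could be consumed (channel_step and its parameters are hypothetical, not the library's actual scheduling code):

    // Hypothetical consumer of the accumulator-depth hint.
    unsigned int channel_step(unsigned int vl_bytes,           // SVE vector length in bytes
                              unsigned int bytes_per_element,  // e.g. 4 for int32 accumulators
                              unsigned int accumulator_depth_vl)
    {
        const unsigned int lanes = vl_bytes / bytes_per_element;
        return lanes * accumulator_depth_vl;  // channels covered per kernel pass
    }
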
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 87387960f1..3583308357 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -415,4 +415,4 @@ void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 5c2b4f6f53..78bcd1407f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,47 +29,35 @@
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
-struct sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+class sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
- constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size;
+ sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
- kern_type kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
- sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index b4a1026aaa..ba8c1fdb8d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -456,4 +456,4 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 948c5ad2e7..41ecd520ae 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,47 +29,35 @@
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
-struct sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+class sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
- constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_5x5_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_5x5_mla::get_packed_size;
+ sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
- kern_type kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
- sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 565c145f92..4733c89199 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -657,4 +657,4 @@ void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
index 176c4f878e..2e8c2019db 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,33 +35,24 @@ namespace depthwise {
void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-struct sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+struct sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 4;
-
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 9;
- constexpr static unsigned int input_col_quads = 1;
+ sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
- kern_type kernel = sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
- sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
index 10eee34d62..4874fb9a77 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,33 +35,24 @@ namespace depthwise {
void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-struct sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+struct sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
+ using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>;
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 4;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 8;
- constexpr static unsigned int input_cols = 6;
- constexpr static unsigned int input_col_quads = 1;
+ sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
- kern_type kernel = sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
- sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index b5c6e983ae..0d185fcafc 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,39 +34,40 @@
namespace arm_conv {
namespace depthwise {
-void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
-struct sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst
+class sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef int8_t input_type;
- typedef int8_t weight_type;
- typedef int8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_dot::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_dot::get_packed_size;
-
- kern_type kernel = sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
-
- sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+ sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_sve_s8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_sve_s8q_3x3_dot::pack_parameters(
+ args.input_channels, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 095c1de8f2..391e98b561 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,15 @@
namespace arm_conv {
namespace depthwise {
-void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
+ const unsigned int n_channels,
+ const int8_t *const *const inptrs,
+ const int8_t *params,
+ const int32_t *, // Bias, should be wrapped into the parameters
+ const arm_gemm::Requantize32& qp,
+ const int32_t *, const int32_t *, // Requant parameters, also wrapped
+ int8_t *const *const outptrs
+)
{
__asm__ __volatile__(
"ldp x11, x10, [%x[inptrs], #0x0]\n"
@@ -377,7 +385,7 @@ void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *con
"b.any 1b\n"
"addvl SP, SP, #8\n"
: [params] "+&r" (params)
- : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index a087e801dc..648b2da163 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,39 +34,40 @@
namespace arm_conv {
namespace depthwise {
-void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *);
-struct sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+class sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef uint32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- typedef void (*kern_type)(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_dot::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_dot::get_packed_size;
-
- kern_type kernel = sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
-
- sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+ sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+ Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleave_sve_u8q_3x3_dot::get_packed_size(args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleave_sve_u8q_3x3_dot::pack_parameters(
+ args.input_channels, buffer, reinterpret_cast<const int32_t *>(biases),
+ reinterpret_cast<const uint8_t *>(weights), qp, ld_weight_col, ld_weight_row
+ );
+ }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
index 0d4b9e6687..440f57ed00 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,15 @@
namespace arm_conv {
namespace depthwise {
-void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *const inptrs, uint8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp)
+void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(
+ const unsigned int n_channels,
+ const uint8_t *const *const inptrs,
+ const uint8_t *params,
+ const int32_t *, // Bias, should be wrapped into the parameters
+ const arm_gemm::Requantize32& qp,
+ const int32_t *, const int32_t *, // Requant parameters, also wrapped
+ uint8_t *const *const outptrs
+)
{
__asm__ __volatile__(
"ldp x11, x10, [%x[inptrs], #0x0]\n"
@@ -446,7 +454,7 @@ void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *con
"b.any 1b\n"
"addvl SP, SP, #8\n"
: [params] "+&r" (params)
- : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
+ : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp)
: "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
);
}
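
With n_channels narrowed from uint64_t to unsigned int, the asm input operands in these files gain a cast to long unsigned int. For a 32-bit value the "r" constraint only guarantees the low 32 bits of the allocated register, so an asm body that reads the full x-register could observe junk in the upper half; widening the operand makes the compiler materialise a zero-extended 64-bit value. A minimal illustration of the hazard the cast avoids (read_as_x_register is a made-up example, not library code):

    #include <cstdint>

    uint64_t read_as_x_register(unsigned int n)
    {
        uint64_t out;
        __asm__ volatile("mov %x[out], %x[n]\n"           // consumes all 64 bits of n's register
                         : [out] "=r"(out)
                         : [n] "r"((unsigned long) n));   // cast guarantees zero-extension
        return out;
    }
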
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index b524fd7c93..1cf20ef721 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,37 +36,25 @@ namespace depthwise {
void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
-struct sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+class sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
- constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_mla::get_packed_size;
+ sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
- kern_type kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
- sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 52dc4683ad..7bfa5fc4c7 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -415,4 +415,4 @@ void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 981864270d..a794095c6f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,47 +29,35 @@
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
-struct sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+class sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
- constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_mla::get_packed_size;
+ sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
- kern_type kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
- sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 34ba8ece12..e1b2d257b0 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -456,4 +456,4 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index b1b16c55d3..ac0a00b245 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,47 +29,35 @@
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
-struct sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+class sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
- constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_5x5_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_5x5_mla::get_packed_size;
+ sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
- kern_type kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
- sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 441da6da7a..0b2182f995 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -657,4 +657,4 @@ void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
index dbf70c3f8e..81c954a11b 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,33 +35,24 @@ namespace depthwise {
void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-struct sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst
+struct sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef uint32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 4;
-
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 9;
- constexpr static unsigned int input_col_quads = 1;
+ sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *)
+ : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
- kern_type kernel = sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
- sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
index 90fefdcda3..e7173de65a 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,33 +35,24 @@ namespace depthwise {
void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-struct sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst
+struct sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>
{
- typedef uint32_t bias_type;
- typedef uint8_t input_type;
- typedef uint8_t weight_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
+ using Parent = DepthfirstMultiplierStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 4;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 8;
- constexpr static unsigned int input_cols = 6;
- constexpr static unsigned int input_col_quads = 1;
+ sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *)
+ : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols)
+ {
+ }
- kern_type kernel = sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
- sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {}
+ Parent::KernelType kernel = sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 8ab2e5ba2a..3d475daf72 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,47 +29,35 @@
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
-struct sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst
+class sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef int8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
- constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size;
+ sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
- kern_type kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl;
- sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
index 4b9be8f3e3..dc8fad95fa 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -415,4 +415,4 @@ void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index f652e48e42..9a3db20f73 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,47 +29,35 @@
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
-struct sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst
+class sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef int8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
constexpr static unsigned int stride_rows = 2;
constexpr static unsigned int stride_cols = 2;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
- constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size;
+ sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {}
- kern_type kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl;
- sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
index 400e62d248..9adf100a0f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -456,4 +456,4 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index f07ea13a03..06ca42eed9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,47 +29,35 @@
#pragma once
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
-struct sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+class sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef int8_t weight_type;
- typedef uint8_t return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
- typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
- typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
- typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+ using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>;
+ public:
constexpr static unsigned int kernel_rows = 5;
constexpr static unsigned int kernel_cols = 5;
constexpr static unsigned int stride_rows = 1;
constexpr static unsigned int stride_cols = 1;
- constexpr static unsigned int output_rows = 2;
- constexpr static unsigned int output_cols = 2;
-
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
+ arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+ unsigned int get_accumulator_depth_vl(void) const override { return 2; }
- constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_5x5_mla::pack_parameters;
- constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_5x5_mla::get_packed_size;
+ sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
- kern_type kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+ Parent::KernelType kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
- sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
index 29582da0f6..9cf95e9588 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,7 @@
#include <cstddef>
#include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
namespace arm_conv {
namespace depthwise {
@@ -657,4 +657,4 @@ void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
} // namespace depthwise
} // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
new file mode 100644
index 0000000000..e9b29ca877
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Depthwise kernel drivers commonly require a per-thread blob of working space
+ * in which to store parameters required by the depthwise implementations. The
+ * composition of this working space varies with the driver, kernel, and data
+ * types -- but the tasks of requesting sufficient space, allocating buffer
+ * space, and performing initialisation of the working space are common.
+ *
+ * The classes in this file consist of a number of working space "Elements"
+ * (which are logical units of functionality) and a Workspace type which allows
+ * for compile-time composition of elements into a single working space type.
+ *
+ * Creating a workspace
+ * ====================
+ *
+ * A new workspace type can be created by combining Elements as an argument to
+ * the Workspace class. For instance:
+ *
+ * Workspace<
+ * depthwise_depthfirst::InputArrayElement<float>,
+ * InputBufferElement<float>,
+ * OutputArrayElement<float>
+ * >
+ *
+ * Creates a new Workspace consisting of the given elements. The workspace type
+ * contained within this class (`Workspace<...>::WorkspaceType`) is equivalent to:
+ *
+ * struct WorkspaceType
+ * {
+ * const float **inptr_array; // From InputArrayElement<float>
+ * float *input_buffer; // From InputBufferElement<float>
+ * float **outptr_array; // From OutputArrayElement<float>
+ * float *output_buffer; // From OutputArrayElement<float>
+ * };
+ *
+ * Calling `Workspace<...>::get_sizeof_workspace(...)` will return the amount
+ * of space required to store the above struct and the elements contained
+ * within it. Once this space has been allocated, the workspace can be
+ * initialised by calling `Workspace<...>::initialise` with a pointer to the
+ * buffer and the same arguments. This will place a struct of type
+ * `Workspace<...>::WorkspaceType` at the start of the buffer, and share the
+ * remaining space between the specified elements. As this is all done at
+ * compile time, later code can access elements from the `WorkspaceType` by
+ * name.
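+ *
+ * For illustration only, a minimal sketch of this calling sequence (here
+ * `strategy` is a placeholder for a pointer to a concrete strategy and
+ * `dwargs` for a `DepthwiseArgs` instance; neither is defined in this file):
+ *
+ *   using WS = Workspace<OutputArrayElement<float>>;
+ *   WorkspaceArgs<IDepthfirstStrategy> ws_args(strategy, dwargs);
+ *   std::vector<char> buffer(WS::get_sizeof_workspace(ws_args));
+ *   WS::initialise(buffer.data(), ws_args);
+ *   auto ws = reinterpret_cast<WS::WorkspaceType *>(buffer.data());
+ *   // ws->outptr_array and ws->output_buffer are now ready for use.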
+ *
+ * Writing a new element
+ * =====================
+ *
+ * Each Element must provide:
+ * - A struct called "Workspace" containing the variables contained within
+ * this portion of the workspace.
+ * - A static method called `get_element_size` which returns the amount of
+ * buffer space required by this element of the workspace (NOT including the
+ * size of the Workspace struct). For example, an element which stores a
+ *   vector of pointers will return the amount of space required to store the
+ * vector.
+ * - A static method called `initialise` which accepts a pointer to a struct
+ * which will be composed of the Element's `Workspace` struct (along with
+ * other elements), a pointer to the start of the buffer allocated for this
+ * portion of the workspace, and arguments to be used to initialise the
+ * workspace. The Element should consume as much of the buffer as it
+ * requires, initialise the Workspace, and then return the pointer to the
+ * next free byte of the buffer.
+ *
+ * See the sketch immediately below, and the concrete elements later in this
+ * file, for examples of how this should work.
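+ *
+ * As a further sketch (the name `ExampleElement` is hypothetical and is not
+ * used elsewhere), an element which reserves one `T` per input channel could
+ * be written as:
+ *
+ *   template <typename T>
+ *   struct ExampleElement
+ *   {
+ *     struct Workspace { T *example_ptr; };
+ *
+ *     template <class S, class OS>
+ *     static size_t get_element_size(const WorkspaceArgs<S, OS> &args)
+ *     {
+ *       return sizeof(T) * args.depthwise_args.input_channels;
+ *     }
+ *
+ *     template <class WorkspaceType, class S, class OS>
+ *     static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<S, OS> &args)
+ *     {
+ *       ws->example_ptr = reinterpret_cast<T *>(buffer);
+ *       return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ *     }
+ *   };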
+ */
+
+#pragma once
+
+#include "depthwise.hpp"
+#include "depthfirst_driver.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+namespace { // anonymous because we expect this to appear in several compilation units
+
+/* Arguments to use to size and initialise a workspace.
+ */
+template <class StratType, class OutputStage=Nothing>
+struct WorkspaceArgs
+{
+ const StratType *strategy;
+ const DepthwiseArgs &depthwise_args;
+ const OutputStage &output_stage;
+
+ WorkspaceArgs(const StratType *strat, const DepthwiseArgs &dwargs, const OutputStage &os = {})
+ : strategy(strat), depthwise_args(dwargs), output_stage(os)
+ {
+ }
+};
+
+
+/* Sometimes we use templated structs to fill in workspace types; the Empty
+ * element is useful when a blank element is required for some sets of
+ * parameters.
+ */
+struct EmptyElement
+{
+ struct Workspace {};
+
+ template <class StratType, class OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &) { return 0; }
+
+ template <class WorkspaceType, class StratType, class OutputStage>
+ static void *initialise(WorkspaceType *, void *buffer, const WorkspaceArgs<StratType, OutputStage> &)
+ {
+ return buffer;
+ }
+};
+
+
+/* Store fused activations for a kernel.
+ *
+ * Activations are set based on the DepthwiseArgs.
+ */
+template <typename T, class OutputStage=Nothing>
+class ActivationsElement
+{
+ public:
+ struct Workspace
+ {
+ T activation_min, activation_max;
+ };
+
+ template <typename StratType>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &)
+ {
+ return 0;
+ }
+
+ template <class WorkspaceType, class StratType>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ ws->activation_min = static_cast<T>(-std::numeric_limits<float>::infinity());
+ ws->activation_max = static_cast<T>(std::numeric_limits<float>::infinity());
+
+ switch (args.depthwise_args.activation.type)
+ {
+ case arm_gemm::Activation::Type::BoundedReLU:
+ ws->activation_max = static_cast<T>(args.depthwise_args.activation.param1);
+ // Fall through
+ case arm_gemm::Activation::Type::ReLU:
+ ws->activation_min = static_cast<T>(0);
+ break;
+ default:
+ break;
+ }
+
+ return buffer;
+ }
+};
+
+/* Activation clamps are contained within `arm_gemm::Requantize32`, so if the
+ * output stage is one of these we substitute in an empty workspace element.
+ */
+template <typename T>
+class ActivationsElement<T, arm_gemm::Requantize32> : public EmptyElement
+{
+};
+
+
+/* Get the value with which to fill an input buffer. This defaults to `0`
+ * (which we return as a `char` since it gets used by `memset`).
+ */
+template <typename OutputStage>
+char get_input_buffer_fill_value(const OutputStage &)
+{
+ return 0;
+}
+
+/* In the case of kernels operating on quantized data, we need to fill the
+ * input buffer with the zero offset of the input tensor.
+ */
+template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp) __attribute__ ((unused));
+template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp)
+{
+ return qp.a_offset;
+}
+
+
+/* Container for a vector of padding values which can be safely consumed by the
+ * depthwise kernel. The padding values are initialised to either `0` or the
+ * zero offset of the input tensor (if quantized).
+ */
+template <typename T>
+class InputBufferElement
+{
+ public:
+ struct Workspace
+ {
+ T *input_buffer;
+ };
+
+ template <typename StratType, typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ return sizeof(T) * args.depthwise_args.input_channels;
+ }
+
+ template <class WorkspaceType, typename StratType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+ {
+ ws->input_buffer = reinterpret_cast<T*>(buffer);
+ memset(ws->input_buffer, get_input_buffer_fill_value(args.output_stage), get_element_size(args));
+ return reinterpret_cast<char *>(buffer) + get_element_size(args);
+ }
+};
+
+
+/* Container for an array of output pointers, and a buffer which can be used as
+ * a destination for unnecessary writes.
+ */
+template <typename T>
+class OutputArrayElement
+{
+ public:
+ struct Workspace
+ {
+ T **outptr_array;
+ T *output_buffer;
+ };
+
+ template <typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof_outptr_array(args) + sizeof_output_buffer(args);
+ }
+
+ template <class WorkspaceType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ char *buffer_bytes = reinterpret_cast<char *>(buffer);
+
+ ws->outptr_array = reinterpret_cast<T **>(buffer_bytes);
+ buffer_bytes += sizeof_outptr_array(args);
+
+ ws->output_buffer = reinterpret_cast<T *>(buffer_bytes);
+ buffer_bytes += sizeof_output_buffer(args);
+
+ return buffer_bytes;
+ }
+
+ protected:
+ template <typename OutputStage>
+ static size_t sizeof_outptr_array(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+    return sizeof(T *) * args.strategy->get_output_rows() * args.strategy->get_output_cols();
+ }
+
+ template <typename OutputStage>
+ static size_t sizeof_output_buffer(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
+ {
+ return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ }
+};
+
+
+/* Container for requantization parameters.
+ *
+ * This removes the distinction between per-layer and per-channel
+ * requantization parameters by providing a vector of requantization parameters
+ * regardless of whether per-layer or per-channel is selected.
+ */
+class RequantizationParametersElement
+{
+ public:
+ struct Workspace
+ {
+ const int32_t *bias, *requant_muls, *requant_shifts;
+ };
+
+ template <typename StratType>
+ static size_t get_element_size(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ return sizeof_bias(args) + sizeof_requant_muls(args) + sizeof_requant_shifts(args);
+ }
+
+ template <typename WorkspaceType, typename StratType>
+ static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ const auto n_output_channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+ char *buffer_bytes = reinterpret_cast<char *>(buffer);
+
+ ws->bias = args.output_stage.bias;
+ ws->requant_muls = args.output_stage.per_channel_muls;
+ ws->requant_shifts = args.output_stage.per_channel_right_shifts;
+
+ if (ws->bias == nullptr)
+ {
+ ws->bias = reinterpret_cast<const int32_t *>(buffer_bytes);
+ memset(buffer_bytes, 0, sizeof_bias(args));
+ buffer_bytes += sizeof_bias(args);
+ }
+
+ if (ws->requant_muls == nullptr)
+ {
+ ws->requant_muls = reinterpret_cast<const int32_t *>(buffer_bytes);
+ auto muls = reinterpret_cast<int32_t *>(buffer_bytes);
+ buffer_bytes += sizeof_requant_muls(args);
+
+ for (auto n = 0u; n < n_output_channels; n++)
+ {
+ muls[n] = args.output_stage.per_layer_mul;
+ }
+ }
+
+ if (ws->requant_shifts == nullptr)
+ {
+      ws->requant_shifts = reinterpret_cast<const int32_t *>(buffer_bytes);
+ auto shifts = reinterpret_cast<int32_t *>(buffer_bytes);
+ buffer_bytes += sizeof_requant_shifts(args);
+
+ for (auto n = 0u; n < n_output_channels; n++)
+ {
+ shifts[n] = args.output_stage.per_layer_right_shift;
+ }
+ }
+
+ return buffer_bytes;
+ }
+
+ protected:
+ template <typename StratType>
+ static size_t sizeof_bias(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ return args.output_stage.bias != nullptr ?
+ 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
+ }
+
+ template <typename StratType>
+ static size_t sizeof_requant_muls(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ return args.output_stage.per_channel_muls != nullptr ?
+ 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
+ }
+
+ template <typename StratType>
+ static size_t sizeof_requant_shifts(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+ {
+ return args.output_stage.per_channel_right_shifts != nullptr ?
+ 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
+ }
+};
+
+
+template <typename ...Elements>
+class Workspace;
+
+template <typename Element, typename ...Elements>
+class Workspace<Element, Elements...>
+{
+ public:
+ struct WorkspaceType : Element::Workspace, Workspace<Elements...>::WorkspaceType
+ {
+ };
+
+ template <class S, class T>
+ static void initialise(void *buffer, const WorkspaceArgs<S, T> &args)
+ {
+ // Allocate sufficient space for the struct, then initialise each of the
+ // elements in turn.
+ auto ws = reinterpret_cast<WorkspaceType *>(buffer);
+ initialise_elements(ws, ws + 1, args);
+ }
+
+ template <class S, class T=Nothing>
+ static size_t get_sizeof_workspace(const WorkspaceArgs<S, T> &args)
+ {
+ return sizeof(WorkspaceType) + get_element_sizes(args);
+ }
+
+ template <class S, class T>
+ static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &args)
+ {
+ return Element::get_element_size(args) + Workspace<Elements...>::get_element_sizes(args);
+ }
+
+ template <class WorkspaceType, class S, class T>
+ static void initialise_elements(WorkspaceType *ws, void *buffer, const WorkspaceArgs<S, T> &args)
+ {
+ buffer = Element::initialise(ws, buffer, args); // Get the next buffer
+ Workspace<Elements...>::initialise_elements(ws, buffer, args);
+ }
+};
+
+template <>
+class Workspace<>
+{
+ public:
+ struct WorkspaceType
+ {
+ };
+
+ template <class S, class T>
+ static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &)
+ {
+ return 0;
+ }
+
+ template <class WorkspaceType, class S, class T>
+ static void initialise_elements(WorkspaceType *, void *, const WorkspaceArgs<S, T> &)
+ {
+ }
+};
+
+} // namespace {anonymous}
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index 9262ea05a4..3998dfbc9a 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -59,6 +59,8 @@ struct DepthwiseArgs
const DepthwiseConfig *config;
+ bool fast_mode = false;
+
DepthwiseArgs(
const CPUInfo *cpu_info,
unsigned int kernel_rows, unsigned int kernel_cols,
@@ -83,15 +85,18 @@ private:
protected:
const DepthwiseArgs m_args; // Copy of arguments
+
public:
std::string name() const
{
return _name;
}
+
void set_name(const std::string &n)
{
_name = n;
}
+
DepthwiseCommon(const DepthwiseArgs &args)
: m_args(args) {};
DepthwiseCommon(DepthwiseCommon &) = delete;
@@ -103,7 +108,7 @@ public:
void *const output,
void *const working_space,
const unsigned int thread_id,
- const unsigned int n_threads) const override
+ const unsigned int n_threads) const override final
{
const size_t ld_input_col = m_args.input_channels;
const size_t ld_input_row = ld_input_col * m_args.input_cols;
@@ -130,7 +135,7 @@ public:
size_t ld_output_batch,
void *const working_space,
const unsigned int thread_id,
- const unsigned int n_threads) const override
+ const unsigned int n_threads) const override final
{
execute(
m_args.n_batches, m_args.input_rows, m_args.input_cols,
@@ -142,7 +147,36 @@ public:
working_space, thread_id, n_threads);
}
- virtual void execute(
+ void execute(
+ unsigned int batches,
+ unsigned int input_height,
+ unsigned int input_width,
+ unsigned int channels,
+ const PaddingValues &padding,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ unsigned int output_height,
+ unsigned int output_width,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads) const override final
+ {
+ this->execute_internal(
+ batches, input_height, input_width, channels, padding, input,
+ ld_input_col, ld_input_row, ld_input_batch, parameters, output_height,
+ output_width, output, ld_output_col, ld_output_row, ld_output_batch,
+ working_space, thread_id, n_threads);
+ }
+
+protected:
+ virtual void execute_internal(
unsigned int batches,
unsigned int input_height,
unsigned int input_width,
@@ -161,7 +195,7 @@ public:
size_t ld_output_batch,
void *working_space,
unsigned int thread_id,
- unsigned int n_threads) const override = 0;
+ unsigned int n_threads) const = 0;
};
template <typename TInput, typename TWeight = TInput, typename TOutput = TInput>