From 8a164884dddf769643cf3b9f7f94e43cb4f3c20b Mon Sep 17 00:00:00 2001
From: ramelg01
Date: Thu, 7 Apr 2022 02:42:52 +0100
Subject: =?UTF-8?q?Update=20Neon=E2=84=A2=20depthwise=20kernel?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Reduce duplication and simplify overall structure.
- Improve multi-threaded performance by sharing more data in lower-level
  caches.

Partially Resolves: COMPMID-5054

Signed-off-by: Ramy Elgammal
Change-Id: Iac747f39b21c540122fa75218762631c4d787911
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7449
Tested-by: Arm Jenkins
Reviewed-by: Andrew Mundy
Reviewed-by: Sheri Zhang
Comments-Addressed: Arm Jenkins
---
 Android.bp | 6 +
 SConscript | 1 +
 filelist.json | 38 +-
 .../arm_conv/depthwise/depthfirst_driver.hpp | 281 ++
 .../arm_conv/depthwise/depthwise_depthfirst.hpp | 736 +++--
 .../depthwise/depthwise_depthfirst_generic.hpp | 520 ++-
 .../depthwise/depthwise_depthfirst_multiplier.hpp | 954 +++---
 .../kernels/arm_conv/depthwise/depthwise_fp16.cpp | 47 +-
 .../kernels/arm_conv/depthwise/depthwise_fp32.cpp | 81 +-
 .../depthwise/depthwise_implementation.hpp | 13 +-
 .../depthwise_implementation_constraints.hpp | 23 +-
 .../arm_conv/depthwise/depthwise_planar.hpp | 409 +++
 .../kernels/arm_conv/depthwise/depthwise_s8q.cpp | 73 +-
 .../depthwise/depthwise_strategies_common.cpp | 60 +
 .../depthwise/depthwise_strategies_common.hpp | 95 +
 .../kernels/arm_conv/depthwise/depthwise_u8q.cpp | 115 +-
 .../arm_conv/depthwise/depthwise_u8s8u8q.cpp | 45 +-
 .../arm_conv/depthwise/interleaves/generic.cpp | 150 +
 .../arm_conv/depthwise/interleaves/generic.hpp | 80 +
 .../interleaves/generic_quantized_dot_product.cpp | 161 +
 .../interleaves/generic_quantized_dot_product.hpp | 53 +
 .../arm_conv/depthwise/interleaves/list.hpp | 86 +-
 ...4_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 72 +-
 ...4_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 72 +-
 .../generic_indirect.cpp | 5 +-
 ...4_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 72 +-
 ...4_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 72 +-
 ...4_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 72 +-
 ...64_fp16_nhwc_generic_output9_mla_depthfirst.hpp | 24 +-
 .../generic.cpp | 6 +-
 ...ic_with_multiplier_output2x8_mla_depthfirst.hpp | 32 +-
 .../generic.cpp | 6 +-
 ...4_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 76 +-
 ...4_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 76 +-
 ...4_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 76 +-
 ...4_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 76 +-
 ...4_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 72 +-
 ...64_fp32_nhwc_generic_output9_mla_depthfirst.hpp | 24 +-
 ...s2_with_multiplier_output3x3_mla_depthfirst.hpp | 33 +-
 ...s1_with_multiplier_output2x4_mla_depthfirst.hpp | 33 +-
 ...ic_with_multiplier_output2x8_mla_depthfirst.hpp | 32 +-
 ...64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 51 +-
 .../generic.cpp | 14 +-
 ...64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 31 +-
 .../generic.cpp | 1386 ++++----
 ...64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 31 +-
 .../generic.cpp | 1740 +++++-----
 ...64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 31 +-
 .../generic.cpp | 3496 ++++++++++----------
 ...a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp | 23 +-
 ...s2_with_multiplier_output2x4_dot_depthfirst.hpp | 33 +-
 ...s1_with_multiplier_output4x2_dot_depthfirst.hpp | 32 +-
 ...ic_with_multiplier_output2x8_mla_depthfirst.hpp | 32 +-
 ...4_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 51 +-
 .../generic.cpp | 14 +-
 ...64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 51 +-
 .../generic.cpp | 14 +-
 ...64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 31 +-
 .../generic.cpp | 1386 ++++----
 ...64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 31 +-
 .../generic.cpp | 1740 +++++-----
 ...64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 31 +-
 .../generic.cpp | 3496 ++++++++++----------
 ...a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp | 23 +-
 ...s2_with_multiplier_output2x4_dot_depthfirst.hpp | 32 +-
 ...s1_with_multiplier_output4x2_dot_depthfirst.hpp | 32 +-
 ...ic_with_multiplier_output2x8_mla_depthfirst.hpp | 32 +-
 ...4_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 63 +
 .../generic.cpp | 1164 +++++++
 ...4_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 63 +
 .../generic.cpp | 1395 ++++++++
 ...4_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 63 +
 .../generic.cpp | 2185 ++++++++++++
 ...8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 31 +-
 .../generic.cpp | 1412 ++++----
 ...8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 31 +-
 .../generic.cpp | 2000 ++++-----
 ...8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 31 +-
 .../generic.cpp | 3266 +++++++++---------
 ...u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp | 23 +-
 ...ic_with_multiplier_output2x8_mla_depthfirst.hpp | 32 +-
 ...e_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 76 +-
 ...e_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 76 +-
 ...e_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 76 +-
 ...e_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 76 +-
 ...e_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 76 +-
 ...e_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 76 +-
 ...e_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp | 76 +-
 ...e_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp | 76 +-
 ...e_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 76 +-
 ...e_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 76 +-
 ...ve_fp32_nhwc_generic_output9_mla_depthfirst.hpp | 20 +-
 ...s2_with_multiplier_output3x3_mla_depthfirst.hpp | 29 +-
 ...s1_with_multiplier_output2x4_mla_depthfirst.hpp | 29 +-
 ...ic_with_multiplier_output2x8_mla_depthfirst.hpp | 32 +-
 ...ve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 51 +-
 .../generic.cpp | 14 +-
 ...ve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 34 +-
 .../generic.cpp | 6 +-
 ...ve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 34 +-
 .../generic.cpp | 6 +-
 ...ve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 34 +-
 .../generic.cpp | 6 +-
 ...s2_with_multiplier_output2x4_dot_depthfirst.hpp | 29 +-
 ...s1_with_multiplier_output4x2_dot_depthfirst.hpp | 29 +-
 ...e_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 51 +-
 .../generic.cpp | 14 +-
 ...ve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp | 51 +-
 .../generic.cpp | 14 +-
 ...ve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 30 +-
 .../generic.cpp | 6 +-
 ...ve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 34 +-
 .../generic.cpp | 6 +-
 ...ve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 34 +-
 .../generic.cpp | 6 +-
 ...s2_with_multiplier_output2x4_dot_depthfirst.hpp | 29 +-
 ...s1_with_multiplier_output4x2_dot_depthfirst.hpp | 29 +-
 ...8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp | 34 +-
 .../generic.cpp | 6 +-
 ...8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp | 34 +-
 .../generic.cpp | 6 +-
 ...8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp | 34 +-
 .../generic.cpp | 6 +-
 .../kernels/arm_conv/depthwise/working_space.hpp | 431 +++
 src/core/NEON/kernels/assembly/depthwise.hpp | 42 +-
 125 files changed, 18989 insertions(+), 13602 deletions(-)
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
 create mode 100644 src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp

diff --git a/Android.bp b/Android.bp
index 17c56f3f88..c072c0e371 100644
--- a/Android.bp
+++ b/Android.bp
@@ -301,9 +301,12 @@ cc_library_static {
         "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
         "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
         "src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
+        "src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp",
         "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp",
         "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
         "src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp",
+        "src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp",
+        "src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp",
         "src/core/NEON/kernels/arm_conv/pooling/kernels/cpp_nhwc_1x1_stride_any_depthfirst/generic.cpp",
         "src/core/NEON/kernels/arm_conv/pooling/pooling_fp16.cpp",
         "src/core/NEON/kernels/arm_conv/pooling/pooling_fp32.cpp",
@@ -962,6 +965,9 @@ cc_library_static {
         "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst/generic.cpp",
         "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
         "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+        "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+        "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+        "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
         "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
         "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
         "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
diff --git a/SConscript b/SConscript
index f49263122d..dac1c4ecca 100644
--- a/SConscript
+++ b/SConscript
@@ -555,6 +555,7 @@ if env['neon']:
                          "src/core/NEON/kernels/convolution/winograd/",
                          "src/core/NEON/kernels/arm_conv/depthwise/",
                          "src/core/NEON/kernels/arm_conv/pooling/",
+                         "src/core/NEON/kernels/arm_conv/",
                          "src/core/NEON/kernels/assembly/",
                          "arm_compute/core/NEON/kernels/assembly/",
                          "src/cpu/kernels/assembly/"])
diff --git a/filelist.json b/filelist.json
index 84884e132a..93dfdfff6e 100644
--- a/filelist.json
+++ b/filelist.json
@@ -665,7 +665,7 @@
     "Reduction": {
       "deps": [ "Reshape" ],
       "files": {
-        "common": [ 
+        "common": [
           "src/core/CL/kernels/CLReductionOperationKernel.cpp",
           "src/runtime/CL/functions/CLReductionOperation.cpp"
         ]
@@ -1154,7 +1154,7 @@
           "src/core/NEON/kernels/convolution/common/qsymm8.cpp",
           "src/core/NEON/kernels/convolution/common/utils.cpp",
           "src/core/NEON/kernels/arm_conv/addressing.cpp",
-          "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
+          "src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp",
           "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp",
           "src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp",
           "src/core/NEON/kernels/arm_conv/depthwise/interleaves/8b_mla.cpp",
@@ -1171,6 +1171,7 @@
           "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst/generic.cpp",
           "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
           "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp",
+          "src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
           "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_direct.cpp",
           "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst/generic_indirect.cpp",
           "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_direct.cpp",
@@ -1208,13 +1209,18 @@
           "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
           "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
           "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst/generic.cpp",
+          "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp",
+          "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp",
+          "src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp",
           "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_s8q_3x3_dot.cpp",
           "src/core/NEON/kernels/arm_conv/depthwise/interleaves/a64_u8q_3x3_dot.cpp",
+          "src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp",
+          "src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp",
           "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp"
-        ],  
+        ],
         "fp16":["src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp"],
-        "fp32":["src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp"], 
-        "qasymm8":["src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp"], 
+        "fp32":["src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp"],
+        "qasymm8":["src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp"],
         "qasymm8_signed":["src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp"]
       },
       "sve": {
@@ -1315,7 +1321,7 @@
         "fp32": ["src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp"],
         "fp16": ["src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp"]
-      },  
+      },
       "sve2":{
         "qasymm8": ["src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp"],
         "qasymm8_signed": ["src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp"]
@@ -1519,7 +1525,7 @@
           "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp",
           "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp",
-          "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp",  
+          "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp",
           "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp",
           "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp"
         ],
@@ -1603,7 +1609,7 @@
         "common": [
           "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp",
           "src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp"
-        ],  
+        ],
         "neon":{
           "common":["src/cpu/kernels/instancenorm/generic/neon/impl.cpp"],
           "fp16":["src/cpu/kernels/instancenorm/generic/neon/fp16.cpp"],
@@ -1663,7 +1669,7 @@
       "files": {
         "common": [
           "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp",
-          "src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp",  
+          "src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp",
           "src/cpu/operators/CpuMaxUnpooling.cpp"
         ],
         "neon":{
@@ -1764,12 +1770,12 @@
           "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_2x2_s1_output2x2_depthfirst/generic.cpp",
           "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8_nhwc_max_generic_depthfirst/generic.cpp",
           "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_avg_generic_depthfirst/generic.cpp",
-          "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp"  
+          "src/core/NEON/kernels/arm_conv/pooling/kernels/a64_u8q_nhwc_max_generic_depthfirst/generic.cpp"
         ],
         "nchw": [ "src/cpu/kernels/pool2d/neon/nchw/all.cpp" ],
         "fp16": [ "src/cpu/kernels/pool2d/neon/fp16.cpp" ],
-        "fp32": [ "src/cpu/kernels/pool2d/neon/fp32.cpp" ],  
-        "qasymm8":[ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ],  
+        "fp32": [ "src/cpu/kernels/pool2d/neon/fp32.cpp" ],
+        "qasymm8":[ "src/cpu/kernels/pool2d/neon/qasymm8.cpp" ],
         "qasymm8_signed":["src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp"]
       },
       "sve": {
@@ -1969,8 +1975,8 @@
         "neon":{
           "common":["src/cpu/kernels/softmax/generic/neon/impl.cpp"],
           "fp32": ["src/cpu/kernels/softmax/generic/neon/fp32.cpp"],
-          "fp16": ["src/cpu/kernels/softmax/generic/neon/fp16.cpp"],  
-          "qasymm8":[ "src/cpu/kernels/softmax/generic/neon/qasymm8.cpp"],  
+          "fp16": ["src/cpu/kernels/softmax/generic/neon/fp16.cpp"],
+          "qasymm8":[ "src/cpu/kernels/softmax/generic/neon/qasymm8.cpp"],
           "qasymm8_signed":["src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp"]
         },
         "sve": {
@@ -1982,7 +1988,7 @@
         },
         "sve2":{
           "common" :["src/cpu/kernels/softmax/generic/sve2/impl.cpp"],
-          "qasymm8":[ "src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp"],  
+          "qasymm8":[ "src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp"],
           "qasymm8_signed":["src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp"]
         }
       }
@@ -2074,4 +2080,4 @@
       "src/gpu/cl/kernels/experimental/dynamic_fusion/ClCompositeKernel.cpp"
     ]
   }
-}
\ No newline at end of file
+}
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
new file mode 100644
index 0000000000..e02998f5a0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename T> struct DefaultTAccum { using Type = T; };
+template <> struct DefaultTAccum<int8_t> { using Type = int32_t; };
+template <> struct DefaultTAccum<uint8_t> { using Type = int32_t; };
+
+template <typename T> struct DefaultOutputStage { using Type = Nothing; };
+template <> struct DefaultOutputStage<int8_t> { using Type = arm_gemm::Requantize32; };
+template <> struct DefaultOutputStage<uint8_t> { using Type = arm_gemm::Requantize32; };
+
+class IDepthfirstStrategy
+{
+  public:
+  virtual ~IDepthfirstStrategy() = default;
+
+  virtual unsigned int get_input_rows() const = 0;
+  virtual unsigned int get_input_cols() const = 0;
+
+  virtual unsigned int get_output_rows() const = 0;
+  virtual unsigned int get_output_cols() const = 0;
+};
+
+
+template <typename T>
+struct TensorSpec
+{
+  T base;
+  size_t ld_row, ld_col;
+
+  TensorSpec(T ptr, size_t ld_row, size_t ld_col)
+  : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
+};
+
+
+template <typename TInput, typename TWeight, typename TOutput>
+class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
+{
+  protected:
+  using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
+
+  // The strategy which we're applying to solve the depthwise convolution.
+  std::unique_ptr<const IDepthfirstStrategy> m_strat;
+
+  /* Compute the amount of working space required for a single thread. */
+  virtual size_t get_working_size_per_thread(unsigned int n_input_channels) const = 0;
+
+  /* Initialise the working space for a thread. */
+  virtual void initialise_working_space(void *, unsigned int n_input_channels) const = 0;
+
+  /* Compute a portion of the output tensor with padding. */
+  virtual void compute_tile_padded(
+    unsigned int output_i, unsigned int output_j,
+    unsigned int output_channel_start, unsigned int output_channel_end,
+    const TensorSpec<const TInput *> &input,
+    const TensorSpec<TOutput *> &output,
+    const void *parameters,
+    void *working_space
+  ) const = 0;
+
+  /* Compute a portion of the work with only top/bottom padding.
+   *
+   * The default implementation of this repeatedly calls into the padded tile
+   * variant.
+   */
+  virtual void compute_row_padded_tile_row(
+    const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
+    const unsigned int output_channel_start, const unsigned int output_channel_end,
+    const TensorSpec<const TInput *> &input,
+    const TensorSpec<TOutput *> &output,
+    const void *parameters,
+    void *working_space
+  ) const
+  {
+    for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
+    {
+      this->compute_tile_padded(
+        output_i, output_j, output_channel_start, output_channel_end,
+        input, output, parameters, working_space
+      );
+    }
+  }
+
+  /* Compute a portion of the output tensor with no padding.
+   *
+   * The default implementation of this repeatedly calls into the padded
+   * variant.
+   */
+  virtual void compute_tiles_unpadded(
+    unsigned int start_output_i, unsigned int start_output_j,
+    unsigned int n_tile_rows, unsigned int n_tile_cols,
+    unsigned int output_channel_start, unsigned int output_channel_end,
+    const TensorSpec<const TInput *> &input,
+    const TensorSpec<TOutput *> &output,
+    const void *parameters,
+    void *working_space
+  ) const
+  {
+    for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
+    {
+      unsigned int row_start_output_j = start_output_j;
+      for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
+      {
+        this->compute_tile_padded(
+          start_output_i, row_start_output_j,
+          output_channel_start, output_channel_end,
+          input, output, parameters, working_space
+        );
+        row_start_output_j += m_strat->get_output_cols();
+      }
+      start_output_i += m_strat->get_output_rows();
+    }
+  }
+
+  void execute_internal(
+    unsigned int n_batches,
+    unsigned int input_height,
+    unsigned int input_width,
+    unsigned int n_input_channels,
+    const PaddingValues &padding,
+    const void *input,
+    size_t ld_input_col,
+    size_t ld_input_row,
+    size_t ld_input_batch,
+    const void *parameters,
+    unsigned int output_height,
+    unsigned int output_width,
+    void *output,
+    size_t ld_output_col,
+    size_t ld_output_row,
+    size_t ld_output_batch,
+    void *working_space,
+    unsigned int thread_id,
+    unsigned int n_threads
+  ) const override
+  {
+    // Get and initialise the working space for this thread.
+    void *thread_working_space =
+      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(n_input_channels);
+    this->initialise_working_space(thread_working_space, n_input_channels);
+
+    // Construct convenient representations of the input/output tensors.
+    TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
+    TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);
+
+    const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier;
+
+    for (unsigned int batch = 0; batch < n_batches; batch++)
+    {
+      // Iterate over rows of the output tensor; we stripe over the tiles.
+      for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
+           start_output_i < output_height;
+           start_output_i += n_threads * m_strat->get_output_rows())
+      {
+        // Determine what (if any padding) is required on the top/bottom of
+        // this row of the convolution.
+        const auto end_output_i = start_output_i + m_strat->get_output_rows();
+        const bool pad_output_bottom = output_height < end_output_i;
+
+        const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top;
+        const bool pad_input_top = start_input_i < 0;
+        const int end_input_i = start_input_i + m_strat->get_input_rows();
+        const bool pad_input_bottom = static_cast<int>(input_height) < end_input_i;
+        const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom;
+
+        // Iterate over the columns of the output tensor; we attempt to grab as
+        // much as possible of the unpadded regions, so the loop structure is a
+        // bit odd.
+        unsigned int start_output_j = 0;
+        while (start_output_j < output_width)
+        {
+          const int start_in_j = start_output_j * this->m_args.stride_cols - padding.left;
+          const bool pad_input_left = start_in_j < 0;
+
+          // Determine if we can process a number of unpadded tiles in one go.
+          int n_unpadded_tiles = 0;
+          if (!pad_input_left)
+          {
+            // Determine the maximum number of tiles we could handle.
+            n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols();
+
+            // Handle padding on the right hand edge
+            const int tile_stride = m_strat->get_output_cols() * this->m_args.stride_cols;
+            int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
+            int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;
+
+            while (n_unpadded_tiles > 0 &&
+                   (static_cast<int>(output_width) < end_output_j ||
+                    static_cast<int>(input_width) < end_input_j))
+            {
+              n_unpadded_tiles--;
+              end_output_j -= m_strat->get_output_cols();
+              end_input_j -= tile_stride;
+            }
+          }
+
+          // Process unpadded tiles, if possible, otherwise process a padded tile.
+          if (n_unpadded_tiles)
+          {
+            if (!pad_row)
+            {
+              // Completely unpadded execution
+              this->compute_tiles_unpadded(
+                start_output_i, start_output_j,
+                1, n_unpadded_tiles,  // Compute a row of unpadded tiles
+                0, n_output_channels,  // Compute all channels
+                input_tensor, output_tensor, parameters, thread_working_space
+              );
+            }
+            else
+            {
+              // Top/bottom padding only
+              this->compute_row_padded_tile_row(
+                start_output_i, start_output_j, n_unpadded_tiles,
+                0, n_output_channels,  // Compute all channels
+                input_tensor, output_tensor, parameters, thread_working_space
+              );
+            }
+            start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
+          }
+          else
+          {
+            this->compute_tile_padded(
+              start_output_i, start_output_j,
+              0, n_output_channels,  // Compute all channels
+              input_tensor, output_tensor, parameters, thread_working_space
+            );
+            start_output_j += m_strat->get_output_cols();
+          }
+        }
+      }
+
+      // Progress the pointers for the next batch.
+      input_tensor.base += ld_input_batch;
+      output_tensor.base += ld_output_batch;
+    }
+  }
+
+  public:
+  DepthfirstDriver(IDepthfirstStrategy *strategy, const DepthwiseArgs &args)
+  : Parent(args), m_strat(strategy)
+  {
+  }
+
+  size_t get_working_size(unsigned int n_threads, unsigned int n_input_channels) const override final
+  {
+    return n_threads * this->get_working_size_per_thread(n_input_channels);
+  }
+};
+
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
index 57fa11151b..6905076357 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,9 @@ #pragma once -#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "src/core/NEON/kernels/arm_conv/addressing.hpp" +#include "depthwise_strategies_common.hpp" +#include "working_space.hpp" #ifdef CYCLE_PROFILING #include "profiler.hpp" @@ -35,349 +37,547 @@ namespace arm_conv { namespace depthwise { -struct IDepthwiseDepthfirstStrategy +template +class DepthwiseDepthfirstStrategyCommon + : public DepthfirstStrategy { - virtual arm_gemm::VLType get_vl_type() const = 0; + protected: + unsigned int m_output_rows, m_output_cols; + unsigned int m_kernel_rows, m_kernel_cols; + unsigned int m_stride_rows, m_stride_cols; - virtual unsigned int get_input_rows() const = 0; - virtual unsigned int get_input_cols() const = 0; + public: + DepthwiseDepthfirstStrategyCommon( + unsigned int output_rows, unsigned int output_cols, + unsigned int kernel_rows, unsigned int kernel_cols, + unsigned int stride_rows=1, unsigned int stride_cols=1 + ) : m_output_rows(output_rows), m_output_cols(output_cols), + m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols), + m_stride_rows(stride_rows), m_stride_cols(stride_cols) + { + } + + DepthwiseDepthfirstStrategyCommon(unsigned int output_size, unsigned int kernel_size, unsigned int stride=1) + : DepthwiseDepthfirstStrategyCommon(output_size, output_size, kernel_size, kernel_size, stride, stride) + { + } + + virtual ~DepthwiseDepthfirstStrategyCommon() {} + + unsigned int get_output_rows() const override { return m_output_rows; } + unsigned int get_output_cols() const override { return m_output_cols; } - virtual unsigned int get_output_rows() const = 0; - virtual unsigned int get_output_cols() const = 0; + unsigned int get_kernel_rows() const override { return m_kernel_rows; } + unsigned int get_kernel_cols() const override { return m_kernel_cols; } - virtual unsigned int get_kernel_rows() const = 0; - virtual unsigned int get_kernel_cols() const = 0; + unsigned int get_stride_rows() const override { return m_stride_rows; } + unsigned int get_stride_cols() const override { return m_stride_cols; } +}; + +template ::Type> +class DepthwiseDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommon +{ + using Parent = DepthwiseDepthfirstStrategyCommon; - virtual unsigned int get_stride_rows() const = 0; - virtual unsigned int get_stride_cols() const = 0; + public: + using Parent::Parent; - virtual void indirect_kernel( - const void *const *const input_ptrs, - void *const *const output_ptrs, + typedef void (*IndirectKernelType)( + const TInput *const *input_ptrs, + TOutput *const *output_ptrs, const void *params, unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const = 0; + const TAccum activation_min, + const TAccum activation_max + ); + virtual IndirectKernelType get_indirect_kernel(void) const = 0; - virtual void direct_kernel( + typedef void (*DirectKernelType)( const unsigned int n_tile_rows, const unsigned int n_tile_cols, - const void *inptr, int64_t ld_input_row, int64_t ld_input_col, - void *outptr, int64_t ld_output_row, int64_t ld_output_col, + const TInput *inptr_base, int64_t ld_input_row, int64_t ld_input_col, + TOutput *outptr_base, int64_t ld_output_row, int64_t ld_output_col, const void *params, unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const = 0; - - virtual ~IDepthwiseDepthfirstStrategy() {} + const TAccum activation_min, + const TAccum activation_max + ); + virtual 
DirectKernelType get_direct_kernel(void) const = 0; }; -template -class DepthwiseDepthfirst : public DepthwiseCommon +template +class DepthwiseDepthfirstStrategy +: public DepthwiseDepthfirstStrategyCommon { - const std::unique_ptr m_strat; + using Parent = DepthwiseDepthfirstStrategyCommon; - size_t sizeof_inptr_array(void) const + protected: + interleaves::PackingArguments get_packing_args(void) const { - return sizeof(TInput *) * m_strat->get_input_rows() * m_strat->get_input_cols(); + return interleaves::PackingArguments( + this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight), + false, sizeof(int32_t), // Don't pack the bias + this->get_vl_type(), sizeof(int32_t), this->get_accumulator_depth_vl(), + [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool + { return this->get_kernel_packing_point(idx, x, y); } + ); } - size_t sizeof_input_buffer(unsigned int n_input_channels) const + public: + using Parent::Parent; + + typedef void (*KernelType)( + unsigned int, // n_channels, + const TInput *const *, // inptrs + const TWeight *, // weights + const int32_t *, // bias, + const arm_gemm::Requantize32 &, + const int32_t *, const int32_t *, // requant_muls and requant_shifts + TOutput *const * // outptrs + ); + virtual KernelType get_kernel() const = 0; + + size_t get_storage_size(const DepthwiseArgs &args) const override { - return sizeof(TInput) * n_input_channels; + return interleaves::get_storage_size_generic(get_packing_args(), args); } - size_t sizeof_outptr_array(void) const + void pack_parameters( + const DepthwiseArgs &args, void *buffer, + const void *biases, const arm_gemm::Requantize32 &, + const void *weights, size_t ld_weight_col, size_t ld_weight_row + ) const override { - return sizeof(TInput *) * m_strat->get_output_rows() * m_strat->get_output_cols(); + interleaves::pack_parameters_generic( + get_packing_args(), args, buffer, biases, weights, ld_weight_col, ld_weight_row); } +}; - size_t sizeof_output_buffer(unsigned int n_output_channels) const - { - return sizeof(TOutput) * n_output_channels; - } +template +class DepthwiseDepthfirstCommon : public DepthfirstDriver +{ + using StratType = DepthwiseDepthfirstStrategyCommon; + OutputStage m_os; + + protected: + inline OutputStage &get_output_stage(void) { return m_os; } + inline const OutputStage &get_output_stage(void) const { return m_os; } public: - DepthwiseDepthfirst( - IDepthwiseDepthfirstStrategy *const strat, - const DepthwiseArgs &args - ) : DepthwiseCommon(args), m_strat(strat) + DepthwiseDepthfirstCommon(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os) + : DepthfirstDriver(strat, args), m_os(os) { } - DepthwiseDepthfirst(DepthwiseDepthfirst &) = delete; - DepthwiseDepthfirst &operator=(DepthwiseDepthfirst &) = delete; + DepthwiseDepthfirstCommon(DepthwiseDepthfirstCommon &) = delete; + DepthwiseDepthfirstCommon &operator=(DepthwiseDepthfirstCommon &) = delete; size_t get_storage_size(void) const override { - // TODO What if we insert extra padding? Biases are a different size to the inputs, ... 
- const unsigned int vl = arm_gemm::utils::get_vector_length(m_strat->get_vl_type()); - const auto rounded_channels = arm_gemm::roundup(this->m_args.input_channels, vl); - return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight); + return reinterpret_cast(this->m_strat.get())-> + get_storage_size(this->m_args); } - void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override + void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override { - // TODO What if the kernel needs a different packing function? + reinterpret_cast(this->m_strat.get())-> + pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row); + } +}; - // Cast the pointers - uint8_t *buffer = static_cast(_buffer); - const TAccum *biases = static_cast(_biases); - const TWeight *const weights = static_cast(_weights); +namespace depthwise_depthfirst { - const unsigned int vl = arm_gemm::utils::get_vector_length(m_strat->get_vl_type()); - ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels : ld_weight_col; - ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row; +/* Workspace Element for an array of input pointers as consumed by the + * specialised depthwise kernels. + */ +template +class InputArrayElement +{ + public: + struct Workspace + { + const T **inptr_array; + }; - for (unsigned int n = 0; n < this->m_args.input_channels; n += vl) - { - const unsigned int todo = std::min(vl, this->m_args.input_channels - n); + template + static size_t get_element_size(const WorkspaceArgs &args) + { + return sizeof(T **) * args.strategy->get_input_rows() * args.strategy->get_input_cols(); + } - // Copy across the correct amount of bias (or 0) - for (unsigned int i = 0; i < todo; i++) - { - reinterpret_cast(buffer)[i] = (biases == nullptr) ? 
0 : biases[n + i]; - } - buffer += vl * sizeof(TAccum); + template + static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs &args) + { + ws->inptr_array = reinterpret_cast(buffer); + return reinterpret_cast(buffer) + get_element_size(args); + } +}; - // Copy each of the weights in turn - auto weights_row = weights + n; - for (unsigned int i = 0; i < this->m_args.kernel_rows; i++) - { - auto weights_col = weights_row; +template +struct WorkspaceFinalElement +{ + using Element = ActivationsElement; +}; - for (unsigned int j = 0; j < this->m_args.kernel_cols; j++) - { - for (unsigned int m = 0; m < todo; m++) - { - reinterpret_cast(buffer)[m] = weights_col[m]; - } - buffer += vl * sizeof(TWeight); +template <> +struct WorkspaceFinalElement +{ + using Element = RequantizationParametersElement; +}; - weights_col += ld_weight_col; - } +template +struct Invoke +{ + constexpr static bool supports_direct_kernel = true; - weights_row += ld_weight_row; - } - } + template + static inline void indirect(const Strat *strat, const Workspace *ws, const OutputStage &, const void *params, const TAccum *, unsigned int n_channels) + { + strat->get_indirect_kernel()( + ws->inptr_array, + ws->outptr_array, + params, n_channels, + ws->activation_min, ws->activation_max + ); } - size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override + template + static void direct( + const Strat *strat, const Workspace *ws, const OutputStage &, + unsigned int n_tile_rows, unsigned int n_tile_cols, + const TInput *inptr, size_t ld_in_row, size_t ld_in_col, + TOutput *outptr, size_t ld_out_row, size_t ld_out_col, + const void *params, unsigned int n_channels + ) { - const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier; - return n_threads * (sizeof_inptr_array() + sizeof_outptr_array() + - sizeof_output_buffer(n_output_channels) + - sizeof_input_buffer(n_channels)); + strat->get_direct_kernel()( + n_tile_rows, n_tile_cols, + inptr, ld_in_row, ld_in_col, + outptr, ld_out_row, ld_out_col, + params, n_channels, ws->activation_min, ws->activation_max + ); } +}; - using DepthwiseCommon::execute; - void execute( - const unsigned int batches, - const unsigned int input_height, - const unsigned int input_width, - const unsigned int input_channels, - const PaddingValues &padding, - const void *const _input, - const size_t ld_input_col, - const size_t ld_input_row, - const size_t ld_input_batch, - const void *const parameters, - const unsigned int output_height, - const unsigned int output_width, - void *const _output, - const size_t ld_output_col, - const size_t ld_output_row, - const size_t ld_output_batch, - void *const _working_space, - const unsigned int thread_id, - const unsigned int n_threads - ) const override +template +struct Invoke +{ + constexpr static bool supports_direct_kernel = false; + + template + static inline void indirect(const Strat *strat, const Workspace *ws, const arm_gemm::Requantize32 &qp, const void *params, const TAccum *, unsigned int n_channels) { -#ifdef CYCLE_PROFILING - arm_gemm::profiler prof; -#endif + strat->get_kernel()( + n_channels, ws->inptr_array, + reinterpret_cast(params), ws->bias, + qp, ws->requant_muls, ws->requant_shifts, + ws->outptr_array + ); + } + + template + static inline void direct( + const Strat *, const Workspace *, const arm_gemm::Requantize32 &, + unsigned int, unsigned int, // n_tile_rows, n_tile_cols + const TInput *, size_t, size_t, // Input pointer, row stride, column stride + 
TOutput *, size_t, size_t, // Output pointer, row stride, column stride + const void *, unsigned int // Parameters, number of channels + ) + { + // Do nothing - this should never be reached because entry to it is guarded + // by an `if` on a `constexpr static bool`. + } +}; - // Compute activation values - TAccum activation_min, activation_max; - std::tie(activation_min, activation_max) = get_default_activation_values(); +namespace +{ - switch (this->m_args.activation.type) - { - case arm_gemm::Activation::Type::BoundedReLU: - activation_max = static_cast(this->m_args.activation.param1); - // Fall through - case arm_gemm::Activation::Type::ReLU: - activation_min = static_cast(0); - break; - default: - break; - } +template +inline void stash_bias(OutputStage &, const void *) {} - // Determine what portion of the work to do. - const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads); - const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height); - const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height); +template <> +inline void stash_bias(arm_gemm::Requantize32 &qp, const void *bias) __attribute__ ((unused)); - // Cast input and output pointers into the right types - const TInput *const inptr = static_cast(_input); - TOutput *const outptr = static_cast(_output); +template <> +inline void stash_bias(arm_gemm::Requantize32 &qp, const void *bias) +{ + qp.bias = reinterpret_cast(bias); +} - // Allocate portions of the working space - uint8_t *working_space = static_cast(_working_space) + get_working_size(thread_id, input_channels); +} - const void **const inptr_array = reinterpret_cast(working_space); - working_space += sizeof_inptr_array(); +} // namespace depthwise_depthfirst - void **const outptr_array = reinterpret_cast(working_space); - working_space += sizeof_outptr_array(); +template ::Type, + typename OutputStage=typename DefaultOutputStage::Type> +class DepthwiseDepthfirst +: public DepthwiseDepthfirstCommon +{ + using StratType = DepthwiseDepthfirstStrategy; + using Parent = DepthwiseDepthfirstCommon; + using WorkspaceManager = Workspace< + OutputArrayElement, + depthwise_depthfirst::InputArrayElement, + InputBufferElement, + typename depthwise_depthfirst::WorkspaceFinalElement::Element + >; + using WorkingSpace = typename WorkspaceManager::WorkspaceType; + + // We keep a copy of the bias and output stage + const TAccum *m_bias; - TOutput *const output_buffer = reinterpret_cast(working_space); - working_space += sizeof_output_buffer(input_channels * this->m_args.channel_multiplier); + public: + DepthwiseDepthfirst(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {}) + : Parent(strat, args, os), m_bias(nullptr) + { + } - TInput *const input_buffer = reinterpret_cast(working_space); + DepthwiseDepthfirst(DepthwiseDepthfirst &) = delete; + DepthwiseDepthfirst &operator=(DepthwiseDepthfirst &) = delete; - // Initialise the input buffer - for (unsigned int c = 0; c < input_channels; c++) - { - input_buffer[c] = static_cast(0); - } + void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override + { + reinterpret_cast(this->m_strat.get())->pack_parameters( + this->m_args, buffer, biases, this->get_output_stage(), + weights, ld_weight_col, ld_weight_row + ); + m_bias = reinterpret_cast(biases); + depthwise_depthfirst::stash_bias(this->get_output_stage(), biases); + } - // For each output tile, construct the requisite 
set of pointers and call - // into the kernel. - for (unsigned int batch = 0; batch < batches; batch++) - { - // Get batch pointers - const auto inptr_batch = inptr + batch * ld_input_batch; - const auto outptr_batch = outptr + batch * ld_output_batch; + size_t get_working_size_per_thread(const unsigned int n_input_channels) const override + { + DepthwiseArgs args(this->m_args); + args.input_channels = n_input_channels; + return WorkspaceManager::get_sizeof_workspace( + WorkspaceArgs(this->m_strat.get(), args, this->get_output_stage()) + ); + } - for (int start_out_i = start_out_height; - start_out_i < end_out_height; - start_out_i += static_cast(m_strat->get_output_rows())) - { - const int end_out_i = start_out_i + m_strat->get_output_rows(); - const int start_in_i = start_out_i * m_strat->get_stride_rows() - padding.top; - const int end_in_i = start_in_i + m_strat->get_input_rows(); - - // Compute top/bottom padding - const auto pad_top = static_cast(-std::min(start_in_i, 0)); - const auto pad_bottom = static_cast(-std::min(static_cast(input_height) - end_in_i, 0)); - const unsigned int valid_output_rows = std::min( - end_out_i - start_out_i, - static_cast(output_height) - start_out_i - ); + void initialise_working_space(void *buffer, unsigned int n_input_channels) const override + { + DepthwiseArgs args(this->m_args); + args.input_channels = n_input_channels; + WorkspaceManager::initialise( + buffer, WorkspaceArgs(this->m_strat.get(), args, this->get_output_stage()) + ); + } - // Fill the input pointer array with padding values - for (auto index = 0u; index < m_strat->get_input_rows() * m_strat->get_input_cols(); index++) - { - inptr_array[index] = input_buffer; - } + protected: + void compute_tile_padded( + unsigned int output_i, unsigned int output_j, + unsigned int output_channel_start, unsigned int output_channel_end, + const TensorSpec &input, + const TensorSpec &output, + const void *parameters, + void *working_space_raw + ) const override + { + // Get the working space + auto ws = reinterpret_cast(working_space_raw); + + // Compute the input pointer array + const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier; + + const int ii = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; + const auto input_pad_top = static_cast(ii < 0 ? -ii : 0); + const auto input_i = static_cast(ii < 0 ? 0 : ii); + + const int ij = static_cast(output_j * this->m_args.stride_cols) - this->m_args.padding.left; + const auto input_pad_left = static_cast(ij < 0 ? -ij : 0); + const auto input_j = static_cast(ij < 0 ? 
0 : ij); + + fill_pointer_array( + ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(), + input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start, + input.ld_row, input.ld_col, + ws->input_buffer, + input_pad_top, this->m_args.input_rows - input_i, + input_pad_left, this->m_args.input_cols - input_j + ); + + // Compute the output pointer array + fill_pointer_array( + ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), + output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start, + output.ld_row, output.ld_col, + ws->output_buffer, + 0, this->m_args.output_rows - output_i, // Top padding, # valid rows + 0, this->m_args.output_cols - output_j // Left padding, # valid columns + ); + + // Execute the kernel + depthwise_depthfirst::Invoke::indirect( + reinterpret_cast(this->m_strat.get()), + ws, this->get_output_stage(), parameters, m_bias, output_channel_end - output_channel_start + ); + } - for (int start_out_j = 0; start_out_j < static_cast(output_width);) - { - const int start_in_j = start_out_j * m_strat->get_stride_cols() - this->m_args.padding.left; - int pad_left = std::min(0, start_in_j); + void compute_row_padded_tile_row( + const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols, + const unsigned int output_channel_start, const unsigned int output_channel_end, + const TensorSpec &input, + const TensorSpec &output, + const void *parameters, + void *working_space + ) const override + { + using Invoker = depthwise_depthfirst::Invoke; + auto ws = reinterpret_cast(working_space); + const auto strat = reinterpret_cast(this->m_strat.get()); + const auto os = this->get_output_stage(); + + // Compute top and bottom padding; hence fill in the initial pointer arrays. + const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier; + const int ii = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; + const auto input_pad_top = static_cast(ii < 0 ? -ii : 0); + + const auto input_i = static_cast(ii < 0 ? 
0 : ii); + const auto input_j = output_j * this->m_args.stride_cols - this->m_args.padding.left; + + const auto valid_input_rows = std::min(strat->get_input_rows(), this->m_args.input_rows - input_i); + const auto valid_output_rows = std::min(strat->get_output_rows(), this->m_args.output_rows - output_i); + + const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.stride_cols; + const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols(); + + fill_pointer_array( + ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(), + input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start, + input.ld_row, input.ld_col, + ws->input_buffer, + input_pad_top, this->m_args.input_rows - input_i, + 0, this->m_args.input_cols - input_j // No left padding + ); + + fill_pointer_array( + ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), + output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start, + output.ld_row, output.ld_col, + ws->output_buffer, + 0, this->m_args.output_rows - output_i, // Top padding, # valid rows + 0, this->m_args.output_cols - output_j // Left padding, # valid columns + ); + + for (; n_tile_cols; n_tile_cols--) + { + // Execute the kernel + Invoker::indirect( + strat, ws, os, parameters, m_bias, output_channel_end - output_channel_start + ); - // Compute how many output tiles we can compute with the direct kernel. - int n_direct_tiles = 0; - if (!pad_top && !pad_bottom && !pad_left) + // Update all unpadded pointers + { + auto ptr = ws->inptr_array + strat->get_input_cols() * input_pad_top; + for (auto n = input_pad_top; n < valid_input_rows; n++) + { + for (auto m = 0u; m < strat->get_input_cols(); m++) { - // Determine the maximum number of tiles we could handle. - n_direct_tiles = (output_width - start_out_j) / m_strat->get_output_cols(); - - // Continue to reduce this number as required to avoid reading - // padding on the right edge. - int end_in_j = start_in_j + n_direct_tiles * m_strat->get_input_cols(); - int pad_right = std::max(0, end_in_j - static_cast(input_width)); - - while (pad_right && n_direct_tiles) - { - n_direct_tiles--; - end_in_j -= m_strat->get_input_cols(); - pad_right = std::max(0, end_in_j - static_cast(input_width)); - } + *(ptr++) += input_point_stride; } + } + } + { + auto ptr = ws->outptr_array; + for (auto n = 0u; n < valid_output_rows * strat->get_output_cols(); n++) + { + *(ptr++) += output_point_stride; + } + } + } + } - // Use the unpadded kernel if we can, otherwise use the padded one. 
- if (n_direct_tiles) - { - auto inptr = inptr_batch + start_in_i*ld_input_row + start_in_j*ld_input_col; - auto outptr = outptr_batch + start_out_i*ld_output_row + start_out_j*ld_output_col; - start_out_j += n_direct_tiles*m_strat->get_output_cols(); + void compute_tiles_unpadded( + unsigned int output_i, const unsigned int output_j, + unsigned int n_tile_rows, unsigned int n_tile_cols, + unsigned int output_channel_start, unsigned int output_channel_end, + const TensorSpec &input, + const TensorSpec &output, + const void *parameters, + void *working_space_raw + ) const override + { + using Invoker = depthwise_depthfirst::Invoke; + auto ws = reinterpret_cast(working_space_raw); + const auto strat = reinterpret_cast(this->m_strat.get()); + const auto os = this->get_output_stage(); -#ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_KERNEL, 0); -#endif - m_strat->direct_kernel(1, n_direct_tiles, - inptr, ld_input_row, ld_input_col, - outptr, ld_output_row, ld_output_col, - parameters, this->m_args.input_channels, - &activation_min, &activation_max); - continue; - } + if (Invoker::supports_direct_kernel) + { + // If the direct kernel is supported, then use it. + // Compute the base pointers we'll use in the tile. + auto outptr = output.base + output_channel_start + output_i * output.ld_row + output_j * output.ld_col; + const int start_input_i = output_i * this->m_args.stride_rows - this->m_args.padding.top; + const int start_input_j = output_j * this->m_args.stride_cols - this->m_args.padding.left; + auto inptr = input.base + output_channel_start + start_input_i * input.ld_row + start_input_j * input.ld_col; + + // Execute the kernel + Invoker::direct( + strat, ws, os, + n_tile_rows, n_tile_cols, + inptr, input.ld_row, input.ld_col, + outptr, output.ld_row, output.ld_col, + parameters, output_channel_end - output_channel_start + ); + } + else + { + // Otherwise, we repeatedly call the padded kernel but use our knowledge + // of the tensor structure to avoid recomputing the pointer array. + const auto input_channel_start = output_channel_start / this->m_args.channel_multiplier; + + const auto n_input_pointers = this->m_strat->get_input_rows() * this->m_strat->get_input_cols(); + const auto input_point_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.stride_cols; + const auto n_output_pointers = this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); + const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols(); + + // For each tile row, initialise the input and output pointer arrays. For + // each subsequent tile we simply update the pointers. 
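      // A minimal sketch of the pointer-stride idea applied below; the names
      // here are illustrative, not identifiers from this patch. Because the
      // tensors are dense along the column direction, moving one tile to the
      // right advances every pointer by a fixed element stride, so the arrays
      // can be shifted in place rather than refilled from scratch:
      //
      //   // stride from one tile's input patch to the next, in elements
      //   const size_t input_point_stride = ld_col * output_cols * stride_cols;
      //   for (unsigned int tile_j = 1; tile_j < n_tile_cols; tile_j++)
      //   {
      //     for (auto &ptr : inptr_array)  // shift the whole pointer array
      //       ptr += input_point_stride;
      //     kernel(inptr_array, /* ... */);  // no per-tile refill required
      //   }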
+ for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++) + { + const int input_i = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; + const int input_j = static_cast(output_j * this->m_args.stride_cols) - this->m_args.padding.left; + + fill_pointer_array( + ws->inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(), + input.base + input_i*input.ld_row + input_j*input.ld_col + input_channel_start, + input.ld_row, input.ld_col, + ws->input_buffer, + 0, this->m_args.input_rows, + 0, this->m_args.input_cols + ); - const int end_out_j = start_out_j + m_strat->get_output_cols(); - const int end_in_j = start_in_j + m_strat->get_input_cols(); + // Compute the output pointer array + fill_pointer_array( + ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), + output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start, + output.ld_row, output.ld_col, + ws->output_buffer, + 0, this->m_args.output_rows, + 0, this->m_args.output_cols + ); - const auto pad_right = static_cast(-std::min(static_cast(input_width) - end_in_j, 0)); - const unsigned int valid_output_cols = std::min( - end_out_j - start_out_j, - static_cast(output_width) - start_out_j + for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++) + { + // Invoke the indirect kernel for this tile + depthwise_depthfirst::Invoke::indirect( + strat, ws, os, parameters, m_bias, output_channel_end - output_channel_start ); - pad_left *= -1; - // Construct the input pointer array - fill the array with pointers to - // the input buffer and then fill in the required values. - for (auto i = pad_top; i < m_strat->get_input_rows() - pad_bottom; i++) - { - // Can skip over the left padding because we will have either the - // same or less than the previous tile. - unsigned int j = pad_left; - const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col; - const void **ptrs = inptr_array + i * m_strat->get_input_cols() + j; - for (; j < m_strat->get_input_cols() - pad_right; j++) - { - *(ptrs++) = colptr; - colptr += ld_input_col; - } - for (; j < m_strat->get_input_cols(); j++) - { - *(ptrs++) = input_buffer; - } - } - // Construct the output pointer array. 
- void **outptr_pos = outptr_array; - for (auto i = 0u; i < valid_output_rows; i++) + // Progress the pointers + for (auto i = 0u; i < n_input_pointers; i++) { - unsigned int j = 0u; - TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col; - for (; j < valid_output_cols; j++) - { - *(outptr_pos++) = colptr; - colptr += ld_output_col; - } - for (; j < m_strat->get_output_cols(); j++) - { - *(outptr_pos++) = output_buffer; - } + ws->inptr_array[i] += input_point_stride; } - for (auto i = valid_output_rows; i < m_strat->get_output_rows(); i++) + for (auto i = 0u; i < n_output_pointers; i++) { - for (auto j = 0u; j < m_strat->get_output_cols(); j++) - { - *(outptr_pos++) = output_buffer; - } + ws->outptr_array[i] += output_point_stride; } - - start_out_j += m_strat->get_output_cols(); - -#ifdef CYCLE_PROFILING - // TODO Work number - auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(0)); -#endif - m_strat->indirect_kernel(inptr_array, outptr_array, parameters, - this->m_args.input_channels, - &activation_min, &activation_max); } + + output_i += this->m_strat->get_output_rows(); } } } diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp index f04f7751db..9f53f7cc6f 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,355 +24,277 @@ #pragma once -#include "src/core/NEON/kernels/arm_gemm/utils.hpp" - -#ifdef CYCLE_PROFILING -#include "profiler.hpp" -#endif - -#include +#include "depthwise_depthfirst.hpp" namespace arm_conv { namespace depthwise { -template -class DepthwiseDepthfirstGenericBase : - public DepthwiseCommon +template +struct GenericDepthfirstKernelStrategyFunctionType { - protected: + using KernelType = std::function; +}; - using TInput = typename Strategy::input_type; - using TWeight = typename Strategy::weight_type; - using TOutput = typename Strategy::return_type; - using TAccum = typename Strategy::bias_type; +template +struct GenericDepthfirstKernelStrategyFunctionType +{ + using KernelType = std::function; +}; - size_t sizeof_input_ptr_array(void) const - { - return sizeof(TInput *) * this->m_args.kernel_rows * this->m_args.kernel_cols * Strategy::n_output_points; - } +template +class GenericDepthfirstKernelStrategy +{ + unsigned int m_n_output_points; + arm_gemm::VLType m_vl_type; + unsigned int m_accumulator_depth_vl; - size_t sizeof_input_buffer(unsigned int n_channels) const + public: + GenericDepthfirstKernelStrategy(unsigned int n_output_points, arm_gemm::VLType vl_type, unsigned int accumulator_depth_vl=1) + : m_n_output_points(n_output_points), m_vl_type(vl_type), m_accumulator_depth_vl(accumulator_depth_vl) { - const unsigned int vl = arm_gemm::utils::get_vector_length(Strategy::vl_type); - const auto rounded_channels = arm_gemm::roundup(n_channels, vl); - return sizeof(TInput) * rounded_channels; } - size_t sizeof_output_buffer(unsigned int n_channels) const + virtual ~GenericDepthfirstKernelStrategy() = default; + + virtual arm_gemm::VLType get_vl_type() const { return m_vl_type; } + virtual unsigned int get_accumulator_depth_vl() const { return m_accumulator_depth_vl; } + virtual unsigned int get_n_output_points() const { return m_n_output_points; } + + 
using KernelType = typename GenericDepthfirstKernelStrategyFunctionType::KernelType; + virtual KernelType get_kernel(void) const = 0; +}; + +template ::Type, + typename OutputStage=typename DefaultOutputStage::Type> +class GenericDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommon +{ + protected: + using KernelStrategyType = GenericDepthfirstKernelStrategy; + std::unique_ptr m_strategy; + + public: + GenericDepthfirstStrategy( + KernelStrategyType *strat, unsigned int n_output_rows, unsigned int n_output_cols, + const DepthwiseArgs &args + ) + : DepthwiseDepthfirstStrategyCommon( + n_output_rows, n_output_cols, + args.kernel_rows, args.kernel_cols, + args.stride_rows, args.stride_cols + ), + m_strategy(strat) { - const unsigned int vl = arm_gemm::utils::get_vector_length(Strategy::vl_type); - const auto rounded_channels = arm_gemm::roundup(n_channels, vl); - return sizeof(TOutput) * rounded_channels; } - unsigned int input_rows(void) const + GenericDepthfirstStrategy(GenericDepthfirstStrategy &) = delete; + GenericDepthfirstStrategy operator=(GenericDepthfirstStrategy &) = delete; + + arm_gemm::VLType get_vl_type(void) const override { return m_strategy->get_vl_type(); } + unsigned int get_accumulator_depth_vl(void) const override { return m_strategy->get_accumulator_depth_vl(); } + + size_t get_storage_size(const DepthwiseArgs &args) const override { - return this->m_args.kernel_rows + (OutputRows - 1)*this->m_args.stride_rows; + interleaves::PackingArguments packing_args( + this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight), + false, sizeof(TAccum), // Don't pack the bias + this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(), + [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool + { return this->get_kernel_packing_point(idx, x, y); } + ); + return interleaves::get_storage_size_generic(packing_args, args); } - unsigned int input_cols(void) const + void pack_parameters( + const DepthwiseArgs &args, void *buffer, + const void *biases, const OutputStage &, + const void *weights, size_t ld_weight_col, size_t ld_weight_row + ) const override { - return this->m_args.kernel_cols + (OutputCols - 1)*this->m_args.stride_cols; + interleaves::PackingArguments packing_args( + this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight), + false, sizeof(TAccum), // Don't pack the bias + this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(), + [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool + { return this->get_kernel_packing_point(idx, x, y); } + ); + interleaves::pack_parameters_generic( + packing_args, args, buffer, biases, weights, ld_weight_col, ld_weight_row); } - void execute_tiles( - std::function tile_fn, - std::function initialise_input_buffer, - const unsigned int batches, - const unsigned int input_height, - const unsigned int input_width, - const unsigned int input_channels, - const PaddingValues &padding, - const void *const _input, - const size_t ld_input_col, - const size_t ld_input_row, - const size_t ld_input_batch, - const unsigned int output_height, - const unsigned int output_width, - void *const _output, - const size_t ld_output_col, - const size_t ld_output_row, - const size_t ld_output_batch, - void *const _working_space, - const unsigned int thread_id, - const unsigned int n_threads - ) const + const typename KernelStrategyType::KernelType get_kernel() const { return m_strategy->get_kernel(); } +}; + +// Use a templated function to marshal arguments when executing the kernel. 
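Note that get_storage_size and pack_parameters above construct identical interleaves::PackingArguments, so the size computation and the packing loop cannot drift apart. The size itself follows the rule used by the removed code further below: round the channel count up to the vector length, then charge each rounded channel one weight per kernel point, plus one accumulator-sized element when the bias is packed. An illustrative computation (assumed to mirror interleaves::get_storage_size_generic in the simple case):

#include <cstddef>

size_t packed_storage_estimate(
    unsigned int n_channels, unsigned int vector_length,
    unsigned int kernel_points, size_t sizeof_weight,
    bool pack_bias, size_t sizeof_accum)
{
  // Round channels up to a whole number of vector lengths.
  const size_t rounded = vector_length *
      ((n_channels + vector_length - 1) / vector_length);
  return rounded * (kernel_points * sizeof_weight +
                    (pack_bias ? sizeof_accum : 0));
}

For instance, a 3x3 kernel over 10 fp32 channels with a vector length of 4 packs as 12 * (9 * 4) = 432 bytes when the bias is not packed.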
+template struct DepthwiseDepthfirstGenericKernelCall; + +template <> +struct DepthwiseDepthfirstGenericKernelCall +{ + template + static void execute( + const StratType *strat, const WorkspaceType *ws, const Nothing &, + const TAccum *bias, const void *params, + const unsigned int n_kernel_points, const unsigned int n_output_channels + ) { - static_assert(OutputRows * OutputCols <= Strategy::n_output_points, - "Too many output points for kernel."); - - // Determine what portion of the work to do. - const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads); - const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height); - const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height); - - // Cast input and output pointers into the right types - const TInput *const inptr = static_cast(_input); - TOutput *const outptr = static_cast(_output); - - // Allocate portions of the working space - uint8_t *const working_space = static_cast(_working_space) + this->get_working_size(thread_id, input_channels); - const TInput **const inptr_array = reinterpret_cast(working_space); - TOutput *const output_buffer = reinterpret_cast(working_space + this->sizeof_input_ptr_array()); - TInput *const input_buffer = reinterpret_cast(working_space + this->sizeof_input_ptr_array() + this->sizeof_output_buffer(input_channels * this->m_args.channel_multiplier)); - - // Create an array for the output pointers - TOutput * _outptr_array[Strategy::n_output_points]; - TOutput **const outptr_array = _outptr_array; - - // Initialise the input buffer - initialise_input_buffer(input_buffer, input_channels); - - // For each output tile, construct the requisite set of pointers and call - // into the kernel. - for (unsigned int batch = 0; batch < batches; batch++) - { - // Get batch pointers - const auto inptr_batch = inptr + batch * ld_input_batch; - const auto outptr_batch = outptr + batch * ld_output_batch; - - for (int start_out_i = start_out_height; - start_out_i < end_out_height; - start_out_i += static_cast(OutputRows)) - { - const int end_out_i = std::min(start_out_i + OutputRows, - output_height); - - for (int start_out_j = 0; - start_out_j < static_cast(output_width); - start_out_j += static_cast(OutputCols)) - { - const int end_out_j = std::min(start_out_j + OutputCols, - output_width); - - // Fill the pointer arrays with pointers to the input/output buffers. - for (auto index = 0u; - index < (Strategy::n_output_points * this->m_args.kernel_rows * this->m_args.kernel_cols); - index++) - { - inptr_array[index] = input_buffer; - } - for (auto index = 0u; index < Strategy::n_output_points; index++) - { - outptr_array[index] = output_buffer; - } - - // Construct the pointer arrays together. Note that the input pointer - // array is striped. Since the array has already been filled with - // pointers to the padding array we merely fill in the valid points - // as we get to them. - unsigned int output_index = 0; - auto outptr_row = outptr_batch + start_out_i * ld_output_row + start_out_j * ld_output_col; - for (auto out_i = start_out_i; out_i < end_out_i; out_i++) - { - auto outptr_col = outptr_row; - - // Compute the padding for this row of tiles. 
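DepthwiseDepthfirstGenericKernelCall is the marshalling function promised by the comment above: the primary template is only declared, and each supported output stage provides a specialisation whose execute() forwards exactly the arguments its kernel signature expects. The pattern, stripped to its skeleton (illustrative names and callables):

struct Nothing {};
struct Requantize32 {};

template <class OutputStage>
struct KernelCallModel;  // undefined: an unsupported stage fails to compile

template <>
struct KernelCallModel<Nothing>
{
  template <class Kernel>
  static void execute(Kernel k, const Nothing &, float bias, float min, float max)
  {
    k(bias, min, max);  // unquantized kernels take bias + activation bounds
  }
};

template <>
struct KernelCallModel<Requantize32>
{
  template <class Kernel>
  static void execute(Kernel k, const Requantize32 &qp, float, float, float)
  {
    k(qp);              // quantized kernels take the requantisation parameters
  }
};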
- const int start_in_i = out_i * this->m_args.stride_rows - padding.top; - const int end_in_i = start_in_i + this->m_args.kernel_rows; - const auto pad_top = static_cast(std::max(0, 0 - start_in_i)); - const auto pad_bottom = static_cast(std::max(0, end_in_i - input_height)); - const unsigned int valid_rows = this->m_args.kernel_rows - pad_top - pad_bottom; - - for (auto out_j = start_out_j; out_j < end_out_j; out_j++, output_index++) - { - // Compute the output pointer. - outptr_array[output_index] = outptr_col; - outptr_col += ld_output_col; - - // Compute the padding for this tile. - const int start_in_j = out_j * this->m_args.stride_cols - padding.left; - const int end_in_j = start_in_j + this->m_args.kernel_cols; - const auto pad_left = static_cast(std::max(0, 0 - start_in_j)); - const auto pad_right = static_cast(std::max(0, end_in_j - input_width)); - const unsigned int valid_cols = this->m_args.kernel_cols - pad_left - pad_right; - - // Hence compute the input pointers. - auto input_index = output_index + Strategy::n_output_points * (pad_top * this->m_args.kernel_cols + pad_left); - auto inptr_row = inptr_batch + (start_in_i + pad_top) * ld_input_row + (start_in_j + pad_left) * ld_input_col; - for (auto in_i = 0u; in_i < valid_rows; in_i++) - { - auto inptr_col = inptr_row; - auto input_index_col = input_index; - - for (auto in_j = 0u; in_j < valid_cols; in_j++) - { - inptr_array[input_index_col] = inptr_col; - inptr_col += ld_input_col; - input_index_col += Strategy::n_output_points; - } - - inptr_row += ld_input_row; - input_index += Strategy::n_output_points * this->m_args.kernel_cols; - } - } - - outptr_row += ld_output_row; - } - - tile_fn(inptr_array, outptr_array); - } - } - } + strat->get_kernel()( + ws->inptr_array, + ws->outptr_array, + params, bias, + n_kernel_points, n_output_channels, + ws->activation_min, ws->activation_max + ); } +}; - public: - DepthwiseDepthfirstGenericBase(const DepthwiseArgs &args) : DepthwiseCommon(args) +template <> +struct DepthwiseDepthfirstGenericKernelCall +{ + template + static void execute( + const StratType *strat, const WorkspaceType *ws, const arm_gemm::Requantize32 &qp, + const int32_t *, const void *params, + const unsigned int n_kernel_points, const unsigned int n_output_channels + ) { + strat->get_kernel()( + ws->inptr_array, + ws->outptr_array, + params, qp, + n_kernel_points, n_output_channels + ); } +}; - DepthwiseDepthfirstGenericBase(DepthwiseDepthfirstGenericBase &) = delete; - DepthwiseDepthfirstGenericBase &operator=(DepthwiseDepthfirstGenericBase &) = delete; - size_t get_storage_size(void) const override +/* Workspace Element for an array of input pointers as consumed by the + * "Generic" depthwise kernels. + */ +template +class GenericInputArrayElement +{ + public: + struct Workspace { - const unsigned int vl = arm_gemm::utils::get_vector_length(Strategy::vl_type); - const auto rounded_channels = arm_gemm::roundup(this->m_args.input_channels, vl); - return (this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight); - } + const T **inptr_array; + }; - void pack_parameters(void *_buffer, const void *, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override + template + static size_t get_element_size(const WorkspaceArgs &args) { - // Cast the pointers - TWeight *buffer = static_cast(_buffer); - const TWeight *const weights = static_cast(_weights); - - const unsigned int vl = arm_gemm::utils::get_vector_length(Strategy::vl_type); - ld_weight_col = (ld_weight_col == 0) ? 
this->m_args.input_channels : ld_weight_col; - ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row; - - for (unsigned int n = 0; n < this->m_args.input_channels; n += vl) - { - const unsigned int todo = std::min(vl, this->m_args.input_channels - n); - - // Copy each of the weights in turn - auto weights_row = weights + n; - for (unsigned int i = 0; i < this->m_args.kernel_rows; i++) - { - auto weights_col = weights_row; - - for (unsigned int j = 0; j < this->m_args.kernel_cols; j++) - { - for (unsigned int m = 0; m < todo; m++) - { - buffer[m] = weights_col[m]; - } - buffer += vl; - - weights_col += ld_weight_col; - } - - weights_row += ld_weight_row; - } - } + const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols; + return sizeof(T **) * args.strategy->get_input_rows() * args.strategy->get_input_cols() * kernel_points; } - size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override + template + static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs &args) { - const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier; - return n_threads * (sizeof_input_ptr_array() + - sizeof_output_buffer(n_output_channels) + - sizeof_input_buffer(n_channels)); + ws->inptr_array = reinterpret_cast(buffer); + return reinterpret_cast(buffer) + get_element_size(args); } }; -template -class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstGenericBase +template ::Type, + typename OutputStage=typename DefaultOutputStage::Type> +class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon { - using Parent = DepthwiseDepthfirstGenericBase; - using TInput = typename Parent::TInput; - using TWeight = typename Parent::TWeight; - using TAccum = typename Parent::TAccum; - using TOutput = typename Parent::TOutput; - + using StratType = GenericDepthfirstStrategy; + using Parent = DepthwiseDepthfirstCommon; + using WorkspaceManager = Workspace< + OutputArrayElement, + GenericInputArrayElement, + InputBufferElement, + ActivationsElement + >; + using WorkingSpace = typename WorkspaceManager::WorkspaceType; const TAccum *m_bias = nullptr; public: - DepthwiseDepthfirstGeneric(const DepthwiseArgs &args) : Parent(args) + DepthwiseDepthfirstGeneric(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os={}) + : Parent(strat, args, os) { } DepthwiseDepthfirstGeneric(DepthwiseDepthfirstGeneric &) = delete; DepthwiseDepthfirstGeneric &operator=(DepthwiseDepthfirstGeneric &) = delete; - void pack_parameters(void *buffer, const void *bias, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override + void pack_parameters( + void *buffer, const void *biases, + const void *weights, size_t ld_weight_col, size_t ld_weight_row + ) override + { + Parent::pack_parameters(buffer, biases, weights, ld_weight_col, ld_weight_row); + m_bias = reinterpret_cast(biases); // Get a copy of the biases + depthwise_depthfirst::stash_bias(this->get_output_stage(), m_bias); + } + + size_t get_working_size_per_thread(const unsigned int n_input_channels) const override + { + DepthwiseArgs args(this->m_args); + args.input_channels = n_input_channels; + return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs(this->m_strat.get(), args, this->get_output_stage())); + } + + void initialise_working_space(void *buffer, unsigned int n_input_channels) const override { - m_bias = static_cast(bias); - Parent::pack_parameters(buffer, bias, weights, 
ld_weight_col, ld_weight_row); + DepthwiseArgs args(this->m_args); + args.input_channels = n_input_channels; + return WorkspaceManager::initialise(buffer, WorkspaceArgs(this->m_strat.get(), args, this->get_output_stage())); } - using DepthwiseDepthfirstGenericBase::execute; - void execute( - const unsigned int batches, - const unsigned int input_height, - const unsigned int input_width, - const unsigned int input_channels, - const PaddingValues &padding, - const void *const _input, - const size_t ld_input_col, - const size_t ld_input_row, - const size_t ld_input_batch, - const void *const parameters, - const unsigned int output_height, - const unsigned int output_width, - void *const _output, - const size_t ld_output_col, - const size_t ld_output_row, - const size_t ld_output_batch, - void *const _working_space, - const unsigned int thread_id, - const unsigned int n_threads + protected: + void compute_tile_padded( + unsigned int output_i, unsigned int output_j, + unsigned int channel_start, unsigned int channel_end, + const TensorSpec &input, + const TensorSpec &output, + const void *parameters, + void *working_space_raw ) const override { - Strategy strat(this->m_args.cpu_info); -#ifdef CYCLE_PROFILING - arm_gemm::profiler prof; -#endif - - // Compute activation values - TAccum activation_min, activation_max; - std::tie(activation_min, activation_max) = get_default_activation_values(); - - switch (this->m_args.activation.type) - { - case arm_gemm::Activation::Type::BoundedReLU: - activation_max = static_cast(this->m_args.activation.param1); - // Fall through - case arm_gemm::Activation::Type::ReLU: - activation_min = static_cast(0); - break; - default: - break; - } - - // Create a function to initialise the input buffer - const auto initialise_input_buffer = [] (TInput *const buffer, const unsigned int n) { - std::memset(buffer, 0, n * sizeof(TInput)); - }; - - // Create a function to execute a tile of work - const auto tile_fn = [&] (const TInput *const *const inptrs, TOutput *const * const outptrs) { -#ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler( - PROFILE_KERNEL, - (unsigned long) (OutputRows * OutputCols * this->m_args.kernel_rows* this->m_args.kernel_cols) - ); -#endif - strat.kernel(inptrs, outptrs, parameters, m_bias, - this->m_args.kernel_rows * this->m_args.kernel_cols, - this->m_args.input_channels, activation_min, activation_max); - }; - - // Call into a parent utility function to do the actual work. - Parent::execute_tiles( - tile_fn, initialise_input_buffer, - batches, input_height, input_width, input_channels, padding, - _input, ld_input_col, ld_input_row, ld_input_batch, - output_height, output_width, - _output, ld_output_col, ld_output_row, ld_output_batch, - _working_space, thread_id, n_threads + // Get the working space + WorkingSpace *ws = reinterpret_cast(working_space_raw); + + const int ii = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; + const auto input_pad_top = static_cast(ii < 0 ? -ii : 0); + const auto input_i = static_cast(ii < 0 ? 0 : ii); + + const int ij = static_cast(output_j * this->m_args.stride_cols) - this->m_args.padding.left; + const auto input_pad_left = static_cast(ij < 0 ? -ij : 0); + const auto input_j = static_cast(ij < 0 ? 
0 : ij); + + fill_pointer_array_generic_kernel( + ws->inptr_array, + this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), + this->m_args.kernel_rows, this->m_args.kernel_cols, + this->m_args.stride_rows, this->m_args.stride_cols, + input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start, + input.ld_row, input.ld_col, + ws->input_buffer, + input_pad_top, this->m_args.input_rows - input_i, + input_pad_left, this->m_args.input_cols - input_j + ); + + // Compute the output pointer array + fill_pointer_array( + ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), + output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start, + output.ld_row, output.ld_col, + ws->output_buffer, + 0, this->m_args.output_rows - output_i, // Top padding, # valid rows + 0, this->m_args.output_cols - output_j // Left padding, # valid columns + ); + + // Execute the kernel + DepthwiseDepthfirstGenericKernelCall::execute( + reinterpret_cast(this->m_strat.get()), ws, + this->get_output_stage(), m_bias, parameters, + this->m_args.kernel_rows * this->m_args.kernel_cols, + channel_end - channel_start ); } }; diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp index 2862361b82..e58467b0f4 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,8 @@ #pragma once -#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "depthwise_depthfirst.hpp" +#include "interleaves/generic_quantized_dot_product.hpp" #ifdef CYCLE_PROFILING #include "profiler.hpp" @@ -35,492 +36,559 @@ namespace arm_conv { namespace depthwise { -namespace common +template +class DepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon { - template - void depthwise_multiplier_execute( - const F execute_tile, - typename strategy::input_type pad_value, - const DepthwiseArgs &args, - const unsigned int batches, - const unsigned int input_height, - const unsigned int input_width, - const unsigned int input_channels, - const PaddingValues &padding, - const void *const _input, - const size_t ld_input_col, - const size_t ld_input_row, - const size_t ld_input_batch, - const void *const parameters, - const size_t param_stride, - const unsigned int output_height, - const unsigned int output_width, - void *const _output, - const size_t ld_output_col, - const size_t ld_output_row, - const size_t ld_output_batch, - void *const _working_space, - const unsigned int thread_id, - const unsigned int n_threads - ) - { - using TInput = typename strategy::input_type; - using TOutput = typename strategy::return_type; - - // Determine what portion of the work to do. - const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads); - const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height); - const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height); - - // Cast input and output pointers into the right types - const TInput *const inptr = static_cast(_input); - TOutput *const outptr = static_cast(_output); - - // To simplify the kernel, we process padded or non-NCHW-ordered input into - // a form which can be consumed by the kernel. 
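The comment above, from the removed common helper, describes the scheme that the new code keeps: multiplier kernels consume one pointer per input row, so padding can be expressed by pointing padded rows at a shared buffer rather than copying. A sketch of that row-pointer preparation (assumed semantics, not the library's fill_nchw_patch_array):

#include <cstddef>

template <typename T>
void prepare_input_rows_sketch(
    const T **input_rows, unsigned int n_rows,
    const T *base, size_t ld_row,   // first valid row of the window
    const T *padding_row,
    unsigned int pad_top, unsigned int n_valid_rows)
{
  for (unsigned int i = 0; i < n_rows; i++)
  {
    const bool valid = i >= pad_top && (i - pad_top) < n_valid_rows;
    input_rows[i] = valid ? base + (i - pad_top) * ld_row : padding_row;
  }
}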
This data is stored here and - // passed into the kernel as an array of N pointers (one per row of the - // input). - TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*(16 / sizeof(TInput))]; - const TInput *inptrs[strategy::input_rows]; - - // Create an array for the output pointers - TOutput * _outptr_array[strategy::output_rows * strategy::output_cols]; - TOutput **const outptr_array = _outptr_array; - - // Allocate portions of the working space - uint8_t *const working_space = static_cast(_working_space); - TOutput *const output_buffer = reinterpret_cast(working_space); - - // For each output tile, construct the requisite set of pointers and call - // into the kernel. - for (unsigned int batch = 0; batch < batches; batch++) - { - // Get batch pointers - const auto inptr_batch = inptr + batch * ld_input_batch; - const auto outptr_batch = outptr + batch * ld_output_batch; + using Parent = DepthwiseDepthfirstStrategyCommon; - for (int start_out_i = start_out_height; - start_out_i < end_out_height; - start_out_i += static_cast(strategy::output_rows)) + protected: + virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const + { + return interleaves::PackingArguments( + args.kernel_rows, args.kernel_cols, sizeof(TWeight), + true, sizeof(TAccum), + this->get_vl_type(), + sizeof(TAccum), 1, + [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool { - const int end_out_i = start_out_i + strategy::output_rows; - const int start_in_i = start_out_i * strategy::stride_rows - padding.top; - const int end_in_i = start_in_i + strategy::input_rows; - - // Compute top/bottom padding - const auto pad_top = static_cast(-std::min(start_in_i, 0)); - const auto pad_bottom = static_cast(-std::min(static_cast(input_height) - end_in_i, 0)); - const unsigned int valid_output_rows = std::min( - end_out_i - start_out_i, - static_cast(output_height) - start_out_i - ); - - for (int start_out_j = 0; start_out_j < static_cast(output_width);) + if (pos < args.kernel_rows * args.kernel_cols) { - const int start_in_j = start_out_j * strategy::stride_cols - args.padding.left; - const int pad_left = -std::min(0, start_in_j); - - const int end_out_j = start_out_j + strategy::output_cols; - const int end_in_j = start_in_j + strategy::input_cols; - - const auto pad_right = static_cast(-std::min(static_cast(input_width) - end_in_j, 0)); - const unsigned int valid_output_cols = std::min( - end_out_j - start_out_j, - static_cast(output_width) - start_out_j - ); - - // Construct the output pointer array. - TOutput **outptr_pos = outptr_array; - for (auto i = 0u; i < valid_output_rows; i++) - { - unsigned int j = 0u; - TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col; - for (; j < valid_output_cols; j++) - { - *(outptr_pos++) = colptr; - colptr += ld_output_col; - } - for (; j < strategy::output_cols; j++) - { - *(outptr_pos++) = output_buffer; - } - } - for (auto i = valid_output_rows; i < strategy::output_rows; i++) - { - for (auto j = 0u; j < strategy::output_cols; j++) - { - *(outptr_pos++) = output_buffer; - } - } - - start_out_j += strategy::output_cols; - - const uint8_t *params = static_cast(parameters); - - // Loop over the input channels - for (unsigned int in_c = 0; in_c < input_channels; in_c++) - { - // Construct the input array - first fill with padding values and - // then fill in correct values. 
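The packing lambda above decodes a linear weight index into a kernel coordinate in row-major order, returning false once every kernel point has been visited. Written out as a free function:

// Row-major decode of a linear kernel-point index, mirroring the lambda
// passed to interleaves::PackingArguments above.
bool kernel_packing_point(
    unsigned int pos, unsigned int kernel_rows, unsigned int kernel_cols,
    unsigned int &x, unsigned int &y)
{
  if (pos < kernel_rows * kernel_cols)
  {
    x = pos / kernel_cols;  // kernel row
    y = pos % kernel_cols;  // kernel column
    return true;
  }
  return false;  // no more points to pack
}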
- for (unsigned int i = 0; i < strategy::input_rows; i++) - { - for (unsigned int j = 0; - j < (16 / sizeof(TInput)) * strategy::input_col_quads; j++) - { - rearranged_input[i][j] = pad_value; - } - inptrs[i] = rearranged_input[i]; - } - - auto inptr_row = inptr_batch + in_c + - (start_in_i + pad_top) * ld_input_row + - (start_in_j + pad_left) * ld_input_col; - if (ld_input_col == 1 && !pad_left && - start_in_j + (16 / sizeof(TInput)) * strategy::input_col_quads < input_width) - { - // The input tensor is already in NCHW format, and we're reading - // an unpadded section of it - allow the kernel to read it - // directly. - for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++) - { - inptrs[i] = inptr_row; - inptr_row += ld_input_row; - } - } - else - { - // Either the input tensor isn't in NCHW format, or we're reading - // a padded section. Copy the relevant portion of the input here - // and allow the kernel to read this. - for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++) - { - auto inptr_col = inptr_row; - for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++) - { - rearranged_input[i][j] = *inptr_col; - inptr_col += ld_input_col; - } - inptr_row += ld_input_row; - } - } - - execute_tile(inptrs, outptr_array, params); - - // Progress the output pointers - TOutput **outptr_pos = outptr_array; - for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++) - { - outptr_pos[i] += args.channel_multiplier; - } - - // Progress the pointer into the parameters - params += param_stride; - } + y = pos % args.kernel_cols; + x = pos / args.kernel_cols; + return true; } + return false; } - } + ); } -} -template -class DepthwiseDepthfirstWithMultiplier : - public DepthwiseCommon -{ - using TInput = typename strategy::input_type; - using TWeight = typename strategy::weight_type; - using TOutput = typename strategy::return_type; - using TAccum = typename strategy::bias_type; + public: + using Parent::Parent; + + size_t get_storage_size(const DepthwiseArgs &args) const override + { + return interleaves::get_storage_size_generic(this->get_packing_args(args), args); + } - size_t sizeof_output_buffer(unsigned int n_channels) const + void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const Nothing &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override { - const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); - const auto rounded_channels = arm_gemm::roundup(n_channels, vl); - return sizeof(TOutput) * rounded_channels; + interleaves::pack_parameters_generic( + this->get_packing_args(args), args, + buffer, biases, weights, ld_weight_col, ld_weight_row + ); } + using KernelType = std::function; + virtual KernelType get_kernel(void) const = 0; +}; + + +template +class DepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon +{ + using Parent = DepthwiseDepthfirstStrategyCommon; + public: - DepthwiseDepthfirstWithMultiplier(const DepthwiseArgs &args) : DepthwiseCommon(args) + using Parent::Parent; + + size_t get_storage_size(const DepthwiseArgs &args) const override { + return interleaves::quantized::get_storage_size(args, this->get_vl_type(), this->get_accumulator_depth_vl()); } - DepthwiseDepthfirstWithMultiplier(DepthwiseDepthfirstWithMultiplier &) = delete; - DepthwiseDepthfirstWithMultiplier &operator=(DepthwiseDepthfirstWithMultiplier &) = delete; + void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, 
const arm_gemm::Requantize32 &qp, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override + { + interleaves::quantized::pack_parameters( + buffer, reinterpret_cast(biases), + reinterpret_cast(weights), ld_weight_col, ld_weight_row, + args, qp, this->get_vl_type(), this->get_accumulator_depth_vl() + ); + } - size_t get_storage_size(void) const override + using KernelType = std::function; + virtual KernelType get_kernel(void) const = 0; +}; + + +template +class GenericDepthfirstMultiplierKernelStrategy +{ + const arm_gemm::VLType m_vl_type; + const unsigned int m_output_rows, m_output_cols; + + public: + GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type) + : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols) { - // TODO What if we insert extra padding? Biases are a different size to the inputs, ... - const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); - const auto rounded_channels = this->m_args.input_channels * arm_gemm::roundup(this->m_args.channel_multiplier, vl); - return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels * sizeof(TWeight); } - void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override + virtual ~GenericDepthfirstMultiplierKernelStrategy() = default; + + arm_gemm::VLType get_vl_type(void) const { return m_vl_type; } + unsigned int get_output_rows(void) const { return m_output_rows; } + unsigned int get_output_cols(void) const { return m_output_cols; } + + using KernelType = std::function; + virtual KernelType get_kernel(void) const = 0; +}; + +template +class GenericDepthfirstMultiplierKernelStrategy +{ + const arm_gemm::VLType m_vl_type; + const unsigned int m_output_rows, m_output_cols; + + public: + GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type) + : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols) { - // TODO What if the kernel needs a different packing function? + } - // Cast the pointers - float *buffer = static_cast(_buffer); - const float *biases = static_cast(_biases); - const float *const weights = static_cast(_weights); + virtual ~GenericDepthfirstMultiplierKernelStrategy() = default; + + arm_gemm::VLType get_vl_type(void) const { return m_vl_type; } + unsigned int get_output_rows(void) const { return m_output_rows; } + unsigned int get_output_cols(void) const { return m_output_cols; } + + using KernelType = std::function; + virtual KernelType get_kernel(void) const = 0; +}; - const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); - ld_weight_col = (ld_weight_col == 0) ? this->m_args.channel_multiplier * this->m_args.input_channels : ld_weight_col; - ld_weight_row = (ld_weight_row == 0) ? 
this->m_args.kernel_cols * ld_weight_col : ld_weight_row; +template ::Type, + typename OutputStage=typename DefaultOutputStage::Type> +class GenericDepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon +{ + using KernelStrategyType = GenericDepthfirstMultiplierKernelStrategy; + std::unique_ptr m_kern; - for (unsigned int in_c = 0; in_c < this->m_args.input_channels; in_c++) - { - for (unsigned int n = 0; n < this->m_args.channel_multiplier; n += vl) + protected: + virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const + { + return interleaves::PackingArguments( + args.kernel_rows, args.kernel_cols, sizeof(TWeight), + false, sizeof(TAccum), + this->get_vl_type(), + sizeof(TAccum), 1, + [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool { - const unsigned int out_c = in_c * this->m_args.channel_multiplier + n; - const unsigned int todo = std::min(vl, this->m_args.channel_multiplier - n); - - // Copy across the correct amount of bias (or 0) - for (unsigned int i = 0; i < todo; i++) + if (pos < args.kernel_rows * args.kernel_cols) { - buffer[i] = (biases == nullptr) ? 0 : biases[out_c + i]; + y = pos % args.kernel_cols; + x = pos / args.kernel_cols; + return true; } - buffer += vl; + return false; + } + ); + } - // Copy each of the weights in turn - auto weights_row = weights + out_c; - for (unsigned int i = 0; i < this->m_args.kernel_rows; i++) - { - auto weights_col = weights_row; + public: + GenericDepthfirstMultiplierStrategy(KernelStrategyType *kern, const DepthwiseArgs &args) + : DepthwiseDepthfirstStrategyCommon( + kern->get_output_rows(), kern->get_output_cols(), + args.kernel_rows, args.kernel_cols, + args.stride_rows, args.stride_cols + ), + m_kern(kern) + { + }; - for (unsigned int j = 0; j < this->m_args.kernel_cols; j++) - { - for (unsigned int m = 0; m < todo; m++) - { - buffer[m] = weights_col[m]; - } - buffer += vl; + arm_gemm::VLType get_vl_type(void) const override { return m_kern->get_vl_type(); } + const typename KernelStrategyType::KernelType get_kernel(void) const { return m_kern->get_kernel(); } - weights_col += ld_weight_col; - } + size_t get_storage_size(const DepthwiseArgs &args) const override + { + return interleaves::get_storage_size_generic(this->get_packing_args(args), args); + } - weights_row += ld_weight_row; - } - } - } + void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override + { + interleaves::pack_parameters_generic( + this->get_packing_args(args), args, + buffer, biases, weights, ld_weight_col, ld_weight_row + ); } +}; + +// Specialise elements of the wrapper based on the type of kernel. +namespace depthfirst_multiplier { - size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override +/* Working space element which contains a pointer for each row of input, a row + * of padding, and a space which can be used to construct an NCHW-ordered patch + * of input. 
+ */ +template +class InputPatchElement +{ + public: + struct Workspace { - const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier; - return n_threads * sizeof_output_buffer(n_output_channels); + constexpr static bool InputPatchIsGeneric = IsGeneric; + const T **input_rows; + T *input_padding; + T *input_patch; + }; + + static size_t get_element_size(const WorkspaceArgs &args) + { + return sizeof_input_rows(args) + sizeof_input_padding(args) + sizeof_input_patch(args); } - - using DepthwiseCommon::execute; - void execute( - const unsigned int batches, - const unsigned int input_height, - const unsigned int input_width, - const unsigned int input_channels, - const PaddingValues &padding, - const void *const _input, - const size_t ld_input_col, - const size_t ld_input_row, - const size_t ld_input_batch, - const void *const parameters, - const unsigned int output_height, - const unsigned int output_width, - void *const _output, - const size_t ld_output_col, - const size_t ld_output_row, - const size_t ld_output_batch, - void *const _working_space, - const unsigned int thread_id, - const unsigned int n_threads - ) const override + + template + static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs &args) { - strategy strat(this->m_args.cpu_info); -#ifdef CYCLE_PROFILING - arm_gemm::profiler prof; -#endif + auto buffer_bytes = reinterpret_cast(buffer); + + ws->input_rows = reinterpret_cast(buffer_bytes); + buffer_bytes += sizeof_input_rows(args); + + ws->input_padding = reinterpret_cast(buffer_bytes); + buffer_bytes += sizeof_input_padding(args); + + ws->input_patch = reinterpret_cast(buffer_bytes); + buffer_bytes += sizeof_input_patch(args); - // Compute activation values - TAccum activation_min = std::numeric_limits::has_infinity ? -std::numeric_limits::infinity() : std::numeric_limits::min(); - TAccum activation_max = std::numeric_limits::has_infinity ? std::numeric_limits::infinity() : std::numeric_limits::max(); + // Initialise the padding + memset(ws->input_padding, + get_input_buffer_fill_value(args.output_stage), + sizeof_input_padding(args)); - switch (this->m_args.activation.type) + return buffer_bytes; + } + + protected: + static size_t sizeof_input_rows(const WorkspaceArgs &args) + { + if (IsGeneric) + { + return sizeof(T *) * args.strategy->get_output_rows() * args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols; + } + else { - case arm_gemm::Activation::Type::BoundedReLU: - activation_max = static_cast(this->m_args.activation.param1); - // Fall through - case arm_gemm::Activation::Type::ReLU: - activation_min = static_cast(0); - break; - default: - break; + return sizeof(T *) * args.strategy->get_input_rows(); } + } + + static size_t sizeof_input_padding(const WorkspaceArgs &args) + { + // Round-up the number of columns to be a whole number of QUADS + auto input_cols = arm_gemm::roundup(args.strategy->get_input_cols(), 16 / sizeof(T)); + return sizeof(T) * input_cols; + } - // Determine what portion of the work to do. 
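The sizeof_input_padding and sizeof_input_patch helpers above round column counts up to whole 16-byte quads so that each patch row can be read with full vector loads. The arithmetic, spelled out (valid for element types whose size divides 16):

#include <cstddef>

// Mirrors the arm_gemm::roundup(cols, 16 / sizeof(T)) calls above.
template <typename T>
size_t padded_row_elements(unsigned int n_cols)
{
  const unsigned int per_quad = 16 / sizeof(T);  // elements per 16-byte quad
  return per_quad * ((n_cols + per_quad - 1) / per_quad);
}

For example, padded_row_elements<float>(9) is 12: a nine-column row of 4-byte elements is padded to three quads.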
- const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads); - const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height); - const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height); - - // Need a stride over blocks of parameters - const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); - const unsigned int param_stride = - arm_gemm::roundup(this->m_args.channel_multiplier, vl) * - (sizeof(TAccum) + sizeof(TWeight) * strategy::kernel_rows * strategy::kernel_cols); - - // Cast input and output pointers into the right types - const TInput *const inptr = static_cast(_input); - TOutput *const outptr = static_cast(_output); - - // To simplify the kernel, we process padded or non-NCHW-ordered input into - // a form which can be consumed by the kernel. This data is stored here and - // passed into the kernel as an array of N pointers (one per row of the - // input). - TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*4]; - const TInput *inptrs[strategy::input_rows]; - - // Create an array for the output pointers - TOutput * _outptr_array[strategy::output_rows * strategy::output_cols]; - TOutput **const outptr_array = _outptr_array; - - // Allocate portions of the working space - uint8_t *const working_space = static_cast(_working_space) + get_working_size(thread_id, input_channels); - TOutput *const output_buffer = reinterpret_cast(working_space); - - // For each output tile, construct the requisite set of pointers and call - // into the kernel. - for (unsigned int batch = 0; batch < batches; batch++) + static size_t sizeof_input_patch(const WorkspaceArgs &args) + { + if (IsGeneric) + { + // Round-up the number of columns to be a whole number of QUADS + auto output_cols = arm_gemm::roundup(args.strategy->get_output_cols(), 16 / sizeof(T)); + const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols; + return sizeof(T) * kernel_points * args.strategy->get_output_rows() * output_cols; + } + else { - // Get batch pointers - const auto inptr_batch = inptr + batch * ld_input_batch; - const auto outptr_batch = outptr + batch * ld_output_batch; + // Round-up the number of columns to be a whole number of QUADS + auto input_cols = arm_gemm::roundup(args.strategy->get_input_cols(), 16 / sizeof(T)); + return sizeof(T) * args.strategy->get_input_rows() * input_cols; + } + } +}; + +template +struct StrategyType +{ + using Type = DepthfirstMultiplierStrategy; + + template + static void execute( + const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, + const OutputStage &, const unsigned int, + const void *parameters, const void * + ) + { + strat->get_kernel()( + ws->input_rows, + ws->outptr_array, + parameters, args.channel_multiplier, + ws->activation_min, ws->activation_max + ); + } +}; - for (int start_out_i = start_out_height; - start_out_i < end_out_height; - start_out_i += static_cast(strategy::output_rows)) +template +struct StrategyType +{ + using Type = GenericDepthfirstMultiplierStrategy; + + template + static void execute( + const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, + const OutputStage &, const unsigned int start_output_channel, + const void *parameters, const void *bias + ) + { + strat->get_kernel()( + ws->input_rows, ws->outptr_array, + reinterpret_cast(parameters), + bias == nullptr ? 
nullptr : reinterpret_cast(bias) + start_output_channel, + strat->get_kernel_rows() * strat->get_kernel_cols(), + args.channel_multiplier, + ws->activation_min, ws->activation_max + ); + } +}; + +template +struct StrategyType +{ + using Type = DepthfirstMultiplierStrategy; + + template + static void execute( + const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, + const arm_gemm::Requantize32 &qp, const unsigned int, + const void *parameters, const void * + ) + { + strat->get_kernel()( + ws->input_rows, + ws->outptr_array, + parameters, args.channel_multiplier, + qp + ); + } +}; + +template +struct StrategyType +{ + using Type = GenericDepthfirstMultiplierStrategy; + + template + static void execute( + const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, + const arm_gemm::Requantize32 &qp, const unsigned int start_output_channel, + const void *parameters, const void * + ) + { + auto get_ptr = [start_output_channel] (const int32_t *ptr) -> const int32_t * + { + return ptr == nullptr ? nullptr : ptr + start_output_channel; + }; + + strat->get_kernel()( + ws->input_rows, ws->outptr_array, + reinterpret_cast(parameters), + get_ptr(qp.bias), + strat->get_kernel_rows() * strat->get_kernel_cols(), + args.channel_multiplier, + get_ptr(qp.per_channel_left_shifts), + get_ptr(qp.per_channel_muls), + get_ptr(qp.per_channel_right_shifts), + qp + ); + } +}; + +template struct PrepareInputSample; + +template <> struct PrepareInputSample +{ + template + static void execute( + const DepthwiseArgs &, WorkspaceType *ws, const StrategyType *strat, + T *base_ptr, size_t ld_row, size_t ld_col, + const unsigned int input_pad_top, const unsigned int valid_rows, + const unsigned int input_pad_left, const unsigned int valid_cols + ) + { + fill_nchw_patch_array( + ws->input_rows, ws->input_patch, strat->get_input_rows(), strat->get_input_cols(), + base_ptr, ld_row, ld_col, + ws->input_padding, + input_pad_top, valid_rows, + input_pad_left, valid_cols + ); + } +}; + +template <> struct PrepareInputSample +{ + template + static void execute( + const DepthwiseArgs &args, WorkspaceType *ws, const StrategyType *strat, + T *base_ptr, size_t ld_row, size_t ld_col, + const unsigned int input_pad_top, const unsigned int valid_rows, + const unsigned int input_pad_left, const unsigned int valid_cols + ) + { + fill_patch_array_generic_kernel( + ws->input_rows, ws->input_patch, + strat->get_output_rows(), strat->get_output_cols(), + args.kernel_rows, args.kernel_cols, + args.stride_rows, args.stride_cols, + base_ptr, ld_row, ld_col, + ws->input_padding, + input_pad_top, valid_rows, + input_pad_left, valid_cols + ); + } +}; + +} // namespace depthfirst_multiplier + +template ::Type, + bool is_generic=false, + typename OutputStage=typename DefaultOutputStage::Type> +class DepthwiseDepthfirstMultiplier : public DepthfirstDriver +{ + protected: + using StratType = typename depthfirst_multiplier::StrategyType::Type; + using WorkspaceManager = Workspace< + OutputArrayElement, + depthfirst_multiplier::InputPatchElement, + ActivationsElement + >; + using WorkingSpace = typename WorkspaceManager::WorkspaceType; + + OutputStage m_os; // Copy of the output parameters + const void *m_bias = nullptr; // Copy of the bias (should we need it) + + public: + DepthwiseDepthfirstMultiplier(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {}) + : DepthfirstDriver(strat, args), m_os(os) + { + } + + DepthwiseDepthfirstMultiplier(DepthwiseDepthfirstMultiplier &) = delete; + 
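The get_ptr lambda above applies the same guard to every optional per-channel array: offset it to the first output channel of the current step, but leave absent arrays null. As a free function:

#include <cstdint>

// Mirrors the get_ptr lambda: offset an optional per-channel array, leaving
// null (absent) arrays null.
const int32_t *offset_or_null(const int32_t *ptr, unsigned int start_channel)
{
  return ptr == nullptr ? nullptr : ptr + start_channel;
}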
DepthwiseDepthfirstMultiplier &operator=(DepthwiseDepthfirstMultiplier &) = delete; + + size_t get_storage_size(void) const override + { + return reinterpret_cast(this->m_strat.get()) + ->get_storage_size(this->m_args); + } + + void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override + { + reinterpret_cast(this->m_strat.get()) + ->pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row); + m_bias = biases; + depthwise_depthfirst::stash_bias(m_os, biases); + } + + size_t get_working_size_per_thread(const unsigned int n_input_channels) const override + { + DepthwiseArgs args(this->m_args); + args.input_channels = n_input_channels; + return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs(this->m_strat.get(), args, m_os)); + } + + void initialise_working_space(void *buffer, unsigned int n_input_channels) const override + { + DepthwiseArgs args(this->m_args); + args.input_channels = n_input_channels; + return WorkspaceManager::initialise(buffer, WorkspaceArgs(this->m_strat.get(), args, m_os)); + } + + void compute_tile_padded( + unsigned int output_i, unsigned int output_j, + unsigned int output_channel_start, unsigned int output_channel_end, + const TensorSpec &input, + const TensorSpec &output, + const void *parameters, + void *working_space_raw + ) const override + { + // Get the working space + auto ws = reinterpret_cast(working_space_raw); + + const int ii = static_cast(output_i * this->m_args.stride_rows) - this->m_args.padding.top; + const auto input_pad_top = static_cast(ii < 0 ? -ii : 0); + const auto input_i = static_cast(ii < 0 ? 0 : ii); + + const int ij = static_cast(output_j * this->m_args.stride_cols) - this->m_args.padding.left; + const auto input_pad_left = static_cast(ij < 0 ? -ij : 0); + const auto input_j = static_cast(ij < 0 ? 0 : ij); + + // Compute the output pointer array. We'll update this array after every + // invocation of the kernel. 
+ fill_pointer_array( + ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), + output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start, + output.ld_row, output.ld_col, + ws->output_buffer, + 0, this->m_args.output_rows - output_i, // Top padding, # valid rows + 0, this->m_args.output_cols - output_j // Left padding, # valid columns + ); + + // Compute the parameter stride + DepthwiseArgs single_iter(this->m_args); + single_iter.input_channels = 1; + const size_t parameter_stride = reinterpret_cast(this->m_strat.get()) + ->get_storage_size(single_iter); + + for (; output_channel_start < output_channel_end; + output_channel_start += this->m_args.channel_multiplier) + { + // Compute the input pointer array + const auto input_channel = output_channel_start / this->m_args.channel_multiplier; + + // Construct the input patch + depthfirst_multiplier::PrepareInputSample::execute( + this->m_args, ws, this->m_strat.get(), + input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col, + input_pad_top, this->m_args.input_rows - input_i, + input_pad_left, this->m_args.input_cols - input_j + ); + + // Execute the kernel + depthfirst_multiplier::StrategyType::execute( + this->m_args, ws, reinterpret_cast(this->m_strat.get()), m_os, output_channel_start, + parameters, m_bias + ); + + // Update the output pointers + for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++) { - const int end_out_i = start_out_i + strategy::output_rows; - const int start_in_i = start_out_i * strategy::stride_rows - padding.top; - const int end_in_i = start_in_i + strategy::input_rows; - - // Compute top/bottom padding - const auto pad_top = static_cast(-std::min(start_in_i, 0)); - const auto pad_bottom = static_cast(-std::min(static_cast(input_height) - end_in_i, 0)); - const unsigned int valid_output_rows = std::min( - end_out_i - start_out_i, - static_cast(output_height) - start_out_i - ); - - for (int start_out_j = 0; start_out_j < static_cast(output_width);) - { - const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left; - const int pad_left = -std::min(0, start_in_j); - - const int end_out_j = start_out_j + strategy::output_cols; - const int end_in_j = start_in_j + strategy::input_cols; - - const auto pad_right = static_cast(-std::min(static_cast(input_width) - end_in_j, 0)); - const unsigned int valid_output_cols = std::min( - end_out_j - start_out_j, - static_cast(output_width) - start_out_j - ); - - // Construct the output pointer array. - TOutput **outptr_pos = outptr_array; - for (auto i = 0u; i < valid_output_rows; i++) - { - unsigned int j = 0u; - TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col; - for (; j < valid_output_cols; j++) - { - *(outptr_pos++) = colptr; - colptr += ld_output_col; - } - for (; j < strategy::output_cols; j++) - { - *(outptr_pos++) = output_buffer; - } - } - for (auto i = valid_output_rows; i < strategy::output_rows; i++) - { - for (auto j = 0u; j < strategy::output_cols; j++) - { - *(outptr_pos++) = output_buffer; - } - } - - start_out_j += strategy::output_cols; - - const uint8_t *params = static_cast(parameters); - - // Loop over the input channels - for (unsigned int in_c = 0; in_c < input_channels; in_c++) - { - // Construct the input array - first fill with padding values and - // then fill in correct values. 
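The channel loop above advances three things in lock-step for every input channel: the output-channel window by channel_multiplier, the output pointer array by the same amount, and the packed parameters by the storage a one-channel problem would need, which is obtained by querying the strategy with input_channels set to 1. Its bookkeeping, reduced to a skeleton with illustrative names:

#include <cstddef>
#include <cstdint>

void run_multiplier_tile_sketch(
    unsigned int n_input_channels, unsigned int channel_multiplier,
    const uint8_t *parameters, size_t parameter_stride,
    void (*run_kernel)(const void *params, unsigned int out_channel_start))
{
  unsigned int out_channel = 0;
  for (unsigned int c = 0; c < n_input_channels; c++)
  {
    run_kernel(parameters, out_channel);  // one kernel call per input channel
    out_channel += channel_multiplier;    // the kernel wrote this many outputs
    parameters += parameter_stride;       // next channel's packed weights/bias
  }
}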
- for (unsigned int i = 0; i < strategy::input_rows; i++) - { - for (unsigned int j = 0; j < 4 * strategy::input_col_quads; j++) - { - rearranged_input[i][j] = static_cast(0); - } - inptrs[i] = rearranged_input[i]; - } - - auto inptr_row = inptr_batch + in_c + - (start_in_i + pad_top) * ld_input_row + - (start_in_j + pad_left) * ld_input_col; - if (ld_input_col == 1 && !pad_left && - start_in_j + 4 * strategy::input_col_quads < input_width) - { - // The input tensor is already in NCHW format, and we're reading - // an unpadded section of it - allow the kernel to read it - // directly. - for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++) - { - inptrs[i] = inptr_row; - inptr_row += ld_input_row; - } - } - else - { - // Either the input tensor isn't in NCHW format, or we're reading - // a padded section. Copy the relevant portion of the input here - // and allow the kernel to read this. - for (unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++) - { - auto inptr_col = inptr_row; - for (unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++) - { - rearranged_input[i][j] = *inptr_col; - inptr_col += ld_input_col; - } - inptr_row += ld_input_row; - } - } - - { -#ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.channel_multiplier * strategy::kernel_rows * strategy::kernel_cols)); -#endif - strat.kernel( - inptrs, outptr_array, params, - this->m_args.channel_multiplier, - activation_min, activation_max - ); - } - - // Progress the output pointers - TOutput **outptr_pos = outptr_array; - for (auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++) - { - outptr_pos[i] += this->m_args.channel_multiplier; - } - - // Progress the pointer into the parameters - params += param_stride; - } - } + ws->outptr_array[n] += this->m_args.channel_multiplier; } + + // Progress the parameters + parameters = reinterpret_cast(parameters) + parameter_stride; } } }; diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp index 934272a0ac..6b100d9d61 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -28,7 +28,6 @@ #include "depthwise_depthfirst.hpp" #include "depthwise_depthfirst_generic.hpp" #include "depthwise_depthfirst_multiplier.hpp" -#include "depthwise_depthfirst_generic_multiplier.hpp" #include "depthwise_implementation_constraints.hpp" @@ -43,6 +42,7 @@ #include "kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" #include "kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" #endif // defined(ARM_COMPUTE_ENABLE_SVE) +#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #include "kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp" #include "kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp" #include "kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" @@ -50,6 +50,7 @@ #include "kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" #include "kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp" #include "kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp" +#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #endif // defined(__aarch64__) namespace arm_conv { @@ -70,15 +71,11 @@ namespace } #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) __attribute__ ((unused)); unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) { return std::numeric_limits::max(); } - - unsigned int not_preferred_if_no_multiplier(const DepthwiseArgs &args, const Nothing &) - { - return args.channel_multiplier > 1 ? 0 : std::numeric_limits::max(); - } #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) } @@ -94,7 +91,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { auto strat = new sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args); + return new DepthwiseDepthfirst<__fp16>(strat, args); }, }, { @@ -106,7 +103,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { auto strat = new sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args); + return new DepthwiseDepthfirst<__fp16>(strat, args); }, }, { @@ -118,7 +115,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { auto strat = new sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args); + return new DepthwiseDepthfirst<__fp16>(strat, args); }, }, { @@ -130,7 +127,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { auto strat = new sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args); + return new DepthwiseDepthfirst<__fp16>(strat, args); }, }, { @@ -142,11 +139,11 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = cycle_estimate, [] (const DepthwiseArgs &args, 
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { auto strat = new sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args); + return new DepthwiseDepthfirst<__fp16>(strat, args); }, }, #endif // defined(ARM_COMPUTE_ENABLE_SVE) -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { DepthwiseMethod::DEPTHFIRST, "a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst", @@ -156,7 +153,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { auto strat = new a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args); + return new DepthwiseDepthfirst<__fp16>(strat, args); }, }, { @@ -168,7 +165,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { auto strat = new a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args); + return new DepthwiseDepthfirst<__fp16>(strat, args); }, }, { @@ -180,7 +177,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { auto strat = new a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args); + return new DepthwiseDepthfirst<__fp16>(strat, args); }, }, { @@ -192,7 +189,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { auto strat = new a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args); + return new DepthwiseDepthfirst<__fp16>(strat, args); }, }, { @@ -204,7 +201,7 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { auto strat = new a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst<__fp16, __fp16, __fp16, __fp16>(strat, args); + return new DepthwiseDepthfirst<__fp16>(strat, args); }, }, { @@ -213,19 +210,23 @@ static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = constraint(has_no_channel_multiplier, cpu_has_fp16), not_preferred, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { - return new DepthwiseDepthfirstGeneric(args); + auto kern = new a64_fp16_nhwc_generic_output9_mla_depthfirst(args.cpu_info); + auto strat = new GenericDepthfirstStrategy<__fp16>(kern, 3, 3, args); + return new DepthwiseDepthfirstGeneric<__fp16>(strat, args); }, }, { DepthwiseMethod::DEPTHFIRST, "a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst", - constraint(cpu_has_fp16), - not_preferred_if_no_multiplier, + constraint(cpu_has_fp16, has_channel_multiplier), + nullptr, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * { - 
return new DepthwiseDepthfirstGenericWithMultiplier(args); + auto kern = new a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info); + auto strat = new GenericDepthfirstMultiplierStrategy<__fp16>(kern, args); + return new DepthwiseDepthfirstMultiplier<__fp16, __fp16, __fp16, __fp16, true>(strat, args); }, }, -#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #endif // defined(__aarch64__) { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list }; diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp index 5107ddacbc..643cf1d460 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,7 +28,7 @@ #include "depthwise_depthfirst.hpp" #include "depthwise_depthfirst_generic.hpp" #include "depthwise_depthfirst_multiplier.hpp" -#include "depthwise_depthfirst_generic_multiplier.hpp" +#include "depthwise_planar.hpp" #include "depthwise_implementation_constraints.hpp" @@ -78,9 +78,10 @@ namespace return std::numeric_limits::max(); } - unsigned int not_preferred_if_no_multiplier(const DepthwiseArgs &args, const Nothing &) + bool fast_mode_enabled(const DepthwiseArgs &args, const void *) __attribute__ ((unused)); + bool fast_mode_enabled(const DepthwiseArgs &args, const void *) { - return args.channel_multiplier > 1 ? 0 : std::numeric_limits::max(); + return args.fast_mode; } #endif // defined(__aarch64__) } @@ -97,7 +98,7 @@ static const DepthwiseImplementation depthwise_fp32_methods[] = { cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { auto strat = new sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst(strat, args); + return new DepthwiseDepthfirst(strat, args); }, }, { @@ -109,7 +110,7 @@ static const DepthwiseImplementation depthwise_fp32_methods[] = { cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { auto strat = new sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst(strat, args); + return new DepthwiseDepthfirst(strat, args); }, }, { @@ -121,7 +122,7 @@ static const DepthwiseImplementation depthwise_fp32_methods[] = { cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { auto strat = new sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst(strat, args); + return new DepthwiseDepthfirst(strat, args); }, }, { @@ -133,7 +134,7 @@ static const DepthwiseImplementation depthwise_fp32_methods[] = { cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { auto strat = new sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst(strat, args); + return new DepthwiseDepthfirst(strat, args); }, }, { @@ -145,7 +146,7 @@ static const DepthwiseImplementation depthwise_fp32_methods[] = { cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { auto strat = new sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst(strat, args); + return new DepthwiseDepthfirst(strat, args); }, }, { @@ -154,36 +155,42 
@@ static const DepthwiseImplementation depthwise_fp32_methods[] = { constraint(has_no_channel_multiplier, cpu_has_sve), not_preferred, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { - return new DepthwiseDepthfirstGeneric(args); + auto kern = new sve_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info); + auto strat = new GenericDepthfirstStrategy(kern, 3, 3, args); + return new DepthwiseDepthfirstGeneric(strat, args); }, }, { DepthwiseMethod::DEPTHFIRST, "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst", constraint(is_supported, - cpu_has_sve), - not_preferred_if_no_multiplier, + cpu_has_sve, has_channel_multiplier), + nullptr, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { - return new DepthwiseDepthfirstWithMultiplier(args); + auto strat = new sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirstMultiplier(strat, args); }, }, { DepthwiseMethod::DEPTHFIRST, "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst", constraint(is_supported, - cpu_has_sve), - not_preferred_if_no_multiplier, + cpu_has_sve, has_channel_multiplier), + nullptr, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { - return new DepthwiseDepthfirstWithMultiplier(args); + auto strat = new sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirstMultiplier(strat, args); }, }, { DepthwiseMethod::DEPTHFIRST, "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst", - constraint(cpu_has_sve), - not_preferred_if_no_multiplier, + constraint(cpu_has_sve, has_channel_multiplier), + nullptr, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { - return new DepthwiseDepthfirstGenericWithMultiplier(args); + auto kern = new sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info); + auto strat = new GenericDepthfirstMultiplierStrategy(kern, args); + return new DepthwiseDepthfirstMultiplier(strat, args); }, }, #endif // defined(ARM_COMPUTE_ENABLE_SVE) @@ -195,7 +202,7 @@ static const DepthwiseImplementation depthwise_fp32_methods[] = { cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { auto strat = new a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst(strat, args); + return new DepthwiseDepthfirst(strat, args); }, }, { @@ -206,7 +213,7 @@ static const DepthwiseImplementation depthwise_fp32_methods[] = { cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { auto strat = new a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst(strat, args); + return new DepthwiseDepthfirst(strat, args); }, }, { @@ -217,7 +224,7 @@ static const DepthwiseImplementation depthwise_fp32_methods[] = { cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { auto strat = new a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst(strat, args); + return new DepthwiseDepthfirst(strat, args); }, }, { @@ -228,7 +235,7 @@ static const DepthwiseImplementation depthwise_fp32_methods[] = { cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { auto strat = new a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst(strat, args); + return new DepthwiseDepthfirst(strat, args); }, }, { @@ -239,7 
+246,7 @@ static const DepthwiseImplementation depthwise_fp32_methods[] = { cycle_estimate, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { auto strat = new a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info); - return new DepthwiseDepthfirst(strat, args); + return new DepthwiseDepthfirst(strat, args); }, }, { @@ -248,34 +255,42 @@ static const DepthwiseImplementation depthwise_fp32_methods[] = { constraint(has_no_channel_multiplier), not_preferred, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { - return new DepthwiseDepthfirstGeneric(args); + auto kern = new a64_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info); + auto strat = new GenericDepthfirstStrategy(kern, 3, 3, args); + return new DepthwiseDepthfirstGeneric(strat, args); }, }, { DepthwiseMethod::DEPTHFIRST, "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst", - constraint(is_supported), - not_preferred_if_no_multiplier, + constraint(is_supported, + has_channel_multiplier), + nullptr, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { - return new DepthwiseDepthfirstWithMultiplier(args); + auto strat = new a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirstMultiplier(strat, args); }, }, { DepthwiseMethod::DEPTHFIRST, "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst", - constraint(is_supported), - not_preferred_if_no_multiplier, + constraint(is_supported, + has_channel_multiplier), + nullptr, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { - return new DepthwiseDepthfirstWithMultiplier(args); + auto strat = new a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirstMultiplier(strat, args); }, }, { DepthwiseMethod::DEPTHFIRST, "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst", + constraint(has_channel_multiplier), nullptr, - not_preferred_if_no_multiplier, [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon * { - return new DepthwiseDepthfirstGenericWithMultiplier(args); + auto kern = new a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info); + auto strat = new GenericDepthfirstMultiplierStrategy(kern, args); + return new DepthwiseDepthfirstMultiplier(strat, args); }, }, #endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp index ea41529d81..0665fa3a29 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ #pragma once -#include "src/core/NEON/kernels/assembly/depthwise.hpp" +#include "depthwise.hpp" #include #include @@ -136,14 +136,7 @@ UniqueDepthwiseCommon depthwise(const DepthwiseArgs &a { const DepthwiseImplementation *impl = nullptr; const bool success = find_implementation(args, os, impl); - - if(success) - { - auto i = impl->get_instance(args, os); - i->set_name(impl->name); - return UniqueDepthwiseCommon(i); - } - return nullptr; + return UniqueDepthwiseCommon(success ? 
impl->get_instance(args, os) : nullptr);
 }
 
 } // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
index 419872744d..78b6aec388 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_implementation_constraints.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -109,6 +109,12 @@ bool has_no_channel_multiplier(const DepthwiseArgs &args, const void *)
   return args.channel_multiplier == 1;
 }
 
+bool has_channel_multiplier(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
+bool has_channel_multiplier(const DepthwiseArgs &args, const void *)
+{
+  return args.channel_multiplier > 1;
+}
+
 bool qp_has_no_left_shift(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
 bool qp_has_no_left_shift(const DepthwiseArgs &, const void *_qp)
 {
@@ -118,6 +124,21 @@ bool qp_has_no_left_shift(const DepthwiseArgs &, const void *_qp)
          (qp->per_layer_left_shift == 0);
 }
 
+bool qp_zero_a_offset(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+bool qp_zero_a_offset(const DepthwiseArgs &, const void *_qp)
+{
+  const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+  return qp->a_offset == 0;
+}
+
+template <typename T> bool qp_skip_clamp(const DepthwiseArgs &args, const void *_qp) __attribute__ ((unused));
+template <typename T> bool qp_skip_clamp(const DepthwiseArgs &, const void *_qp)
+{
+  const auto qp = static_cast<const arm_gemm::Requantize32 *>(_qp);
+  return (qp->minval == std::numeric_limits<T>::min() &&
+          qp->maxval == std::numeric_limits<T>::max());
+}
+
 } // namespace
 } // namespace depthwise
 } // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
new file mode 100644
index 0000000000..ff5098d551
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#include "depthfirst_driver.hpp" +#include "interleaves/generic.hpp" + +namespace arm_conv { +namespace depthwise { + +template +class IPlanarStrategy +{ + public: + virtual ~IPlanarStrategy() = default; + virtual unsigned int get_output_rows(void) const = 0; + virtual arm_gemm::VLType get_vl_type(void) const = 0; + + virtual size_t get_storage_size(const DepthwiseArgs &) const = 0; + virtual void pack_parameters( + const DepthwiseArgs &args, void *buffer, + const void *biases, const OutputStage &, + const void *weights, size_t ld_weight_col, size_t ld_weight_row + ) const = 0; +}; + + +template +struct PlanarKernelType; + +template +struct PlanarKernelType +{ + using Type = std::function; + + template + static inline void execute( + const Type fn, + const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl, + unsigned int pad_top, unsigned int valid_input_rows, + unsigned int pad_left, unsigned int valid_input_cols, + const TWeight *weights, const TAccum *bias, + TOutput **outptrs, const size_t *outlds, const size_t *outvllds, unsigned int output_cols, + unsigned int start_channel, unsigned int valid_channels, + const Nothing &, const WorkspaceType *ws + ) + { + fn( + inptr, ld_in_row, ld_in_col, ld_in_vl, + pad_top, valid_input_rows, + pad_left, valid_input_cols, + weights, bias, + outptrs, outlds, outvllds, output_cols, + start_channel, valid_channels, + ws->activation_min, ws->activation_max + ); + } +}; + +template +struct PlanarKernelType +{ + using Type = std::function; + + template + static inline void execute( + const Type fn, + const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl, + unsigned int pad_top, unsigned int valid_input_rows, + unsigned int pad_left, unsigned int valid_input_cols, + const TWeight *weights, const int32_t *, + TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols, + unsigned int first_channel, unsigned int valid_channels, + const arm_gemm::Requantize32 &qp, const WorkspaceType * + ) + { + fn( + inptr, ld_in_row, ld_in_col, ld_in_vl, + pad_top, valid_input_rows, + pad_left, valid_input_cols, + weights, + outptrs, outlds, outldvls, output_cols, + first_channel, valid_channels, + qp + ); + } +}; + + +template ::Type, + typename OutputStage=typename DefaultOutputStage::Type> +class PlanarStrategy : public IPlanarStrategy +{ + unsigned int m_kernel_rows, m_kernel_cols; + unsigned int m_stride_rows, m_stride_cols; + unsigned int m_output_rows; + arm_gemm::VLType m_vl_type; + + protected: + virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const + { + // Get the kernel point to pack at the given index; return false to + // indicate that this index (and all greater indices) is out of range. 
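+    // As a hypothetical worked example (not part of the original source): for
+    // a 3x3 kernel packed in this row-major order, index 4 maps to
+    // x = 4 / 3 = 1 and y = 4 % 3 = 1 (the centre tap), while index 9 is out
+    // of range and returns false.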
+ if (m_kernel_rows * m_kernel_cols <= index) + return false; + + y = index % m_kernel_cols; + x = index / m_kernel_cols; + return true; + } + + virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const + { + return interleaves::PackingArguments( + m_kernel_rows, m_kernel_cols, sizeof(TWeight), + false, sizeof(TAccum), // Don't pack the bias + m_vl_type, sizeof(TAccum), 1, // Accumulator depth of 1 TODO + [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool + { return this->get_kernel_packing_point(idx, x, y); } + ); + } + + public: + PlanarStrategy( + unsigned int kernel_rows, unsigned int kernel_cols, + unsigned int stride_rows, unsigned int stride_cols, + unsigned int output_rows, + arm_gemm::VLType vl_type + ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols), + m_stride_rows(stride_rows), m_stride_cols(stride_cols), + m_output_rows(output_rows), m_vl_type(vl_type) + { + } + + unsigned int get_output_rows(void) const override { return m_output_rows; } + arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; } + + size_t get_storage_size(const DepthwiseArgs &args) const override + { + return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args); + } + + void pack_parameters( + const DepthwiseArgs &args, void *buffer, + const void *biases, const OutputStage &, + const void *weights, size_t ld_weight_col, size_t ld_weight_row + ) const override + { + interleaves::pack_parameters_generic( + this->get_kernel_packing_arguments(), args, + buffer, biases, weights, ld_weight_col, ld_weight_row + ); + } + + using KernelType = typename PlanarKernelType::Type; + virtual KernelType get_kernel(void) const = 0; +}; + + +namespace { + +template +struct OutputRowPtrsElement +{ + struct Workspace + { + T **output_row_ptrs; + size_t *output_ld_cols; + size_t *output_ld_vls; // Stride between vectors of channels + T *output_padding_buffer; + }; + + template + static size_t get_element_size(const WorkspaceArgs, OutputStage> &args) + { + // We need one pointer and stride for each row of output, and an additional + // blob of memory into which padded stores can go. 
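+    // Illustrative sizing sketch (an assumption, not from the original source:
+    // 64-bit pointers, four output rows): 4 * (8 + 2*8) = 96 bytes for the row
+    // pointers and their two strides, plus one vector length of memory for the
+    // padded-store buffer.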
+ return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) + + get_vector_length(args.strategy->get_vl_type()); + } + + template + static void *initialise(WorkspaceType *ws, void *buffer, + const WorkspaceArgs, OutputStage> &args) + { + const auto n_rows = args.strategy->get_output_rows(); + ws->output_row_ptrs = reinterpret_cast(buffer); + ws->output_ld_cols = reinterpret_cast(ws->output_row_ptrs + n_rows); + ws->output_ld_vls = ws->output_ld_cols + n_rows; + ws->output_padding_buffer = reinterpret_cast(ws->output_ld_vls + n_rows); + return ws->output_padding_buffer + get_vector_length(args.strategy->get_vl_type()); + } +}; + +} // namespace {anonymous} + + +template ::Type, + typename OutputStage=typename DefaultOutputStage::Type> +class DepthwisePlanar : public DepthwiseCommon +{ + using Parent = DepthwiseCommon; + using StrategyType = IPlanarStrategy; + using WorkspaceManager = Workspace< + OutputRowPtrsElement, + ActivationsElement + >; + using WorkspaceType = typename WorkspaceManager::WorkspaceType; + + std::unique_ptr m_strat; + const TAccum *m_bias; + OutputStage m_os; + + public: + DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {}) + : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os) + { + } + + size_t get_storage_size(void) const override + { + return m_strat->get_storage_size(this->m_args); + } + + void pack_parameters( + void *buffer, const void *biases, + const void *weights, size_t ld_weight_col, size_t ld_weight_row + ) override + { + m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row); + this->m_bias = reinterpret_cast(biases); + depthwise_depthfirst::stash_bias(this->m_os, biases); + } + + size_t get_working_size(unsigned int n_threads, unsigned int) const override + { + return this->get_working_size_per_thread() * n_threads; + } + + protected: + /* Compute the amount of working space required for a single thread. */ + virtual size_t get_working_size_per_thread(void) const + { + return WorkspaceManager::get_sizeof_workspace( + WorkspaceArgs, OutputStage>(m_strat.get(), this->m_args, m_os)); + } + + /* Initialise the working space for a thread. */ + virtual void initialise_working_space(void *buffer) const + { + WorkspaceManager::initialise( + buffer, + WorkspaceArgs, OutputStage>(m_strat.get(), this->m_args, m_os) + ); + } + + /* Execute the kernel for a given chunk of work. */ + virtual void execute_kernel( + const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl, + unsigned int pad_top, unsigned int valid_input_rows, + unsigned int pad_left, unsigned int valid_input_cols, + const TWeight *weights, const TAccum *bias, + TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl, + unsigned int valid_output_rows, unsigned int valid_output_cols, + unsigned int first_channel, unsigned int valid_channels, + WorkspaceType *ws + ) const + { + // Initialise the output pointers + for (auto i = 0u; i < m_strat->get_output_rows(); i++) + { + // Point at the output tensor for all valid rows; otherwise point at the + // padding buffer. + ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer; + ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0; + ws->output_ld_vls[i] = i < valid_output_rows ? 
ld_out_vl : 0; + outptr += ld_out_row; + } + + // Execute the kernel + PlanarKernelType::template execute( + reinterpret_cast *>(m_strat.get())->get_kernel(), + inptr, ld_in_row, ld_in_col, ld_in_vl, + pad_top, valid_input_rows, pad_left, valid_input_cols, + weights, bias, + ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls, + valid_output_cols, first_channel, valid_channels, + this->m_os, ws + ); + } + + void execute_internal( + unsigned int batches, + unsigned int input_height, + unsigned int input_width, + unsigned int n_input_channels, + const PaddingValues &padding, + const void *input, + size_t ld_input_col, + size_t ld_input_row, + size_t ld_input_batch, + const void *parameters, + unsigned int output_height, + unsigned int output_width, + void *output, + size_t ld_output_col, + size_t ld_output_row, + size_t ld_output_batch, + void *working_space, + unsigned int thread_id, + unsigned int n_threads + ) const override + { + // Get and initialise the working space for this thread. + void *thread_working_space = + static_cast(working_space) + thread_id * this->get_working_size_per_thread(); + this->initialise_working_space(thread_working_space); + auto ws = reinterpret_cast(thread_working_space); + + const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier; + const auto vl = get_vector_length(m_strat->get_vl_type()); + + // Get typed pointers + auto input_batch = reinterpret_cast(input); + auto output_batch = reinterpret_cast(output); + auto weights = reinterpret_cast(parameters); + + // Iterate over batches + for (; batches; batches--) + { + // NOTE: Other loop orderings are possible and it would be worth + // investigating them. + + // Within a batch, stripe threads across rows. + for (auto start_output_i = thread_id * m_strat->get_output_rows(); + start_output_i < output_height; + start_output_i += n_threads * m_strat->get_output_rows()) + { + // Determine what (if any padding) is required on the top/bottom of + // this row of the convolution. + const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top; + const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0; + const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i; + const unsigned int valid_input_rows = input_i > input_height ? 0 : input_height - input_i; + const unsigned int valid_output_rows = output_height - start_output_i; + + auto inptr_row = input_batch + input_i*ld_input_row; + auto outptr_row = output_batch + start_output_i * ld_output_row; + + // Execute the kernel + this->execute_kernel( + inptr_row, ld_input_row, ld_input_col, vl, + input_pad_top, valid_input_rows, padding.left, input_width, + weights, this->m_bias, + outptr_row, ld_output_row, ld_output_col, vl, + valid_output_rows, output_width, + 0 /* first channel */, n_output_channels, + ws + ); + } + + // Update the input and output pointers to account for batch + input_batch += ld_input_batch; + output_batch += ld_output_batch; + } + } +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp index 46a31185d7..4ff249a5d5 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,15 +25,14 @@ #include "arm_gemm_local.hpp" #include "depthwise_implementation.hpp" -#include "depthwise_depthfirst_quantized.hpp" -#include "depthwise_depthfirst_generic_quantized.hpp" -#include "depthwise_depthfirst_multiplier_quantized.hpp" -#include "depthwise_depthfirst_generic_multiplier_quantized.hpp" +#include "depthwise_depthfirst.hpp" +#include "depthwise_depthfirst_generic.hpp" +#include "depthwise_depthfirst_multiplier.hpp" #include "depthwise_implementation_constraints.hpp" #if defined(__aarch64__) -#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(ARM_COMPUTE_ENABLE_SVE) #include "kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp" #include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp" #include "kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" @@ -41,7 +40,7 @@ #include "kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" #include "kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp" #include "kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp" -#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(ARM_COMPUTE_ENABLE_SVE) #include "kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp" #include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp" #include "kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" @@ -73,7 +72,7 @@ bool qp_weights_are_symmetric(const DepthwiseArgs &, const void *_qp) static const DepthwiseImplementation depthwise_s8q_methods[] = { #if defined(__aarch64__) -#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(ARM_COMPUTE_ENABLE_SVE) { DepthwiseMethod::DEPTHFIRST, "sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst", @@ -84,7 +83,8 @@ static const DepthwiseImplementation depth cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -96,7 +96,8 @@ static const DepthwiseImplementation depth cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -108,7 +109,8 @@ static const DepthwiseImplementation depth cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -120,7 +122,8 @@ static const DepthwiseImplementation depth cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -132,7 +135,8 @@ static const DepthwiseImplementation depth cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new 
sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -140,10 +144,12 @@ static const DepthwiseImplementation depth "sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst", constraint(is_supported, qp_has_no_left_shift, + has_channel_multiplier, cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + auto strat = new sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirstMultiplier(strat, args, qp); }, }, { @@ -151,13 +157,15 @@ static const DepthwiseImplementation depth "sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst", constraint(is_supported, qp_has_no_left_shift, + has_channel_multiplier, cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + auto strat = new sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirstMultiplier(strat, args, qp); }, }, -#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(ARM_COMPUTE_ENABLE_SVE) { DepthwiseMethod::DEPTHFIRST, "a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst", @@ -168,7 +176,8 @@ static const DepthwiseImplementation depth cpu_has_dot_product), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -180,7 +189,8 @@ static const DepthwiseImplementation depth cpu_has_dot_product), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -191,7 +201,8 @@ static const DepthwiseImplementation depth qp_has_no_left_shift), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -202,7 +213,8 @@ static const DepthwiseImplementation depth qp_has_no_left_shift), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -213,7 +225,8 @@ static const DepthwiseImplementation depth qp_has_no_left_shift), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -222,7 +235,9 @@ static const DepthwiseImplementation depth constraint(has_no_channel_multiplier), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstGenericQuantized(args, qp); + auto kernel = new 
a64_s8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info); + auto strat = new GenericDepthfirstStrategy(kernel, 3, 3, args); + return new DepthwiseDepthfirstGeneric(strat, args, qp); }, }, { @@ -230,10 +245,12 @@ static const DepthwiseImplementation depth "a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst", constraint(is_supported, qp_has_no_left_shift, + has_channel_multiplier, cpu_has_dot_product), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + auto strat = new a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirstMultiplier(strat, args, qp); }, }, { @@ -241,19 +258,23 @@ static const DepthwiseImplementation depth "a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst", constraint(is_supported, qp_has_no_left_shift, + has_channel_multiplier, cpu_has_dot_product), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + auto strat = new a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirstMultiplier(strat, args, qp); }, }, { DepthwiseMethod::DEPTHFIRST, "a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst", - nullptr, + constraint(has_channel_multiplier), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstGenericWithMultiplierQuantized(args, qp); + auto kern = new a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info); + auto strat = new GenericDepthfirstMultiplierStrategy(kern, args); + return new DepthwiseDepthfirstMultiplier(strat, args, qp); }, }, #endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp new file mode 100644 index 0000000000..33f2177efe --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "depthwise_strategies_common.hpp" + +namespace arm_conv { +namespace depthwise { + +unsigned int DepthfirstStrategyUntyped::get_input_rows() const +{ + return this->get_kernel_rows() + (this->get_output_rows() - 1) * this->get_stride_rows(); +} + +unsigned int DepthfirstStrategyUntyped::get_input_cols() const +{ + return this->get_kernel_cols() + (this->get_output_cols() - 1) * this->get_stride_cols(); +} + +unsigned int DepthfirstStrategyUntyped::get_n_input_points() const { return this->get_input_rows() * this->get_input_cols(); } +unsigned int DepthfirstStrategyUntyped::get_n_output_points() const { return this->get_output_rows() * this->get_output_cols(); } +unsigned int DepthfirstStrategyUntyped::get_n_kernel_points() const { return this->get_kernel_rows() * this->get_kernel_cols(); } + +unsigned int DepthfirstStrategyUntyped::get_accumulator_depth_vl() const { return 1; } + +bool DepthfirstStrategyUntyped::get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const +{ + // Get the kernel point to pack at the given index; return false to + // indicate that this index, and all greater indices, is out of range. + if (index < (this->get_kernel_cols() * this->get_kernel_rows())) + { + y = index % this->get_kernel_cols(); + x = index / this->get_kernel_cols(); + return true; + } + return false; +} + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp new file mode 100644 index 0000000000..99b91fb833 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_strategies_common.hpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#pragma once + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" +#include "interleaves/generic.hpp" +#include "depthfirst_driver.hpp" + +namespace arm_conv { +namespace depthwise { + +class DepthfirstStrategyUntyped : public IDepthfirstStrategy +{ + public: + virtual arm_gemm::VLType get_vl_type() const = 0; + + virtual unsigned int get_kernel_rows() const = 0; + virtual unsigned int get_kernel_cols() const = 0; + + virtual unsigned int get_stride_rows() const = 0; + virtual unsigned int get_stride_cols() const = 0; + + virtual unsigned int get_input_rows() const override; + virtual unsigned int get_input_cols() const override; + + virtual unsigned int get_n_input_points() const; + virtual unsigned int get_n_output_points() const; + virtual unsigned int get_n_kernel_points() const; + + // Get the number of VLs used in the accumulator, this defaults to 1. + virtual unsigned int get_accumulator_depth_vl() const; + + // Get the order in which to pack the weights, this defaults to a row-major + // sweep over the weight tensor. + virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const; +}; + +template +class DepthfirstStrategy : public DepthfirstStrategyUntyped +{ + public: + virtual size_t get_storage_size(const DepthwiseArgs &args) const + { + interleaves::PackingArguments packing_args( + this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight), + true, sizeof(TAccum), + this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(), + [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool + { return this->get_kernel_packing_point(idx, x, y); } + ); + return interleaves::get_storage_size_generic(packing_args, args); + } + + virtual void pack_parameters( + const DepthwiseArgs &args, void *buffer, + const void *biases, const OutputStage &, + const void *weights, size_t ld_weight_col, size_t ld_weight_row + ) const + { + interleaves::PackingArguments packing_args( + this->get_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight), + true, sizeof(TAccum), + this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(), + [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool + { return this->get_kernel_packing_point(idx, x, y); } + ); + interleaves::pack_parameters_generic( + packing_args, args, buffer, biases, weights, ld_weight_col, ld_weight_row); + } +}; + +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp index 67713c5bcc..b1489d0b59 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8q.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,23 +25,27 @@ #include "arm_gemm_local.hpp" #include "depthwise_implementation.hpp" -#include "depthwise_depthfirst_quantized.hpp" -#include "depthwise_depthfirst_generic_quantized.hpp" -#include "depthwise_depthfirst_multiplier_quantized.hpp" -#include "depthwise_depthfirst_generic_multiplier_quantized.hpp" +#include "depthwise_depthfirst.hpp" +#include "depthwise_depthfirst_generic.hpp" +#include "depthwise_depthfirst_multiplier.hpp" #include "depthwise_implementation_constraints.hpp" #if defined(__aarch64__) -#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(ARM_COMPUTE_ENABLE_SVE) #include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp" #include "kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" #include "kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" #include "kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" #include "kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp" #include "kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp" -#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(ARM_COMPUTE_ENABLE_SVE) #include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp" + +#include "kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" +#include "kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" + #include "kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" #include "kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" #include "kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" @@ -49,6 +53,7 @@ #include "kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp" #include "kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp" #include "kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp" + #endif // defined(__aarch64__) #include @@ -60,7 +65,7 @@ namespace depthwise { static const DepthwiseImplementation depthwise_u8q_methods[] = { #if defined(__aarch64__) -#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(ARM_COMPUTE_ENABLE_SVE) { DepthwiseMethod::DEPTHFIRST, "sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst", @@ -70,7 +75,8 @@ static const DepthwiseImplementation de cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -82,7 +88,8 @@ static const DepthwiseImplementation de cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -94,7 +101,8 @@ static const DepthwiseImplementation de cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -106,7 +114,8 @@ static const DepthwiseImplementation de cpu_has_sve2), nullptr, [] (const 
DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -114,10 +123,12 @@ static const DepthwiseImplementation de "sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst", constraint(is_supported, qp_has_no_left_shift, + has_channel_multiplier, cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + auto strat = new sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirstMultiplier(strat, args, qp); }, }, { @@ -125,13 +136,15 @@ static const DepthwiseImplementation de "sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst", constraint(is_supported, qp_has_no_left_shift, + has_channel_multiplier, cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + auto strat = new sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirstMultiplier(strat, args, qp); }, }, -#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(ARM_COMPUTE_ENABLE_SVE) { DepthwiseMethod::DEPTHFIRST, "a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst", @@ -141,9 +154,51 @@ static const DepthwiseImplementation de qp_has_no_left_shift), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); + }, + }, + + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_zero_a_offset, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + auto strat = new a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_zero_a_offset, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + auto strat = new a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); + }, + }, + { + DepthwiseMethod::DEPTHFIRST, + "a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst", + constraint(is_supported, + has_no_channel_multiplier, + qp_zero_a_offset, + qp_has_no_left_shift), + nullptr, + [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { + auto strat = new a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); + }, + }, + { DepthwiseMethod::DEPTHFIRST, "a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst", @@ -152,7 +207,8 @@ static const DepthwiseImplementation de qp_has_no_left_shift), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new 
a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -163,7 +219,8 @@ static const DepthwiseImplementation de qp_has_no_left_shift), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -174,7 +231,8 @@ static const DepthwiseImplementation de qp_has_no_left_shift), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -183,7 +241,9 @@ static const DepthwiseImplementation de constraint(has_no_channel_multiplier), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstGenericQuantized(args, qp); + auto kernel = new a64_u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info); + auto strat = new GenericDepthfirstStrategy(kernel, 3, 3, args); + return new DepthwiseDepthfirstGeneric(strat, args, qp); }, }, { @@ -191,10 +251,12 @@ static const DepthwiseImplementation de "a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst", constraint(is_supported, cpu_has_dot_product, + has_channel_multiplier, qp_has_no_left_shift), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + auto strat = new a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirstMultiplier(strat, args, qp); }, }, { @@ -202,21 +264,26 @@ static const DepthwiseImplementation de "a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst", constraint(is_supported, cpu_has_dot_product, + has_channel_multiplier, qp_has_no_left_shift), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstWithMultiplierQuantized(args, qp); + auto strat = new a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(args.cpu_info); + return new DepthwiseDepthfirstMultiplier(strat, args, qp); }, }, { DepthwiseMethod::DEPTHFIRST, "a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst", - nullptr, + constraint(has_channel_multiplier), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstGenericWithMultiplierQuantized(args, qp); + auto kern = new a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info); + auto strat = new GenericDepthfirstMultiplierStrategy(kern, args); + return new DepthwiseDepthfirstMultiplier(strat, args, qp); }, }, + #endif // defined(__aarch64__) { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list }; diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp index af4426b69f..9b989012b8 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_u8s8u8q.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -25,19 +25,18 @@ #include "arm_gemm_local.hpp" #include "depthwise_implementation.hpp" -#include "depthwise_depthfirst_quantized.hpp" -#include "depthwise_depthfirst_generic_quantized.hpp" -#include "depthwise_depthfirst_multiplier_quantized.hpp" -#include "depthwise_depthfirst_generic_multiplier_quantized.hpp" +#include "depthwise_depthfirst.hpp" +#include "depthwise_depthfirst_generic.hpp" +#include "depthwise_depthfirst_multiplier.hpp" #include "depthwise_implementation_constraints.hpp" #if defined(__aarch64__) -#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(ARM_COMPUTE_ENABLE_SVE) #include "kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" #include "kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" #include "kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" -#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(ARM_COMPUTE_ENABLE_SVE) #include "kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp" #include "kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp" #include "kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp" @@ -54,7 +53,7 @@ namespace depthwise { static const DepthwiseImplementation depthwise_u8q_methods[] = { #if defined(__aarch64__) -#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(ARM_COMPUTE_ENABLE_SVE) { DepthwiseMethod::DEPTHFIRST, "sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst", @@ -64,7 +63,8 @@ static const DepthwiseImplementation dep cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -76,7 +76,8 @@ static const DepthwiseImplementation dep cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -88,10 +89,11 @@ static const DepthwiseImplementation dep cpu_has_sve2), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, -#endif // defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(ARM_COMPUTE_ENABLE_SVE) { DepthwiseMethod::DEPTHFIRST, "a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst", @@ -100,7 +102,8 @@ static const DepthwiseImplementation dep qp_has_no_left_shift), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -111,7 +114,8 @@ static const DepthwiseImplementation dep qp_has_no_left_shift), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { 
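(A note on the constraint() helper used throughout these method tables: it
composes the simple predicates declared in
depthwise_implementation_constraints.hpp, such as is_supported,
has_channel_multiplier and qp_has_no_left_shift, so that an entry whose
constraint rejects the arguments is skipped during method selection. A minimal
sketch of the idea, assuming the helper simply ANDs its predicate arguments;
the real implementation lives elsewhere in the library and may differ:

    template <typename... Fs>
    auto constraint(Fs... fns)
    {
      // Accept a method only if every given predicate accepts the arguments.
      return [=](const DepthwiseArgs &args, const void *qp) -> bool {
        return (... && fns(args, qp));  // C++17 fold expression
      };
    }

    // Hypothetical usage, mirroring the tables above:
    //   auto pred = constraint(is_supported, has_channel_multiplier, qp_has_no_left_shift);
    //   bool ok = pred(args, &qp);
)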
@@ -122,7 +126,8 @@ static const DepthwiseImplementation dep qp_has_no_left_shift), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstQuantized(args, qp); + auto strat = new a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info); + return new DepthwiseDepthfirst(strat, args, qp); }, }, { @@ -131,16 +136,20 @@ static const DepthwiseImplementation dep constraint(has_no_channel_multiplier), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstGenericQuantized(args, qp); + auto kernel = new a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(args.cpu_info); + auto strat = new GenericDepthfirstStrategy(kernel, 3, 3, args); + return new DepthwiseDepthfirstGeneric(strat, args, qp); }, }, { DepthwiseMethod::DEPTHFIRST, "a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst", - nullptr, + constraint(has_channel_multiplier), nullptr, [] (const DepthwiseArgs &args, const Requantize32 &qp) -> DepthwiseCommon * { - return new DepthwiseDepthfirstGenericWithMultiplierQuantized(args, qp); + auto kern = new a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info); + auto strat = new GenericDepthfirstMultiplierStrategy(kern, args); + return new DepthwiseDepthfirstMultiplier(strat, args, qp); }, }, #endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp new file mode 100644 index 0000000000..056f08d037 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "generic.hpp"
+
+#include <algorithm>
+#include <cstring>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+
+PackingArguments::PackingArguments(
+  unsigned int kernel_rows, unsigned int kernel_cols, size_t weight_element_size,
+  bool include_bias, size_t bias_element_size,
+  arm_gemm::VLType vl_type, size_t accumulator_element_size, unsigned int accumulator_depth_vl,
+  std::function<bool(const unsigned int, unsigned int &, unsigned int &)> get_weight_pos
+) : kernel_rows(kernel_rows), kernel_cols(kernel_cols), weight_element_size(weight_element_size),
+    include_bias(include_bias), bias_element_size(bias_element_size),
+    vl_type(vl_type), accumulator_element_size(accumulator_element_size), accumulator_depth_vl(accumulator_depth_vl),
+    get_weight_pos(get_weight_pos)
+{
+}
+
+size_t get_storage_size_generic(const PackingArguments &packing_args, const DepthwiseArgs &args)
+{
+  // If the channel multiplier is greater than one, then we treat this as a
+  // repeated packing of `channel_multiplier`-sized problems.
+  if (args.channel_multiplier > 1)
+  {
+    DepthwiseArgs args_per_input_channel(args);
+    args_per_input_channel.input_channels = args.channel_multiplier;
+    args_per_input_channel.channel_multiplier = 1;
+
+    return args.input_channels * get_storage_size_generic(packing_args, args_per_input_channel);
+  }
+
+  const unsigned int vl =
+    packing_args.accumulator_depth_vl *
+    arm_gemm::utils::get_vector_length<uint8_t>(packing_args.vl_type) / packing_args.accumulator_element_size;
+  const unsigned int n_packs = arm_gemm::iceildiv(args.input_channels, vl);
+  const auto pack_size = (packing_args.include_bias ? packing_args.bias_element_size : 0) +
+                         packing_args.kernel_points() * packing_args.weight_element_size;
+  return n_packs * pack_size * vl;
+}
+
+void pack_parameters_generic(
+  const PackingArguments &packing_args,
+  const DepthwiseArgs &args,
+  void *buffer_raw,
+  const void *biases_raw,
+  const void *weights_raw,
+  size_t ld_weight_col,
+  size_t ld_weight_row
+)
+{
+  // Cast the pointers to byte sizes
+  auto *buffer = static_cast<uint8_t *>(buffer_raw);
+  auto *biases = static_cast<const uint8_t *>(biases_raw);
+  auto *weights = static_cast<const uint8_t *>(weights_raw);
+
+  // If the channel multiplier is greater than one, then we treat this as a
+  // repeated packing of `channel_multiplier`-sized problems.
+  if (args.channel_multiplier > 1)
+  {
+    // Get a modified copy of the depthwise arguments
+    DepthwiseArgs args_per_input_channel(args);
+    args_per_input_channel.input_channels = args.channel_multiplier;
+    args_per_input_channel.channel_multiplier = 1;
+
+    // Resolve the strides here
+    ld_weight_col = ld_weight_col ? ld_weight_col : args.input_channels * args.channel_multiplier;
+    ld_weight_row = ld_weight_row ? ld_weight_row : ld_weight_col * packing_args.kernel_cols;
+
+    auto per_input_channel_size = get_storage_size_generic(packing_args, args_per_input_channel);
+
+    for (unsigned int c = 0; c < args.input_channels; c++)
+    {
+      pack_parameters_generic(
+        packing_args, args_per_input_channel, buffer, biases, weights, ld_weight_col, ld_weight_row);
+
+      // Update the pointers
+      buffer += per_input_channel_size;
+      biases += (biases == nullptr) ? 0 : packing_args.bias_element_size * args.channel_multiplier;
+      weights += packing_args.weight_element_size * args.channel_multiplier;
+    }
+    return;
+  }
+
+  // Finalise the weight strides
+  ld_weight_col = (ld_weight_col == 0) ? args.input_channels : ld_weight_col;
+  ld_weight_row = (ld_weight_row == 0) ?
packing_args.kernel_cols * ld_weight_col : ld_weight_row; + + const unsigned int vl = + packing_args.accumulator_depth_vl * + arm_gemm::utils::get_vector_length(packing_args.vl_type) / packing_args.accumulator_element_size; + + for (unsigned int n = 0; n < args.input_channels; n += vl) + { + const unsigned int todo = std::min(vl, args.input_channels - n); + + if (packing_args.include_bias) + { + if (biases != nullptr) + { + memcpy(buffer, biases, todo * packing_args.bias_element_size); + biases += todo * packing_args.bias_element_size; + } + else + { + memset(buffer, 0, vl * packing_args.bias_element_size); + } + + buffer += vl * packing_args.bias_element_size; + } + + // Copy each of the weights in turn + unsigned int kx, ky; + for (int kindex = 0; packing_args.get_weight_pos(kindex, kx, ky); kindex++) + { + const auto src_ptr = weights + (kx*ld_weight_row + ky*ld_weight_col + n) * packing_args.weight_element_size; + memcpy(buffer, src_ptr, todo * packing_args.weight_element_size); + buffer += vl * packing_args.weight_element_size; + } + } +} + +} // namespace interleaves +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp new file mode 100644 index 0000000000..5b5ae17806 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic.hpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
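The packing loop above is driven by the get_weight_pos callback, which maps a linear kernel-point index to a (row, column) position and returns false once every point has been visited. A minimal sketch of such a callback for a dense 3x3 kernel, assuming the std::function signature reconstructed above:

    // Visit the nine points of a dense 3x3 kernel in row-major order.
    const auto dense_3x3 = [](const unsigned int index, unsigned int &kx, unsigned int &ky) -> bool
    {
      if (index >= 9) return false;
      kx = index / 3;  // kernel row: multiplies ld_weight_row in the copy above
      ky = index % 3;  // kernel column: multiplies ld_weight_col
      return true;
    };

Irregular layouts (for example, skipping some kernel positions) only need a different callback; the packing routine itself is unchanged.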
+ */
+
+#pragma once
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+#include "depthwise.hpp"
+
+#include <functional>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+
+struct PackingArguments
+{
+  const unsigned int kernel_rows;
+  const unsigned int kernel_cols;
+  const size_t weight_element_size;
+  const bool include_bias;
+  const size_t bias_element_size;
+  arm_gemm::VLType vl_type;
+  const size_t accumulator_element_size;
+  const unsigned int accumulator_depth_vl;
+  std::function<bool(const unsigned int, unsigned int &, unsigned int &)> get_weight_pos;
+
+  unsigned int kernel_points(void) const { return kernel_cols * kernel_rows; }
+
+  PackingArguments(
+    unsigned int kernel_rows,
+    unsigned int kernel_cols,
+    size_t weight_element_size,
+    bool include_bias,
+    size_t bias_element_size,
+    arm_gemm::VLType vl_type,
+    size_t accumulator_element_size,
+    unsigned int accumulator_depth_vl,
+    std::function<bool(const unsigned int, unsigned int &, unsigned int &)> get_weight_pos
+  );
+};
+
+size_t get_storage_size_generic(
+  const PackingArguments &packing_args,
+  const DepthwiseArgs &args
+);
+
+void pack_parameters_generic(
+  const PackingArguments &packing_args,
+  const DepthwiseArgs &args,
+  void *buffer_raw,
+  const void *biases_raw,
+  const void *weights_raw,
+  size_t ld_weight_col,
+  size_t ld_weight_row
+);
+
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp
new file mode 100644
index 0000000000..a6389054d1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.cpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "generic_quantized_dot_product.hpp"
+#include <algorithm>
+#include <cstdint>
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+namespace quantized {
+
+size_t get_storage_size(
+  const DepthwiseArgs &args,
+  const arm_gemm::VLType vl_type,
+  const unsigned int accumulator_depth_vl
+)
+{
+  // We produce VL channels at a time; for each of these blocks of
+  // channels we store a vector of biases, weights (complicated) and
+  // requantize parameters.
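+  //
+  // A worked instance of the computation below (values assumed for
+  // illustration, not taken from the patch): 3x3 kernel, 64 input channels,
+  // channel multiplier 1, 128-bit Neon vectors, accumulator_depth_vl = 1:
+  //   iter_length           = 1 * 4           = 4   (four int32 lanes)
+  //   n_iters               = 64 * ceil(1/4)  = 64
+  //   n_dots_per_kernel_row = ceil(3/4)       = 1
+  //   bytes per channel     = 4 + 4*1*3 + 8   = 24  (bias + weights + requant)
+  //   total                 = 64 * 4 * 24     = 6144 bytes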
+  const unsigned int iter_length = accumulator_depth_vl * arm_gemm::utils::get_vector_length<int32_t>(vl_type);
+  const unsigned int n_iters = args.input_channels * arm_gemm::iceildiv(args.channel_multiplier, iter_length);
+
+  // Compute the cost of storing the weights
+  const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(args.kernel_cols, 4u);
+
+  return n_iters * iter_length * (
+    sizeof(int32_t) +  // Bias
+    4 * n_dots_per_kernel_row * args.kernel_rows * sizeof(int8_t) +  // Weights
+    2 * sizeof(int32_t)  // Requantisation parameters
+  );
+}
+
+template <typename T>
+void pack_parameters(
+  void *_buffer, const int32_t *biases,
+  const T *weights, size_t ld_weight_col, size_t ld_weight_row,
+  const DepthwiseArgs &args,
+  const arm_gemm::Requantize32 &qp,
+  const arm_gemm::VLType vl_type,
+  const unsigned int accumulator_depth_vl
+)
+{
+  auto buffer = static_cast<uint8_t *>(_buffer);
+  auto requant_muls = qp.per_channel_muls;
+  auto requant_shifts = qp.per_channel_right_shifts;
+
+  const unsigned int iter_length = accumulator_depth_vl * arm_gemm::utils::get_vector_length<int32_t>(vl_type);
+  const unsigned int n_iters_per_input_channel = arm_gemm::iceildiv(args.channel_multiplier, iter_length);
+  const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(args.kernel_cols, 4u);
+
+  const size_t iter_stride = iter_length * (
+    sizeof(int32_t) +  // Bias
+    4 * n_dots_per_kernel_row * args.kernel_rows * sizeof(T) +  // Weights
+    2 * sizeof(int32_t)  // Requantisation parameters
+  );
+
+  ld_weight_col = (ld_weight_col == 0) ? args.input_channels * args.channel_multiplier : ld_weight_col;
+  ld_weight_row = (ld_weight_row == 0) ? args.kernel_cols * ld_weight_col : ld_weight_row;
+
+  for (unsigned int input_channel = 0; input_channel < args.input_channels; input_channel++)
+  {
+    auto buffer_input_channel = buffer + input_channel * n_iters_per_input_channel * iter_stride;
+    auto weights_input_channel = weights + input_channel * args.channel_multiplier;
+
+    for (unsigned int iter = 0; iter < n_iters_per_input_channel; iter++)
+    {
+      // Get a pointer to the start of this portion of the buffer; consequently
+      // derive pointers to the bias, weight and requantisation portions of
+      // this frame.
+      auto buffer_base = buffer_input_channel + iter_stride * iter;
+      auto buffer_biases = reinterpret_cast<int32_t *>(buffer_base);
+      auto buffer_weights = buffer_base + sizeof(int32_t) * iter_length;
+      auto buffer_requant_mul = reinterpret_cast<int32_t *>(
+        buffer_weights + args.kernel_rows * n_dots_per_kernel_row * 4 * iter_length);
+      auto buffer_requant_shift = buffer_requant_mul + iter_length;
+      auto weights_base = weights_input_channel + iter * iter_length;
+
+      // Hence work through the data for this iteration, on a
+      // channel-by-channel basis.
+      const auto this_iter_length = std::min(
+        iter_length, args.channel_multiplier - iter * iter_length
+      );
+      for (unsigned int i = 0; i < this_iter_length; i++)
+      {
+        auto weights_channel = weights_base + i;
+
+        // Read the bias value; we modify this as we read the weights.
+        auto bias_value = biases == nullptr ? 0 : *(biases++);
+        int32_t elements_sum = 0;
+
+        // Read through the kernel; for each row, marshal together as many dot
+        // product terms as are required.
+ for (unsigned int ki = 0; ki < args.kernel_rows; ki++) + { + auto buffer_row = buffer_weights + i*4 + ki * 4 * n_dots_per_kernel_row * iter_length; + auto weights_row = weights_channel + ki * ld_weight_row; + + unsigned int kj = 0; + for (; kj < args.kernel_cols; kj++) + { + // Determine which element to which we're writing + const auto dot = kj / 4; + const auto elem = kj % 4; + + // Copy the value; include in the sum + const auto val = weights_row[kj * ld_weight_col]; + buffer_row[dot * 4 * iter_length + elem] = val; + elements_sum += val; + } + for (; kj < 4 * n_dots_per_kernel_row; kj++) + { + const auto dot = kj / 4; + const auto elem = kj % 4; + buffer_row[dot * 4 * iter_length + elem] = 0; + } + + buffer_row += 4 * n_dots_per_kernel_row * iter_length; + } + + // Write back the bias and offset values + *(buffer_biases++) = + bias_value - qp.a_offset * elements_sum + + args.kernel_rows * args.kernel_cols * qp.a_offset * qp.b_offset; + + // Write out the requantisation parameters + *(buffer_requant_mul++) = qp.per_channel_requant ? *(requant_muls++) : qp.per_layer_mul; + *(buffer_requant_shift++) = qp.per_channel_requant ? *(requant_shifts++) : qp.per_layer_right_shift; + } + } + } +} + +template void pack_parameters(void *, const int32_t *, const int8_t *, size_t, size_t, const DepthwiseArgs &, const arm_gemm::Requantize32 &, arm_gemm::VLType, unsigned int); +template void pack_parameters(void *, const int32_t *, const uint8_t *, size_t, size_t, const DepthwiseArgs &, const arm_gemm::Requantize32 &, arm_gemm::VLType, unsigned int); + +} // namespace quantized +} // namespace interleaves +} // namespace depthwise +} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp new file mode 100644 index 0000000000..779d67d3f4 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/generic_quantized_dot_product.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
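The bias fix-up written back in the loop above folds both quantisation offsets into the stored bias, so the kernel itself never has to correct for them. In isolation, the adjustment is:

    // bias' = bias - a_offset * sum(weights) + kernel_points * a_offset * b_offset
    // Worked instance (assumed values): bias = 1000, a_offset = 128,
    // b_offset = 0, nine weights summing to -42:
    //   bias' = 1000 - 128 * (-42) + 9 * 128 * 0 = 6376
    int32_t adjust_bias(int32_t bias, int32_t a_offset, int32_t b_offset,
                        int32_t weight_sum, int32_t kernel_points)
    {
      return bias - a_offset * weight_sum + kernel_points * a_offset * b_offset;
    }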
+ */
+
+#pragma once
+
+#include "generic.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+namespace interleaves {
+namespace quantized {
+
+size_t get_storage_size(
+  const DepthwiseArgs &args,
+  arm_gemm::VLType vl_type,
+  unsigned int accumulator_depth_vl=1
+);
+
+template <typename T>
+void pack_parameters(
+  void *buffer, const int32_t *biases,
+  const T *weights, size_t ld_weight_col, size_t ld_weight_row,
+  const DepthwiseArgs &args,
+  const arm_gemm::Requantize32 &qp,
+  arm_gemm::VLType vl_type,
+  unsigned int accumulator_depth_vl
+);
+
+} // namespace quantized
+} // namespace interleaves
+} // namespace depthwise
+} // namespace arm_conv
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
index cb49a243af..76f38eb335 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,90 +29,30 @@ namespace depthwise {
 #if defined(ARM_COMPUTE_ENABLE_SVE)
-class interleave_sve_u8q_3x3_dot
+struct interleave_sve_u8q_3x3_dot
 {
-  public:
-  static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
-  static size_t get_packed_size(const DepthwiseArgs &);
+  static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+  static size_t get_packed_size(const DepthwiseArgs &);
 };
-class interleave_sve_s8q_3x3_dot
+struct interleave_sve_s8q_3x3_dot
 {
-  public:
-  static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
-  static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_sve_u8q_3x3_mla
-{
-  public:
-  static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
-  static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_sve_s8q_3x3_mla
-{
-  public:
-  static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
-  static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_sve_u8q_5x5_mla
-{
-  public:
-  static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
-  static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_sve_s8q_5x5_mla
-{
-  public:
-  static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t);
-  static size_t get_packed_size(const DepthwiseArgs &);
+  static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
+  static size_t get_packed_size(const DepthwiseArgs &);
 };
 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
-class interleave_a64_u8q_3x3_dot
-{
-  public:
-  static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
-  static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_a64_s8q_3x3_dot
-{
-  public:
-  static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
-  static size_t get_packed_size(const DepthwiseArgs &);
-};
-
-class interleave_a64_u8q_3x3_mla
-{
-  public:
-  static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t);
-  static size_t
get_packed_size(const DepthwiseArgs &); -}; - -class interleave_a64_s8q_3x3_mla -{ - public: - static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t); - static size_t get_packed_size(const DepthwiseArgs &); -}; - -class interleave_a64_u8q_5x5_mla +struct interleave_a64_u8q_3x3_dot { - public: - static void pack_parameters(unsigned int, void *, const uint8_t *, size_t, size_t); - static size_t get_packed_size(const DepthwiseArgs &); + static void pack_parameters(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); }; -class interleave_a64_s8q_5x5_mla +struct interleave_a64_s8q_3x3_dot { - public: - static void pack_parameters(unsigned int, void *, const int8_t *, size_t, size_t); - static size_t get_packed_size(const DepthwiseArgs &); + static void pack_parameters(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t); + static size_t get_packed_size(const DepthwiseArgs &); }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp index be50d1c535..d2db12535f 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,19 +36,16 @@ namespace depthwise { void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); void a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); -class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy +class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16> { private: - typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); - indirect_kern_type m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl; - - typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); - direct_kern_type m_direct_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl; + using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>; + Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl; + Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl; public: - typedef __fp16 return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + using return_type = __fp16; + constexpr static auto vl_type = arm_gemm::VLType::None; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; @@ -59,60 +56,13 @@ class a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public 
IDepthwiseDepthfirs constexpr static unsigned int output_rows = 2; constexpr static unsigned int output_cols = 2; - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; - - a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) + : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 1) {} arm_gemm::VLType get_vl_type(void) const override { return vl_type; } - unsigned int get_kernel_rows(void) const override { return kernel_rows; } - unsigned int get_kernel_cols(void) const override { return kernel_cols; } - - unsigned int get_stride_rows(void) const override { return stride_rows; } - unsigned int get_stride_cols(void) const override { return stride_cols; } - - unsigned int get_output_rows(void) const override { return output_rows; } - unsigned int get_output_cols(void) const override { return output_cols; } - - unsigned int get_input_rows(void) const override { return input_rows; } - unsigned int get_input_cols(void) const override { return input_cols; } - - void indirect_kernel( - const void *const *const input_ptrs, - void *const *const outptrs, - const void *params, - unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const override - { - m_indirect_kernel( - reinterpret_cast(input_ptrs), - reinterpret_cast<__fp16 *const *>(outptrs), - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } - - void direct_kernel( - const unsigned int n_tile_rows, const unsigned int n_tile_cols, - const void *inptr, int64_t ld_input_row, int64_t ld_input_col, - void *outptr, int64_t ld_output_row, int64_t ld_output_col, - const void *params, unsigned int n_channels, - const void *activation_min, const void *activation_max - ) const override - { - m_direct_kernel( - n_tile_rows, n_tile_cols, - static_cast(inptr), ld_input_row, ld_input_col, - static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col, - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } + Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; } + Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp index 39fa7f62fd..75368dfcf9 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
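Every kernel header in this patch repeats the same simplification: the ten get_*() overrides and the stored input_rows/input_cols constants disappear because the new DepthwiseDepthfirstStrategy base can derive them from the (output size, kernel size, stride) triple passed to its constructor. A simplified, hypothetical illustration of that derivation (not the real base class):

    // Input tile size follows from output size, kernel size and stride,
    // so it no longer needs to be restated in every specialisation.
    struct GeometrySketch
    {
      GeometrySketch(unsigned int output, unsigned int kernel, unsigned int stride)
      : m_output(output), m_kernel(kernel), m_stride(stride) {}

      unsigned int input_size() const { return m_kernel + (m_output - 1) * m_stride; }

      unsigned int m_output, m_kernel, m_stride;
    };

Checking against the constants deleted from these headers: (2, 3, 1) gives 4, (3, 3, 1) gives 5, (4, 3, 1) gives 6, (2, 3, 2) gives 5 and (2, 5, 1) gives 6, matching the removed input_rows/input_cols values in each case.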
* * SPDX-License-Identifier: MIT * @@ -36,19 +36,16 @@ namespace depthwise { void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); -class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirstStrategy +class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16> { private: - typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); - indirect_kern_type m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl; - - typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); - direct_kern_type m_direct_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl; + using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>; + Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl; + Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl; public: - typedef __fp16 return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + using return_type = __fp16; + constexpr static auto vl_type = arm_gemm::VLType::None; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; @@ -59,60 +56,13 @@ class a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirs constexpr static unsigned int output_rows = 3; constexpr static unsigned int output_cols = 3; - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 5; - - a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {} + a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) + : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(3, 3, 1) {} arm_gemm::VLType get_vl_type(void) const override { return vl_type; } - unsigned int get_kernel_rows(void) const override { return kernel_rows; } - unsigned int get_kernel_cols(void) const override { return kernel_cols; } - - unsigned int get_stride_rows(void) const override { return stride_rows; } - unsigned int get_stride_cols(void) const override { return stride_cols; } - - unsigned int get_output_rows(void) const override { return output_rows; } - unsigned int get_output_cols(void) const override { return output_cols; } - - unsigned int get_input_rows(void) const override { return input_rows; } - unsigned int get_input_cols(void) const override { return input_cols; } - - void indirect_kernel( - const void *const *const input_ptrs, - void *const *const outptrs, - const void *params, - unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const override - { - m_indirect_kernel( - reinterpret_cast(input_ptrs), - reinterpret_cast<__fp16 *const *>(outptrs), - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } - - void direct_kernel( - const unsigned int n_tile_rows, const unsigned int n_tile_cols, - const void *inptr, int64_t ld_input_row, 
int64_t ld_input_col, - void *outptr, int64_t ld_output_row, int64_t ld_output_col, - const void *params, unsigned int n_channels, - const void *activation_min, const void *activation_max - ) const override - { - m_direct_kernel( - n_tile_rows, n_tile_cols, - static_cast(inptr), ld_input_row, ld_input_col, - static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col, - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } + Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; } + Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp index e0abca91a2..faf6c91181 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst/generic_indirect.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -1283,7 +1283,8 @@ void a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl( : : [n_channels] "r" ((unsigned long) n_channels), [offsetof_Args_inptrs] "I" (offsetof(Args, inptrs)), [offsetof_args_max] "I" (offsetof(Args, max)), [offsetof_args_min] "I" (offsetof(Args, min)), [offsetof_args_outptrs] "I" (offsetof(Args, outptrs)), [offsetof_args_params] "I" (offsetof(Args, params)), [params_struct] "r" (¶ms_struct) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); } } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp index 1e0d922be4..4f0de6b61c 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -36,19 +36,16 @@ namespace depthwise { void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); void a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); -class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirstStrategy +class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16> { private: - typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); - indirect_kern_type m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl; - - typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); - direct_kern_type m_direct_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl; + using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>; + Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl; + Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl; public: - typedef __fp16 return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + using return_type = __fp16; + constexpr static auto vl_type = arm_gemm::VLType::None; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; @@ -59,60 +56,13 @@ class a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirs constexpr static unsigned int output_rows = 4; constexpr static unsigned int output_cols = 4; - constexpr static unsigned int input_rows = 6; - constexpr static unsigned int input_cols = 6; - - a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {} + a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) + : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(4, 3, 1) {} arm_gemm::VLType get_vl_type(void) const override { return vl_type; } - unsigned int get_kernel_rows(void) const override { return kernel_rows; } - unsigned int get_kernel_cols(void) const override { return kernel_cols; } - - unsigned int get_stride_rows(void) const override { return stride_rows; } - unsigned int get_stride_cols(void) const override { return stride_cols; } - - unsigned int get_output_rows(void) const override { return output_rows; } - unsigned int get_output_cols(void) const override { return output_cols; } - - unsigned int get_input_rows(void) const override { return input_rows; } - unsigned int get_input_cols(void) const override { return input_cols; } - - void indirect_kernel( - const void *const *const input_ptrs, - void *const *const outptrs, - const void *params, - unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const override - { - m_indirect_kernel( - reinterpret_cast(input_ptrs), - reinterpret_cast<__fp16 *const *>(outptrs), - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } - - void direct_kernel( - const unsigned int n_tile_rows, const unsigned int n_tile_cols, - const void *inptr, int64_t ld_input_row, 
int64_t ld_input_col, - void *outptr, int64_t ld_output_row, int64_t ld_output_col, - const void *params, unsigned int n_channels, - const void *activation_min, const void *activation_max - ) const override - { - m_direct_kernel( - n_tile_rows, n_tile_cols, - static_cast(inptr), ld_input_row, ld_input_col, - static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col, - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } + Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; } + Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp index d89ae0cd6d..d52f48064f 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,19 +36,16 @@ namespace depthwise { void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); void a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); -class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy +class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16> { private: - typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); - indirect_kern_type m_indirect_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl; - - typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); - direct_kern_type m_direct_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl; + using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>; + Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl; + Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl; public: - typedef __fp16 return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + using return_type = __fp16; + constexpr static auto vl_type = arm_gemm::VLType::None; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; @@ -59,60 +56,13 @@ class a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirs constexpr static unsigned int output_rows = 2; constexpr static unsigned int output_cols = 2; - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 5; - - a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} + a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) + : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 2) {} 
arm_gemm::VLType get_vl_type(void) const override { return vl_type; } - unsigned int get_kernel_rows(void) const override { return kernel_rows; } - unsigned int get_kernel_cols(void) const override { return kernel_cols; } - - unsigned int get_stride_rows(void) const override { return stride_rows; } - unsigned int get_stride_cols(void) const override { return stride_cols; } - - unsigned int get_output_rows(void) const override { return output_rows; } - unsigned int get_output_cols(void) const override { return output_cols; } - - unsigned int get_input_rows(void) const override { return input_rows; } - unsigned int get_input_cols(void) const override { return input_cols; } - - void indirect_kernel( - const void *const *const input_ptrs, - void *const *const outptrs, - const void *params, - unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const override - { - m_indirect_kernel( - reinterpret_cast(input_ptrs), - reinterpret_cast<__fp16 *const *>(outptrs), - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } - - void direct_kernel( - const unsigned int n_tile_rows, const unsigned int n_tile_cols, - const void *inptr, int64_t ld_input_row, int64_t ld_input_col, - void *outptr, int64_t ld_output_row, int64_t ld_output_col, - const void *params, unsigned int n_channels, - const void *activation_min, const void *activation_max - ) const override - { - m_direct_kernel( - n_tile_rows, n_tile_cols, - static_cast(inptr), ld_input_row, ld_input_col, - static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col, - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } + Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; } + Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp index 6b5f91fa64..81a608e349 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -36,19 +36,16 @@ namespace depthwise { void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); void a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); -class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy +class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16> { private: - typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); - indirect_kern_type m_indirect_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl; - - typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); - direct_kern_type m_direct_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl; + using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>; + Parent::IndirectKernelType m_indirect_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl; + Parent::DirectKernelType m_direct_kernel = a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl; public: - typedef __fp16 return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + using return_type = __fp16; + constexpr static auto vl_type = arm_gemm::VLType::None; constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; @@ -59,60 +56,13 @@ class a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs constexpr static unsigned int output_rows = 2; constexpr static unsigned int output_cols = 2; - constexpr static unsigned int input_rows = 6; - constexpr static unsigned int input_cols = 6; - - a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) + : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 5, 1) {} arm_gemm::VLType get_vl_type(void) const override { return vl_type; } - unsigned int get_kernel_rows(void) const override { return kernel_rows; } - unsigned int get_kernel_cols(void) const override { return kernel_cols; } - - unsigned int get_stride_rows(void) const override { return stride_rows; } - unsigned int get_stride_cols(void) const override { return stride_cols; } - - unsigned int get_output_rows(void) const override { return output_rows; } - unsigned int get_output_cols(void) const override { return output_cols; } - - unsigned int get_input_rows(void) const override { return input_rows; } - unsigned int get_input_cols(void) const override { return input_cols; } - - void indirect_kernel( - const void *const *const input_ptrs, - void *const *const outptrs, - const void *params, - unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const override - { - m_indirect_kernel( - reinterpret_cast(input_ptrs), - reinterpret_cast<__fp16 *const *>(outptrs), - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } - - void direct_kernel( - const unsigned int n_tile_rows, const unsigned int n_tile_cols, - const void *inptr, int64_t ld_input_row, 
int64_t ld_input_col, - void *outptr, int64_t ld_output_row, int64_t ld_output_col, - const void *params, unsigned int n_channels, - const void *activation_min, const void *activation_max - ) const override - { - m_direct_kernel( - n_tile_rows, n_tile_cols, - static_cast(inptr), ld_input_row, ld_input_col, - static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col, - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } + Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; } + Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp index 3468b70f29..1ccd3408e2 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,32 +28,24 @@ #pragma once -#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) namespace arm_conv { namespace depthwise { void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16); -struct a64_fp16_nhwc_generic_output9_mla_depthfirst +class a64_fp16_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<__fp16, __fp16, __fp16, __fp16> { - typedef __fp16 bias_type; - typedef __fp16 input_type; - typedef __fp16 weight_type; - typedef __fp16 return_type; + KernelType kernel = a64_fp16_nhwc_generic_output9_mla_depthfirst_impl; - typedef void (*kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, const void *, const unsigned int, const unsigned int, const __fp16, const __fp16); + public: + a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<__fp16, __fp16, __fp16, __fp16>(9, arm_gemm::VLType::None) {} - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - constexpr static unsigned int n_output_points = 9; - - kern_type kernel = a64_fp16_nhwc_generic_output9_mla_depthfirst_impl; - - a64_fp16_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {} + virtual KernelType get_kernel() const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp index 8ac79f82fa..423ee4190c 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_nhwc_generic_output9_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm 
Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,7 @@ #include #include -#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) namespace arm_conv { namespace depthwise { @@ -524,4 +524,4 @@ void a64_fp16_nhwc_generic_output9_mla_depthfirst_impl( } // namespace depthwise } // namespace arm_conv -#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp index a02a2b2984..8fcbce2cfe 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,35 +28,25 @@ #pragma once -#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) namespace arm_conv { namespace depthwise { void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16); -struct a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst +struct a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<__fp16, __fp16, __fp16, __fp16> { - typedef __fp16 bias_type; - typedef __fp16 input_type; - typedef __fp16 weight_type; - typedef __fp16 return_type; - - typedef void (*kern_type)(const __fp16 *const *const, __fp16 *const *const, const __fp16 *, const __fp16 *, const unsigned int, const unsigned int, const __fp16, const __fp16); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - constexpr static unsigned int output_rows(void) { return 2; }; - constexpr static unsigned int output_cols(void) { return 8; }; - - constexpr static unsigned int output_col_regs(void) { return 1; }; - - kern_type kernel = a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; - - a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {} + using Parent = GenericDepthfirstMultiplierKernelStrategy<__fp16, __fp16, __fp16, __fp16>; + a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) + : Parent(2, 8, arm_gemm::VLType::None) + { + } + Parent::KernelType kernel = a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git 
a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp index 7ed7c52db2..d9fc1403b2 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,7 @@ #include #include -#if defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) namespace arm_conv { namespace depthwise { @@ -1046,4 +1046,4 @@ void a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_im } // namespace depthwise } // namespace arm_conv -#endif // defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#endif // defined(__aarch64__) && defined(__ARM_FP16_ARGS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp index a888eb5776..420e95384d 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
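The removed constexpr output_col_regs() above encoded a fact that can now be derived from the (2, 8) output tile handed to the GenericDepthfirstMultiplierKernelStrategy constructor (presumably; the base's internals are not shown in this extract): with eight __fp16 lanes in a 128-bit Neon register, eight output columns occupy exactly one register per row. As a sketch:

    // Number of vector registers needed to hold one row of output columns.
    constexpr unsigned int output_col_regs(unsigned int output_cols,
                                           unsigned int lanes_per_vector)
    {
      return (output_cols + lanes_per_vector - 1) / lanes_per_vector;
    }
    static_assert(output_col_regs(8, 8) == 1, "one column register for the 2x8 fp16 tile");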
* * SPDX-License-Identifier: MIT * @@ -28,7 +28,7 @@ #pragma once -#if __aarch64__ +#if defined(__aarch64__) namespace arm_conv { namespace depthwise { @@ -36,19 +36,16 @@ namespace depthwise { void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); void a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); -class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy +class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { private: - typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); - indirect_kern_type m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl; - - typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); - direct_kern_type m_direct_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl; + using Parent = DepthwiseDepthfirstStrategy; + Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl; + Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl; public: - typedef float return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + using return_type = float; + constexpr static auto vl_type = arm_gemm::VLType::None; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; @@ -59,63 +56,16 @@ class a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs constexpr static unsigned int output_rows = 2; constexpr static unsigned int output_cols = 2; - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; - - a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) + : DepthwiseDepthfirstStrategy(2, 3, 1) {} arm_gemm::VLType get_vl_type(void) const override { return vl_type; } - unsigned int get_kernel_rows(void) const override { return kernel_rows; } - unsigned int get_kernel_cols(void) const override { return kernel_cols; } - - unsigned int get_stride_rows(void) const override { return stride_rows; } - unsigned int get_stride_cols(void) const override { return stride_cols; } - - unsigned int get_output_rows(void) const override { return output_rows; } - unsigned int get_output_cols(void) const override { return output_cols; } - - unsigned int get_input_rows(void) const override { return input_rows; } - unsigned int get_input_cols(void) const override { return input_cols; } - - void indirect_kernel( - const void *const *const input_ptrs, - void *const *const outptrs, - const void *params, - unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const override - { - m_indirect_kernel( - reinterpret_cast(input_ptrs), - reinterpret_cast(outptrs), - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } - - void direct_kernel( - const unsigned int n_tile_rows, const unsigned int n_tile_cols, - const void *inptr, int64_t ld_input_row, int64_t 
ld_input_col, - void *outptr, int64_t ld_output_row, int64_t ld_output_col, - const void *params, unsigned int n_channels, - const void *activation_min, const void *activation_max - ) const override - { - m_direct_kernel( - n_tile_rows, n_tile_cols, - static_cast(inptr), ld_input_row, ld_input_col, - static_cast(outptr), ld_output_row, ld_output_col, - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } + Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; } + Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // __aarch64__ +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp index 01bb06a561..0e9a3ba3fc 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,7 +28,7 @@ #pragma once -#if __aarch64__ +#if defined(__aarch64__) namespace arm_conv { namespace depthwise { @@ -36,19 +36,16 @@ namespace depthwise { void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); void a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); -class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirstStrategy +class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy { private: - typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); - indirect_kern_type m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl; - - typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); - direct_kern_type m_direct_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl; + using Parent = DepthwiseDepthfirstStrategy; + Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl; + Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl; public: - typedef float return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + using return_type = float; + constexpr static auto vl_type = arm_gemm::VLType::None; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; @@ -59,63 +56,16 @@ class a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirs constexpr static unsigned int output_rows = 3; constexpr static unsigned int output_cols = 3; - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 5; - - a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {} + 
a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) + : DepthwiseDepthfirstStrategy(3, 3, 1) {} arm_gemm::VLType get_vl_type(void) const override { return vl_type; } - unsigned int get_kernel_rows(void) const override { return kernel_rows; } - unsigned int get_kernel_cols(void) const override { return kernel_cols; } - - unsigned int get_stride_rows(void) const override { return stride_rows; } - unsigned int get_stride_cols(void) const override { return stride_cols; } - - unsigned int get_output_rows(void) const override { return output_rows; } - unsigned int get_output_cols(void) const override { return output_cols; } - - unsigned int get_input_rows(void) const override { return input_rows; } - unsigned int get_input_cols(void) const override { return input_cols; } - - void indirect_kernel( - const void *const *const input_ptrs, - void *const *const outptrs, - const void *params, - unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const override - { - m_indirect_kernel( - reinterpret_cast(input_ptrs), - reinterpret_cast(outptrs), - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } - - void direct_kernel( - const unsigned int n_tile_rows, const unsigned int n_tile_cols, - const void *inptr, int64_t ld_input_row, int64_t ld_input_col, - void *outptr, int64_t ld_output_row, int64_t ld_output_col, - const void *params, unsigned int n_channels, - const void *activation_min, const void *activation_max - ) const override - { - m_direct_kernel( - n_tile_rows, n_tile_cols, - static_cast(inptr), ld_input_row, ld_input_col, - static_cast(outptr), ld_output_row, ld_output_col, - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } + Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; } + Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // __aarch64__ +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp index 17084b57d5..6c897d6eaa 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
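Every fp32 tile strategy in this patch is rewritten the same way: the hand-written typedefs, get_*() accessors and kernel-dispatch shims are deleted, and the class derives from the templated DepthwiseDepthfirstStrategy, handing its output tile size, kernel size and stride to the parent constructor, which lets the parent serve the accessors and derive the input tile size that each class previously declared by hand. A minimal sketch of the resulting shape (ExampleStrategy and the example_*_impl symbols are hypothetical stand-ins, and the <float, float, float, float> instantiation of the parent is an assumption, not taken from this patch):

class ExampleStrategy : public DepthwiseDepthfirstStrategy<float, float, float, float>
{
  private:
  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
  Parent::IndirectKernelType m_indirect_kernel = example_indirect_impl;
  Parent::DirectKernelType m_direct_kernel = example_direct_impl;

  public:
  // Output tile, kernel size and stride go to the parent; compare the
  // generated classes here, e.g. (2, 3, 1) for 2x2 output, 3x3 kernel, stride 1.
  ExampleStrategy(const CPUInfo *) : Parent(2, 3, 1) {}

  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};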
* * SPDX-License-Identifier: MIT * @@ -28,7 +28,7 @@ #pragma once -#if __aarch64__ +#if defined(__aarch64__) namespace arm_conv { namespace depthwise { @@ -36,19 +36,16 @@ namespace depthwise { void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); void a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); -class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirstStrategy +class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy { private: - typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); - indirect_kern_type m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl; - - typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); - direct_kern_type m_direct_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl; + using Parent = DepthwiseDepthfirstStrategy; + Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl; + Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl; public: - typedef float return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + using return_type = float; + constexpr static auto vl_type = arm_gemm::VLType::None; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; @@ -59,63 +56,16 @@ class a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirs constexpr static unsigned int output_rows = 4; constexpr static unsigned int output_cols = 4; - constexpr static unsigned int input_rows = 6; - constexpr static unsigned int input_cols = 6; - - a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {} + a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) + : DepthwiseDepthfirstStrategy(4, 3, 1) {} arm_gemm::VLType get_vl_type(void) const override { return vl_type; } - unsigned int get_kernel_rows(void) const override { return kernel_rows; } - unsigned int get_kernel_cols(void) const override { return kernel_cols; } - - unsigned int get_stride_rows(void) const override { return stride_rows; } - unsigned int get_stride_cols(void) const override { return stride_cols; } - - unsigned int get_output_rows(void) const override { return output_rows; } - unsigned int get_output_cols(void) const override { return output_cols; } - - unsigned int get_input_rows(void) const override { return input_rows; } - unsigned int get_input_cols(void) const override { return input_cols; } - - void indirect_kernel( - const void *const *const input_ptrs, - void *const *const outptrs, - const void *params, - unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const override - { - m_indirect_kernel( - reinterpret_cast(input_ptrs), - reinterpret_cast(outptrs), - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } - - void direct_kernel( - const unsigned int n_tile_rows, const unsigned int n_tile_cols, - const void *inptr, int64_t ld_input_row, int64_t 
ld_input_col, - void *outptr, int64_t ld_output_row, int64_t ld_output_col, - const void *params, unsigned int n_channels, - const void *activation_min, const void *activation_max - ) const override - { - m_direct_kernel( - n_tile_rows, n_tile_cols, - static_cast(inptr), ld_input_row, ld_input_col, - static_cast(outptr), ld_output_row, ld_output_col, - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } + Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; } + Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // __aarch64__ +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp index f23862b699..ff521fb2ca 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,7 +28,7 @@ #pragma once -#if __aarch64__ +#if defined(__aarch64__) namespace arm_conv { namespace depthwise { @@ -36,19 +36,16 @@ namespace depthwise { void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); void a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); -class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy +class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { private: - typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); - indirect_kern_type m_indirect_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl; - - typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); - direct_kern_type m_direct_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl; + using Parent = DepthwiseDepthfirstStrategy; + Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl; + Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl; public: - typedef float return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + using return_type = float; + constexpr static auto vl_type = arm_gemm::VLType::None; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; @@ -59,63 +56,16 @@ class a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirs constexpr static unsigned int output_rows = 2; constexpr static unsigned int output_cols = 2; - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 5; - - a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} + 
a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) + : DepthwiseDepthfirstStrategy(2, 3, 2) {} arm_gemm::VLType get_vl_type(void) const override { return vl_type; } - unsigned int get_kernel_rows(void) const override { return kernel_rows; } - unsigned int get_kernel_cols(void) const override { return kernel_cols; } - - unsigned int get_stride_rows(void) const override { return stride_rows; } - unsigned int get_stride_cols(void) const override { return stride_cols; } - - unsigned int get_output_rows(void) const override { return output_rows; } - unsigned int get_output_cols(void) const override { return output_cols; } - - unsigned int get_input_rows(void) const override { return input_rows; } - unsigned int get_input_cols(void) const override { return input_cols; } - - void indirect_kernel( - const void *const *const input_ptrs, - void *const *const outptrs, - const void *params, - unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const override - { - m_indirect_kernel( - reinterpret_cast(input_ptrs), - reinterpret_cast(outptrs), - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } - - void direct_kernel( - const unsigned int n_tile_rows, const unsigned int n_tile_cols, - const void *inptr, int64_t ld_input_row, int64_t ld_input_col, - void *outptr, int64_t ld_output_row, int64_t ld_output_col, - const void *params, unsigned int n_channels, - const void *activation_min, const void *activation_max - ) const override - { - m_direct_kernel( - n_tile_rows, n_tile_cols, - static_cast(inptr), ld_input_row, ld_input_col, - static_cast(outptr), ld_output_row, ld_output_col, - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } + Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; } + Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // __aarch64__ +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp index e4bfbe6783..c88a7d57ce 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -36,19 +36,16 @@ namespace depthwise { void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); void a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); -class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy +class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { private: - typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float); - indirect_kern_type m_indirect_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl; - - typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float); - direct_kern_type m_direct_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl; + using Parent = DepthwiseDepthfirstStrategy; + Parent::IndirectKernelType m_indirect_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl; + Parent::DirectKernelType m_direct_kernel = a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl; public: - typedef float return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + using return_type = float; + constexpr static auto vl_type = arm_gemm::VLType::None; constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; @@ -59,60 +56,13 @@ class a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs constexpr static unsigned int output_rows = 2; constexpr static unsigned int output_cols = 2; - constexpr static unsigned int input_rows = 6; - constexpr static unsigned int input_cols = 6; - - a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) + : DepthwiseDepthfirstStrategy(2, 5, 1) {} arm_gemm::VLType get_vl_type(void) const override { return vl_type; } - unsigned int get_kernel_rows(void) const override { return kernel_rows; } - unsigned int get_kernel_cols(void) const override { return kernel_cols; } - - unsigned int get_stride_rows(void) const override { return stride_rows; } - unsigned int get_stride_cols(void) const override { return stride_cols; } - - unsigned int get_output_rows(void) const override { return output_rows; } - unsigned int get_output_cols(void) const override { return output_cols; } - - unsigned int get_input_rows(void) const override { return input_rows; } - unsigned int get_input_cols(void) const override { return input_cols; } - - void indirect_kernel( - const void *const *const input_ptrs, - void *const *const outptrs, - const void *params, - unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const override - { - m_indirect_kernel( - reinterpret_cast(input_ptrs), - reinterpret_cast(outptrs), - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } - - void direct_kernel( - const unsigned int n_tile_rows, const unsigned int n_tile_cols, - const void *inptr, int64_t ld_input_row, int64_t ld_input_col, - void *outptr, int64_t ld_output_row, int64_t ld_output_col, - const void *params, unsigned int n_channels, - 
const void *activation_min, const void *activation_max - ) const override - { - m_direct_kernel( - n_tile_rows, n_tile_cols, - static_cast(inptr), ld_input_row, ld_input_col, - static_cast(outptr), ld_output_row, ld_output_col, - params, n_channels, - *static_cast(activation_min), - *static_cast(activation_max) - ); - } + Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; } + Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp index 0f6cecdc56..6fa02b781e 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,28 +28,24 @@ #pragma once +#if defined(__aarch64__) + namespace arm_conv { namespace depthwise { void a64_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float); -struct a64_fp32_nhwc_generic_output9_mla_depthfirst +class a64_fp32_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy { - typedef float bias_type; - typedef float input_type; - typedef float weight_type; - typedef float return_type; - - typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; + KernelType kernel = a64_fp32_nhwc_generic_output9_mla_depthfirst_impl; - constexpr static unsigned int n_output_points = 9; + public: + a64_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy(9, arm_gemm::VLType::None) {} - kern_type kernel = a64_fp32_nhwc_generic_output9_mla_depthfirst_impl; - - a64_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {} + KernelType get_kernel() const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp index 60f5ddd68f..2ec0525226 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
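The generic (arbitrary kernel size) strategy above is folded into the same scheme: its n_output_points constant and vl_type become constructor arguments of GenericDepthfirstKernelStrategy, the four typedefs go away, and the bare function pointer is reached through a get_kernel() override. A sketch with stand-in names (the <float> instantiation of the parent is an assumption):

class ExampleGenericStrategy : public GenericDepthfirstKernelStrategy<float>
{
  KernelType kernel = example_generic_impl;  // hypothetical impl symbol

  public:
  // Nine output points per call, no vector-length dependence
  ExampleGenericStrategy(const CPUInfo *)
  : GenericDepthfirstKernelStrategy<float>(9, arm_gemm::VLType::None) {}

  KernelType get_kernel() const override { return kernel; }
};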
* * SPDX-License-Identifier: MIT * @@ -28,39 +28,34 @@ #pragma once +#if defined(__aarch64__) + namespace arm_conv { namespace depthwise { void a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float); -struct a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst +struct a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst : DepthfirstMultiplierStrategy { - typedef float bias_type; - typedef float input_type; - typedef float weight_type; - typedef float return_type; - - typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - + using Parent = DepthfirstMultiplierStrategy; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 2; constexpr static unsigned int stride_cols = 2; - constexpr static unsigned int output_rows = 3; - constexpr static unsigned int output_cols = 3; - - constexpr static unsigned int input_rows = 7; - constexpr static unsigned int input_cols = 7; - constexpr static unsigned int input_col_quads = 2; + a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *) + : Parent(3, 3, kernel_rows, kernel_cols, stride_rows, stride_cols) + { + } - kern_type kernel = a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl; + arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; } - a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv + +#endif // defined(__aarch64__) \ No newline at end of file diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp index 92d6a757f2..5ae8dd3653 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -28,39 +28,34 @@ #pragma once +#if defined(__aarch64__) + namespace arm_conv { namespace depthwise { void a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float); -struct a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst +struct a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst : DepthfirstMultiplierStrategy { - typedef float bias_type; - typedef float input_type; - typedef float weight_type; - typedef float return_type; - - typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - + using Parent = DepthfirstMultiplierStrategy; constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 4; - - constexpr static unsigned int input_rows = 6; - constexpr static unsigned int input_cols = 8; - constexpr static unsigned int input_col_quads = 2; + a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *) + : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols) + { + } - kern_type kernel = a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl; + arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; } - a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp index 2cc2f7c103..d60e15ec84 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -28,31 +28,25 @@ #pragma once +#if defined(__aarch64__) + namespace arm_conv { namespace depthwise { void a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float); -struct a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst +struct a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy { - typedef float bias_type; - typedef float input_type; - typedef float weight_type; - typedef float return_type; - - typedef void (*kern_type)(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - constexpr static unsigned int output_rows(void) { return 2; }; - constexpr static unsigned int output_cols(void) { return 8; }; - - constexpr static unsigned int output_col_regs(void) { return 2; }; - - kern_type kernel = a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; - - a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {} + using Parent = GenericDepthfirstMultiplierKernelStrategy; + a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) + : Parent(2, 8, arm_gemm::VLType::None) + { + } + Parent::KernelType kernel = a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp index c76cb9906f..62e4a82fbf 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
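The with-multiplier strategies get the matching treatment: the fixed-size classes now derive from DepthfirstMultiplierStrategy and pass (output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) to the parent, the generic one derives from GenericDepthfirstMultiplierKernelStrategy with (output_rows, output_cols, vl_type) as in the Parent(2, 8, arm_gemm::VLType::None) call above, and the input_rows/input_cols/input_col_quads constants vanish because the parent can compute them. A sketch of the fixed-size case (stand-in names; the <float> instantiation is assumed):

struct ExampleMultiplierStrategy : DepthfirstMultiplierStrategy<float>
{
  using Parent = DepthfirstMultiplierStrategy<float>;

  // 3x3 output tile, 3x3 kernel, stride 2, as in the s2 strategy above
  ExampleMultiplierStrategy(const CPUInfo *) : Parent(3, 3, 3, 3, 2, 2) {}

  arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; }

  Parent::KernelType kernel = example_multiplier_impl;  // hypothetical impl symbol
  Parent::KernelType get_kernel(void) const override { return kernel; }
};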
* * SPDX-License-Identifier: MIT * @@ -34,39 +34,40 @@ namespace arm_conv { namespace depthwise { -void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); +void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *); -struct a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst +class a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; - - constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_dot::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_dot::get_packed_size; - - kern_type kernel = a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; - - a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {} + a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} + + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } + + Parent::KernelType kernel = a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + size_t get_storage_size(const DepthwiseArgs &args) const override + { + return interleave_a64_s8q_3x3_dot::get_packed_size(args); + } + + void pack_parameters( + const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, + const void *weights, size_t ld_weight_col, size_t ld_weight_row + ) const override + { + interleave_a64_s8q_3x3_dot::pack_parameters( + args.input_channels, buffer, reinterpret_cast<const int32_t *>(biases), + reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row + ); + } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp index ed8cd4861e..f8245fc5d9 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited.
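For the quantised kernels the constexpr function-pointer members pack_parameters/get_packed_size are replaced by get_storage_size() and pack_parameters() virtual overrides that forward to the interleave helpers, so a driver can size and fill the packed-parameter block through the strategy object alone. A hedged sketch of the call sequence (pack_for is a hypothetical helper, not part of this patch; every argument value is assumed to be prepared by the surrounding driver code):

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint8_t> pack_for(
  const a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst &strat,
  const DepthwiseArgs &args, const int32_t *biases,
  const arm_gemm::Requantize32 &qp, const int8_t *weights,
  size_t ld_weight_col, size_t ld_weight_row)
{
  // Size the buffer, then let the strategy interleave bias, weights and
  // requantisation parameters into it.
  std::vector<uint8_t> packed(strat.get_storage_size(args));
  strat.pack_parameters(args, packed.data(), biases, qp, weights,
                        ld_weight_col, ld_weight_row);
  return packed;
}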
* * SPDX-License-Identifier: MIT * @@ -30,7 +30,15 @@ namespace arm_conv { namespace depthwise { -void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp) +void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl( + const unsigned int n_channels, + const int8_t *const *const inptrs, + const int8_t *params, + const int32_t *, // Bias, should be wrapped into the parameters + const arm_gemm::Requantize32& qp, + const int32_t *, const int32_t *, // Requant parameters, also wrapped + int8_t *const *const outptrs +) { __asm__ __volatile__( "ldp x13, x12, [%x[inptrs], #0x0]\n" @@ -1307,7 +1315,7 @@ void a64_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *cons "34:" // End "add SP, SP, #0x80\n" : [params] "+&r" (params) - : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp index 76c927abcb..c1baab43e5 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
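The quantised kernel entry points are likewise unified. The old per-kernel signature (pointer arrays plus an opaque params block and a uint64_t channel count) becomes one shape shared across the requantised kernels, with the channel count first; the bias and per-channel requantisation pointers remain in the signature but are wrapped into the packed params block, as the comments in the definition above note, and the wrapper casts n_channels to long unsigned int for the inline-asm register operand. As a C++ sketch (the alias and parameter names are mine, read off the declaration above):

using QuantKernelFn = void (*)(
    unsigned int n_channels,
    const int8_t *const *inptrs,
    const int8_t *params,           // packed weights + bias + requant parameters
    const int32_t *bias,            // unused here: wrapped into params
    const arm_gemm::Requantize32 &qp,
    const int32_t *requant_muls,    // unused here: wrapped into params
    const int32_t *requant_shifts,  // unused here: wrapped into params
    int8_t *const *outptrs);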
* * SPDX-License-Identifier: MIT * @@ -36,37 +36,24 @@ namespace depthwise { void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); -struct a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst +class a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; - - constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size; + a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} - kern_type kernel = a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } - a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp index 3001276fb5..0e8d16fc40 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
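The rewritten assembly that follows keeps the same requantisation scheme in substance, but now narrows through 16 bits with SQXTN/SQXTN2 and loads c_offset, minval and maxval as 8h vectors (v14, v17, v15) rather than 4s, so a single set of constants serves all four accumulator sets; the mla strategy above also reports get_accumulator_depth_vl() == 2, presumably so the driver blocks channels two vector lengths at a time. A scalar model of one output lane, illustrative only (names are mine; SQDMULH saturation corner cases are omitted and the per-layer shift is assumed non-positive):

#include <algorithm>
#include <cstdint>

// Rounding arithmetic shift right, as SRSHL performs for a negative shift
// operand; ties round toward +infinity here (see note below).
static int32_t rounding_ashr(int32_t v, int32_t rshift)
{
  if (rshift <= 0) return v;
  return (int32_t) (((int64_t) v + ((int64_t) 1 << (rshift - 1))) >> rshift);
}

// Saturating narrow to 16 bits (SQXTN)
static int16_t saturate_s16(int32_t v)
{
  return (int16_t) std::min<int32_t>(std::max<int32_t>(v, INT16_MIN), INT16_MAX);
}

// One output lane; the vector code applies this to four accumulators per
// instruction and interleaves it with the SMLAL/SMLAL2 accumulation.
static int8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                              int16_t c_offset, int16_t minval, int16_t maxval)
{
  int32_t v = (int32_t) (((int64_t) acc * mul) >> 31);         // SQDMULH: high half of doubled product
  v = rounding_ashr(v, -shift);                                // SRSHL by the (non-positive) shift
  int32_t w = (int32_t) saturate_s16(v) + c_offset;            // SQXTN, then SQADD of c_offset
  w = std::max<int32_t>(std::min<int32_t>(w, maxval), minval); // SMIN/SMAX clamp
  return (int8_t) w;                                           // UZP1 keeps the low bytes
}

The AND/SSHR/SQADD trio ahead of each SRSHL in the assembly in effect subtracts one from negative accumulators first, so that ties round away from zero rather than toward positive infinity as in the simplified model above.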
* * SPDX-License-Identifier: MIT * @@ -46,7 +46,7 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( struct Params { long unsigned int n_channels; - const int8_t *weights; + const void *weights; const int32_t *bias; const arm_gemm::Requantize32 *requant; const int32_t *const requant_muls; @@ -57,7 +57,7 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( Params( long unsigned int n_channels, const int8_t *const *inptrs_raw, - const int8_t *const weights, + const void *const weights, const int32_t *const bias, const arm_gemm::Requantize32 &qp, const int32_t *const requant_muls, @@ -91,513 +91,497 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( requant_muls, requant_shifts, outptrs); __asm__ __volatile__( + "ldr x19, [%x[params], %[offsetof_Params_requant]]\n" "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n" - "mov x17, #0x0\n" - "ldr x16, [%x[params], %[offsetof_Params_weights]]\n" + "add x24, x19, %[offsetof_Requantize32_a_offset]\n" + "add x23, x19, %[offsetof_Requantize32_b_offset]\n" + "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x21, x19, %[offsetof_Requantize32_c_offset]\n" + "add x20, x19, %[offsetof_Requantize32_minval]\n" + "ldr x17, [%x[params], %[offsetof_Params_weights]]\n" + "add x19, x19, %[offsetof_Requantize32_maxval]\n" + "ld1r { v22.16b }, [x24]\n" + "ld1r { v12.16b }, [x23]\n" + "lsr x16, x8, #0x3\n" + "ld1r { v14.8h }, [x21]\n" + "ld1r { v17.8h }, [x20]\n" "mov x15, #0x0\n" - "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" - "add x14, %x[params], %[offsetof_Params_inptrs]\n" + "mov x14, #0x0\n" + "ld1r { v15.8h }, [x19]\n" "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n" - "lsr x12, x8, #0x3\n" + "add x12, %x[params], %[offsetof_Params_inptrs]\n" "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n" - "add x19, x22, %[offsetof_Requantize32_a_offset]\n" - "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" - "add x20, x22, %[offsetof_Requantize32_b_offset]\n" - "ld1r { v14.16b }, [x19]\n" - "add x19, x22, %[offsetof_Requantize32_c_offset]\n" - "ld1r { v9.16b }, [x20]\n" - "add x20, x22, %[offsetof_Requantize32_minval]\n" - "ld1r { v15.4s }, [x19]\n" - "add x19, x22, %[offsetof_Requantize32_maxval]\n" - "ld1r { v24.4s }, [x20]\n" - "ld1r { v12.4s }, [x19]\n" - "ldp x10, x9, [x21, #0x0]\n" - "ldp x28, x27, [x21, #0x10]\n" - "cbz x12, 3f\n" - "subs x12, x12, #0x1\n" + "ldp x10, x9, [x22, #0x0]\n" + "ldp x28, x27, [x22, #0x10]\n" + "cbz x16, 3f\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "ldr q13, [x19, #0x0]\n" - "mov v17.16b, v13.16b\n" - "ldr q19, [x19, #0x10]\n" + "subs x16, x16, #0x1\n" + "mov v19.16b, v13.16b\n" + "ldr q26, [x19, #0x10]\n" "add x19, x19, #0x20\n" - "mov v16.16b, v13.16b\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" - "mov v23.16b, v13.16b\n" - "ldr d0, [x16, #0x0]\n" - "ssubl v0.8h, v0.8b, v9.8b\n" - "mov v25.16b, v19.16b\n" - "ldr d1, [x16, #0x8]\n" - "mov v21.16b, v19.16b\n" - "ldr d2, [x16, #0x10]\n" - "ssubl v1.8h, v1.8b, v9.8b\n" - "mov v20.16b, v19.16b\n" - "ldr d3, [x16, #0x18]\n" - "ldr d4, [x16, #0x20]\n" - "ssubl v2.8h, v2.8b, v9.8b\n" - "ldr d5, [x16, #0x28]\n" - "ssubl v3.8h, v3.8b, v9.8b\n" - "ldr d6, [x16, #0x30]\n" - "ldr d7, [x16, #0x38]\n" - "ssubl v4.8h, v4.8b, v9.8b\n" - "ldr d8, [x16, #0x40]\n" - "ssubl v5.8h, v5.8b, v9.8b\n" - "ldp x23, x22, [x14, #0x0]\n" - "ssubl v6.8h, v6.8b, v9.8b\n" - "ldp x21, x20, [x14, #0x10]\n" - "ssubl v7.8h, v7.8b, v9.8b\n" - "ssubl v8.8h, v8.8b, v9.8b\n" - "ldr x19, [x14, #0x20]\n" - "ldr d31, [x23, x17]\n" - 
"ssubl v31.8h, v31.8b, v14.8b\n" - "ldr d30, [x22, x17]\n" - "ldr d29, [x21, x17]\n" - "ssubl v30.8h, v30.8b, v14.8b\n" - "ldr d28, [x20, x17]\n" - "ldr d27, [x19, x17]\n" - "ssubl v29.8h, v29.8b, v14.8b\n" - "ssubl v28.8h, v28.8b, v14.8b\n" - "ssubl v27.8h, v27.8b, v14.8b\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v11.16b, v26.16b\n" + "mov v18.16b, v13.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v24.16b, v26.16b\n" + "mov v9.16b, v13.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v23.16b, v26.16b\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ldp x23, x22, [x12, #0x0]\n" + "ldp x21, x20, [x12, #0x10]\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ssubl v4.8h, v4.8b, v12.8b\n" + "ldr x19, [x12, #0x20]\n" + "ldr d31, [x23, x15]\n" + "ssubl v5.8h, v5.8b, v12.8b\n" + "ssubl v6.8h, v6.8b, v12.8b\n" + "ldr d30, [x22, x15]\n" + "ldr d29, [x21, x15]\n" + "ssubl v7.8h, v7.8b, v12.8b\n" + "ssubl v8.8h, v8.8b, v12.8b\n" + "ldr d28, [x20, x15]\n" + "ldr d27, [x19, x15]\n" + "ssubl v31.8h, v31.8b, v22.8b\n" + "ssubl v30.8h, v30.8b, v22.8b\n" + "ssubl v29.8h, v29.8b, v22.8b\n" + "ssubl v28.8h, v28.8b, v22.8b\n" + "ssubl v27.8h, v27.8b, v22.8b\n" "beq 2f\n" "1:" // Loop "smlal v13.4s, v31.4h, v4.4h\n" - "ldr x21, [x14, #0x28]\n" - "add x16, x16, #0x48\n" - "smlal2 v19.4s, v31.8h, v4.8h\n" - "ldr x20, [x14, #0x30]\n" - "subs x12, x12, #0x1\n" - "smlal v17.4s, v31.4h, v3.4h\n" - "ldr x26, [x14, #0x38]\n" - "smlal2 v25.4s, v31.8h, v3.8h\n" - "ldr x25, [x14, #0x40]\n" - "smlal v16.4s, v31.4h, v1.4h\n" - "ldr x19, [x14, #0x48]\n" - "smlal2 v21.4s, v31.8h, v1.8h\n" - "ldr x24, [x14, #0x50]\n" - "smlal v23.4s, v31.4h, v0.4h\n" - "ldr x23, [x14, #0x58]\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ldr d31, [x21, x17]\n" - "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal2 v26.4s, v31.8h, v4.8h\n" + "ldr x21, [x12, #0x28]\n" + "ldr x26, [x12, #0x38]\n" + "smlal v19.4s, v31.4h, v3.4h\n" + "smlal2 v11.4s, v31.8h, v3.8h\n" + "ldr x20, [x12, #0x30]\n" + "ldr x25, [x12, #0x40]\n" "smlal v13.4s, v30.4h, v0.4h\n" - "ldr x22, [x14, #0x60]\n" - "smlal2 v19.4s, v30.8h, v0.8h\n" - "ldr d30, [x19, x17]\n" - "ssubl v30.8h, v30.8b, v14.8b\n" - "smlal v17.4s, v29.4h, v2.4h\n" - "ldr x21, [x14, #0x68]\n" - "smlal2 v25.4s, v29.8h, v2.8h\n" - "ldr d29, [x20, x17]\n" - "ssubl v29.8h, v29.8b, v14.8b\n" + "smlal2 v26.4s, v30.8h, v0.8h\n" + "ldr x19, [x12, #0x48]\n" + "ldr d30, [x19, x15]\n" + "smlal v19.4s, v29.4h, v2.4h\n" + "smlal2 v11.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x15]\n" + "ssubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v24.4s, v31.8h, v1.8h\n" + "ldr x24, [x12, #0x50]\n" + "ldr x23, [x12, #0x58]\n" + "smlal v9.4s, v31.4h, v0.4h\n" + "smlal2 v23.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x15]\n" + "ssubl v31.8h, v31.8b, v22.8b\n" "smlal v13.4s, v28.4h, v5.4h\n" - "ldr x20, [x14, #0x70]\n" - "smlal2 v19.4s, v28.8h, v5.8h\n" - "ldr x19, [x14, #0x78]\n" - "smlal v17.4s, v28.4h, v4.4h\n" - "ldr q26, [x13, #0x0]\n" - "smlal2 v25.4s, v28.8h, v4.8h\n" - "ldr q10, [x11, #0x0]\n" - "smlal v16.4s, v28.4h, v2.4h\n" - "ldr q11, [x13, #0x10]\n" - "add x13, x13, #0x20\n" - "smlal2 v21.4s, v28.8h, v2.8h\n" - "ldr q18, [x11, #0x10]\n" - "add x11, x11, #0x20\n" - "smlal v23.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "ldr d28, [x26, x17]\n" - "ssubl v28.8h, v28.8b, v14.8b\n" - "smlal v16.4s, v31.4h, v6.4h\n" - "smlal2 
v21.4s, v31.8h, v6.8h\n" - "ldr d31, [x25, x17]\n" - "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal2 v26.4s, v28.8h, v5.8h\n" + "ssubl v30.8h, v30.8b, v22.8b\n" + "ldr x22, [x12, #0x60]\n" + "smlal v19.4s, v28.4h, v4.4h\n" + "smlal2 v11.4s, v28.8h, v4.8h\n" + "ldr x21, [x12, #0x68]\n" + "ldr x20, [x12, #0x70]\n" + "smlal v18.4s, v28.4h, v2.4h\n" + "smlal2 v24.4s, v28.8h, v2.8h\n" + "ldr x19, [x12, #0x78]\n" + "ldr q21, [x13, #0x0]\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x15]\n" + "ssubl v28.8h, v28.8b, v22.8b\n" "smlal v13.4s, v27.4h, v7.4h\n" - "smlal2 v19.4s, v27.8h, v7.8h\n" - "smlal v17.4s, v27.4h, v6.4h\n" - "smlal2 v25.4s, v27.8h, v6.8h\n" - "smlal v16.4s, v27.4h, v4.4h\n" - "smlal2 v21.4s, v27.8h, v4.8h\n" - "smlal v23.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" + "smlal2 v26.4s, v27.8h, v7.8h\n" + "ldr q25, [x11, #0x0]\n" + "ldr q10, [x13, #0x10]\n" + "smlal v19.4s, v27.4h, v6.4h\n" + "smlal2 v11.4s, v27.8h, v6.8h\n" + "ldr q16, [x11, #0x10]\n" + "add x17, x17, #0x48\n" + "smlal v18.4s, v31.4h, v6.4h\n" + "smlal2 v24.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x15]\n" + "ssubl v31.8h, v31.8b, v22.8b\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" + "subs x16, x16, #0x1\n" + "add x13, x13, #0x20\n" "smlal v13.4s, v28.4h, v1.4h\n" - "smlal2 v19.4s, v28.8h, v1.8h\n" - "smlal v23.4s, v29.4h, v8.4h\n" - "smlal2 v20.4s, v29.8h, v8.8h\n" - "ldr d29, [x24, x17]\n" - "ssubl v29.8h, v29.8b, v14.8b\n" - "smlal v17.4s, v28.4h, v0.4h\n" - "smlal2 v25.4s, v28.8h, v0.8h\n" - "ldr d28, [x23, x17]\n" - "ssubl v28.8h, v28.8b, v14.8b\n" + "smlal2 v26.4s, v28.8h, v1.8h\n" + "add x11, x11, #0x20\n" + "smlal v19.4s, v28.4h, v0.4h\n" + "smlal2 v11.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x15]\n" + "ssubl v28.8h, v28.8b, v22.8b\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v24.4s, v27.8h, v4.8h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x15]\n" + "ssubl v29.8h, v29.8b, v22.8b\n" "smlal v13.4s, v31.4h, v2.4h\n" - "smlal2 v19.4s, v31.8h, v2.8h\n" - "smlal v17.4s, v31.4h, v1.4h\n" - "smlal2 v25.4s, v31.8h, v1.8h\n" - "ldr d31, [x22, x17]\n" - "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal2 v26.4s, v31.8h, v2.8h\n" + "smlal v19.4s, v31.4h, v1.4h\n" + "smlal2 v11.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x15]\n" + "ssubl v31.8h, v31.8b, v22.8b\n" + "smlal v18.4s, v30.4h, v5.4h\n" + "smlal v9.4s, v30.4h, v4.4h\n" "smlal v13.4s, v30.4h, v8.4h\n" - "smlal2 v19.4s, v30.8h, v8.8h\n" - "smlal v17.4s, v30.4h, v7.4h\n" - "smlal2 v25.4s, v30.8h, v7.8h\n" - "smlal v16.4s, v30.4h, v5.4h\n" - "smlal2 v21.4s, v30.8h, v5.8h\n" - "smlal v23.4s, v30.4h, v4.4h\n" - "smlal2 v20.4s, v30.8h, v4.8h\n" - "ldr d30, [x21, x17]\n" - "ssubl v30.8h, v30.8b, v14.8b\n" + "smlal2 v26.4s, v30.8h, v8.8h\n" + "smlal v19.4s, v30.4h, v7.4h\n" + "smlal2 v11.4s, v30.8h, v7.8h\n" + "smlal2 v24.4s, v30.8h, v5.8h\n" + "smlal2 v23.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x15]\n" + "ssubl v30.8h, v30.8b, v22.8b\n" + "smlal v18.4s, v29.4h, v0.4h\n" + "smlal v9.4s, v28.4h, v2.4h\n" "smlal v13.4s, v29.4h, v3.4h\n" - "smlal2 v19.4s, v29.8h, v3.8h\n" - "smlal v16.4s, v29.4h, v0.4h\n" - "smlal2 v21.4s, v29.8h, v0.8h\n" - "ldr d29, [x20, x17]\n" - "ssubl v29.8h, v29.8b, v14.8b\n" - "smlal v17.4s, v28.4h, v5.4h\n" - "smlal2 v25.4s, v28.8h, v5.8h\n" - "smlal v23.4s, v28.4h, v2.4h\n" - "smlal2 v20.4s, v28.8h, v2.8h\n" - "ldr d28, [x19, x17]\n" - "add x17, x17, #0x8\n" + "smlal2 v26.4s, v29.8h, v3.8h\n" + "smlal2 v24.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, 
x15]\n" + "smlal2 v23.4s, v28.8h, v2.8h\n" + "ssubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v3.4h\n" + "smlal v9.4s, v30.4h, v5.4h\n" + "smlal v19.4s, v28.4h, v5.4h\n" + "smlal2 v11.4s, v28.8h, v5.8h\n" + "ldr d28, [x19, x15]\n" + "ssubl v28.8h, v28.8b, v22.8b\n" + "smlal2 v24.4s, v31.8h, v3.8h\n" + "smlal2 v23.4s, v30.8h, v5.8h\n" + "add x15, x15, #0x8\n" + "smlal v18.4s, v29.4h, v7.4h\n" + "smlal v9.4s, v29.4h, v6.4h\n" + "smlal2 v24.4s, v29.8h, v7.8h\n" + "smlal2 v23.4s, v29.8h, v6.8h\n" "smlal v13.4s, v31.4h, v6.4h\n" - "ssubl v28.8h, v28.8b, v14.8b\n" - "smlal2 v19.4s, v31.8h, v6.8h\n" - "smlal v16.4s, v31.4h, v3.4h\n" - "smlal2 v21.4s, v31.8h, v3.8h\n" - "smlal v17.4s, v30.4h, v8.4h\n" - "smlal2 v25.4s, v30.8h, v8.8h\n" - "smlal v23.4s, v30.4h, v5.4h\n" - "smlal2 v20.4s, v30.8h, v5.8h\n" - "smlal v16.4s, v29.4h, v7.4h\n" - "smlal2 v21.4s, v29.8h, v7.8h\n" - "smlal v23.4s, v29.4h, v6.4h\n" - "smlal2 v20.4s, v29.8h, v6.8h\n" - "smlal v16.4s, v28.4h, v8.4h\n" - "smlal2 v21.4s, v28.8h, v8.8h\n" - "smlal v23.4s, v28.4h, v7.4h\n" - "smlal2 v20.4s, v28.8h, v7.8h\n" - "sqrdmulh v13.4s, v13.4s, v26.4s\n" - "sqrdmulh v19.4s, v19.4s, v11.4s\n" - "sqrdmulh v17.4s, v17.4s, v26.4s\n" - "sqrdmulh v25.4s, v25.4s, v11.4s\n" - "and v22.16b, v13.16b, v10.16b\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "and v28.16b, v19.16b, v18.16b\n" - "and v3.16b, v17.16b, v10.16b\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "and v6.16b, v25.16b, v18.16b\n" - "sqrdmulh v16.4s, v16.4s, v26.4s\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqrdmulh v21.4s, v21.4s, v11.4s\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sqadd v13.4s, v13.4s, v22.4s\n" - "sqrdmulh v23.4s, v23.4s, v26.4s\n" - "and v0.16b, v16.16b, v10.16b\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "srshl v13.4s, v13.4s, v10.4s\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "sqadd v17.4s, v17.4s, v3.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "and v29.16b, v21.16b, v18.16b\n" + "smlal v19.4s, v30.4h, v8.4h\n" + "sqdmulh v13.4s, v13.4s, v21.4s\n" + "smlal v18.4s, v28.4h, v8.4h\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "sqdmulh v19.4s, v19.4s, v21.4s\n" + "smlal2 v26.4s, v31.8h, v6.8h\n" + "smlal2 v11.4s, v30.8h, v8.8h\n" + "sqdmulh v18.4s, v18.4s, v21.4s\n" + "smlal2 v24.4s, v28.8h, v8.8h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" + "sqdmulh v9.4s, v9.4s, v21.4s\n" + "and v7.16b, v13.16b, v25.16b\n" + "sqdmulh v26.4s, v26.4s, v10.4s\n" + "and v4.16b, v19.16b, v25.16b\n" + "sqdmulh v11.4s, v11.4s, v10.4s\n" + "and v21.16b, v18.16b, v25.16b\n" + "sqdmulh v24.4s, v24.4s, v10.4s\n" + "and v20.16b, v9.16b, v25.16b\n" + "sqdmulh v23.4s, v23.4s, v10.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v29.16b, v26.16b, v16.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v10.16b, v11.16b, v16.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v31.16b, v24.16b, v16.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v30.16b, v23.16b, v16.16b\n" + "sqadd v13.4s, v13.4s, v7.4s\n" "sshr v29.4s, v29.4s, #0x1f\n" - "add v13.4s, v13.4s, v15.4s\n" - "srshl v19.4s, v19.4s, v18.4s\n" - "srshl v17.4s, v17.4s, v10.4s\n" - "srshl v25.4s, v25.4s, v18.4s\n" - "smin v13.4s, v13.4s, v12.4s\n" - "add v19.4s, v19.4s, v15.4s\n" - "add v17.4s, v17.4s, v15.4s\n" - "smax v13.4s, v13.4s, v24.4s\n" - "smin v19.4s, v19.4s, v12.4s\n" - "smin v17.4s, v17.4s, v12.4s\n" - "add v25.4s, v25.4s, v15.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smin v25.4s, v25.4s, v12.4s\n" - "uzp1 v13.16b, v13.16b, v19.16b\n" - "sqadd v16.4s, v16.4s, v0.4s\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, 
v21.4s\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v20.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v25.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "srshl v19.4s, v19.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v10.4s\n" + "srshl v18.4s, v18.4s, v25.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "srshl v9.4s, v9.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "srshl v26.4s, v26.4s, v16.4s\n" + "sqxtn v13.4h, v13.4s\n" + "srshl v11.4s, v11.4s, v16.4s\n" + "sqxtn v19.4h, v19.4s\n" + "srshl v24.4s, v24.4s, v16.4s\n" + "sqxtn v18.4h, v18.4s\n" + "srshl v23.4s, v23.4s, v16.4s\n" + "sqxtn v9.4h, v9.4s\n" + "sqxtn2 v13.8h, v26.4s\n" + "sqxtn2 v19.8h, v11.4s\n" + "sqxtn2 v18.8h, v24.4s\n" + "sqxtn2 v9.8h, v23.4s\n" + "sqadd v13.8h, v13.8h, v14.8h\n" + "sqadd v19.8h, v19.8h, v14.8h\n" + "sqadd v18.8h, v18.8h, v14.8h\n" + "sqadd v9.8h, v9.8h, v14.8h\n" + "smax v13.8h, v13.8h, v17.8h\n" + "smax v19.8h, v19.8h, v17.8h\n" + "smax v18.8h, v18.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smin v13.8h, v13.8h, v15.8h\n" + "smin v19.8h, v19.8h, v15.8h\n" + "smin v18.8h, v18.8h, v15.8h\n" + "smin v9.8h, v9.8h, v15.8h\n" "uzp1 v13.16b, v13.16b, v13.16b\n" - "str d13, [x10, x15]\n" - "smax v25.4s, v25.4s, v24.4s\n" - "sqadd v21.4s, v21.4s, v29.4s\n" - "srshl v16.4s, v16.4s, v10.4s\n" - "and v3.16b, v23.16b, v10.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "uzp1 v17.16b, v17.16b, v25.16b\n" - "add v16.4s, v16.4s, v15.4s\n" - "srshl v21.4s, v21.4s, v18.4s\n" - "uzp1 v17.16b, v17.16b, v17.16b\n" - "str d17, [x9, x15]\n" - "smin v16.4s, v16.4s, v12.4s\n" - "sqrdmulh v20.4s, v20.4s, v11.4s\n" - "add v21.4s, v21.4s, v15.4s\n" - "sqadd v23.4s, v23.4s, v3.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smin v21.4s, v21.4s, v12.4s\n" - "and v25.16b, v20.16b, v18.16b\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "smax v21.4s, v21.4s, v24.4s\n" - "srshl v23.4s, v23.4s, v10.4s\n" - "uzp1 v16.16b, v16.16b, v21.16b\n" - "add v23.4s, v23.4s, v15.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "str d16, [x28, x15]\n" - "smin v23.4s, v23.4s, v12.4s\n" - "sqadd v20.4s, v20.4s, v25.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" - "srshl v20.4s, v20.4s, v18.4s\n" - "add v20.4s, v20.4s, v15.4s\n" - "smin v20.4s, v20.4s, v12.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "uzp1 v23.16b, v23.16b, v20.16b\n" - "uzp1 v23.16b, v23.16b, v23.16b\n" - "str d23, [x27, x15]\n" - "add x15, x15, #0x8\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str d13, [x10, x14]\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "str d19, [x9, x14]\n" + "str d18, [x28, x14]\n" + "str d9, [x27, x14]\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "ldr q13, [x19, #0x0]\n" - "mov v17.16b, v13.16b\n" - "ldr q19, [x19, #0x10]\n" + "add x14, x14, #0x8\n" + "ldr q26, [x19, #0x10]\n" "add x19, x19, #0x20\n" - "mov v16.16b, v13.16b\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" - "mov v23.16b, v13.16b\n" - "ldr d0, [x16, #0x0]\n" - "ssubl v0.8h, v0.8b, v9.8b\n" - "mov v25.16b, v19.16b\n" - "ldr d1, [x16, #0x8]\n" - "mov v21.16b, v19.16b\n" - "ldr d2, [x16, #0x10]\n" - "ssubl v1.8h, v1.8b, v9.8b\n" - "mov v20.16b, v19.16b\n" - "ldr d3, [x16, #0x18]\n" - "ldr d4, [x16, #0x20]\n" - "ssubl v2.8h, v2.8b, v9.8b\n" - "ldr d5, [x16, #0x28]\n" - "ssubl v3.8h, v3.8b, v9.8b\n" - "ldr d6, [x16, #0x30]\n" - "ldr d7, [x16, #0x38]\n" - "ssubl v4.8h, v4.8b, v9.8b\n" - "ldr d8, [x16, #0x40]\n" - "ssubl v5.8h, v5.8b, v9.8b\n" - "ldp x23, x22, [x14, #0x0]\n" - "ssubl v6.8h, v6.8b, v9.8b\n" - "ldp x21, x20, [x14, #0x10]\n" - "ssubl v7.8h, v7.8b, 
v9.8b\n" - "ssubl v8.8h, v8.8b, v9.8b\n" - "ldr x19, [x14, #0x20]\n" - "ldr d31, [x23, x17]\n" - "ssubl v31.8h, v31.8b, v14.8b\n" - "ldr d30, [x22, x17]\n" - "ldr d29, [x21, x17]\n" - "ssubl v30.8h, v30.8b, v14.8b\n" - "ldr d28, [x20, x17]\n" - "ldr d27, [x19, x17]\n" - "ssubl v29.8h, v29.8b, v14.8b\n" - "ssubl v28.8h, v28.8b, v14.8b\n" - "ssubl v27.8h, v27.8b, v14.8b\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v19.16b, v13.16b\n" + "mov v11.16b, v26.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v18.16b, v13.16b\n" + "mov v24.16b, v26.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v9.16b, v13.16b\n" + "mov v23.16b, v26.16b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ldp x23, x22, [x12, #0x0]\n" + "ldp x21, x20, [x12, #0x10]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ldr x19, [x12, #0x20]\n" + "ldr d31, [x23, x15]\n" + "ssubl v4.8h, v4.8b, v12.8b\n" + "ssubl v5.8h, v5.8b, v12.8b\n" + "ldr d30, [x22, x15]\n" + "ldr d29, [x21, x15]\n" + "ssubl v6.8h, v6.8b, v12.8b\n" + "ssubl v7.8h, v7.8b, v12.8b\n" + "ldr d28, [x20, x15]\n" + "ldr d27, [x19, x15]\n" + "ssubl v8.8h, v8.8b, v12.8b\n" + "ssubl v31.8h, v31.8b, v22.8b\n" + "ssubl v30.8h, v30.8b, v22.8b\n" + "ssubl v29.8h, v29.8b, v22.8b\n" + "ssubl v28.8h, v28.8b, v22.8b\n" + "ssubl v27.8h, v27.8b, v22.8b\n" "bgt 1b\n" "2:" // Tail "smlal v13.4s, v31.4h, v4.4h\n" - "ldr x21, [x14, #0x28]\n" - "tst x8, #0x7\n" - "smlal2 v19.4s, v31.8h, v4.8h\n" - "ldr x20, [x14, #0x30]\n" - "smlal v17.4s, v31.4h, v3.4h\n" - "ldr x26, [x14, #0x38]\n" - "smlal2 v25.4s, v31.8h, v3.8h\n" - "ldr x25, [x14, #0x40]\n" - "smlal v16.4s, v31.4h, v1.4h\n" - "ldr x19, [x14, #0x48]\n" - "smlal2 v21.4s, v31.8h, v1.8h\n" - "ldr x24, [x14, #0x50]\n" - "smlal v23.4s, v31.4h, v0.4h\n" - "ldr x23, [x14, #0x58]\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ldr d31, [x21, x17]\n" - "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal2 v26.4s, v31.8h, v4.8h\n" + "ldr x21, [x12, #0x28]\n" + "ldr x26, [x12, #0x38]\n" + "smlal v19.4s, v31.4h, v3.4h\n" + "smlal2 v11.4s, v31.8h, v3.8h\n" + "ldr x20, [x12, #0x30]\n" + "ldr x25, [x12, #0x40]\n" "smlal v13.4s, v30.4h, v0.4h\n" - "ldr x22, [x14, #0x60]\n" - "smlal2 v19.4s, v30.8h, v0.8h\n" - "ldr d30, [x19, x17]\n" - "ssubl v30.8h, v30.8b, v14.8b\n" - "smlal v17.4s, v29.4h, v2.4h\n" - "ldr x21, [x14, #0x68]\n" - "smlal2 v25.4s, v29.8h, v2.8h\n" - "ldr d29, [x20, x17]\n" - "ssubl v29.8h, v29.8b, v14.8b\n" + "smlal2 v26.4s, v30.8h, v0.8h\n" + "ldr x19, [x12, #0x48]\n" + "ldr d30, [x19, x15]\n" + "smlal v19.4s, v29.4h, v2.4h\n" + "smlal2 v11.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x15]\n" + "ssubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v24.4s, v31.8h, v1.8h\n" + "ldr x24, [x12, #0x50]\n" + "ldr x23, [x12, #0x58]\n" + "smlal v9.4s, v31.4h, v0.4h\n" + "smlal2 v23.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x15]\n" + "ssubl v31.8h, v31.8b, v22.8b\n" "smlal v13.4s, v28.4h, v5.4h\n" - "ldr x20, [x14, #0x70]\n" - "smlal2 v19.4s, v28.8h, v5.8h\n" - "ldr x19, [x14, #0x78]\n" - "smlal v17.4s, v28.4h, v4.4h\n" - "ldr q26, [x13, #0x0]\n" - "smlal2 v25.4s, v28.8h, v4.8h\n" - "ldr q10, [x11, #0x0]\n" - "smlal v16.4s, v28.4h, v2.4h\n" - "ldr q11, [x13, #0x10]\n" + "smlal2 v26.4s, v28.8h, v5.8h\n" + "ssubl v30.8h, v30.8b, v22.8b\n" + "ldr x22, [x12, #0x60]\n" + "smlal v19.4s, v28.4h, v4.4h\n" + "smlal2 v11.4s, v28.8h, v4.8h\n" + "ldr x21, [x12, #0x68]\n" + 
"ldr x20, [x12, #0x70]\n" + "smlal v18.4s, v28.4h, v2.4h\n" + "smlal2 v24.4s, v28.8h, v2.8h\n" + "ldr x19, [x12, #0x78]\n" + "ldr q21, [x13, #0x0]\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x15]\n" + "ssubl v28.8h, v28.8b, v22.8b\n" + "smlal v13.4s, v27.4h, v7.4h\n" + "smlal2 v26.4s, v27.8h, v7.8h\n" + "ldr q25, [x11, #0x0]\n" + "ldr q10, [x13, #0x10]\n" + "smlal v19.4s, v27.4h, v6.4h\n" + "smlal2 v11.4s, v27.8h, v6.8h\n" + "ldr q16, [x11, #0x10]\n" + "tst x8, #0x7\n" + "smlal v18.4s, v31.4h, v6.4h\n" + "smlal2 v24.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x15]\n" + "ssubl v31.8h, v31.8b, v22.8b\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" "add x13, x13, #0x20\n" - "smlal2 v21.4s, v28.8h, v2.8h\n" - "ldr q18, [x11, #0x10]\n" "add x11, x11, #0x20\n" - "smlal v23.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "ldr d28, [x26, x17]\n" - "ssubl v28.8h, v28.8b, v14.8b\n" - "smlal v16.4s, v31.4h, v6.4h\n" - "smlal2 v21.4s, v31.8h, v6.8h\n" - "ldr d31, [x25, x17]\n" - "ssubl v31.8h, v31.8b, v14.8b\n" - "smlal v13.4s, v27.4h, v7.4h\n" - "smlal2 v19.4s, v27.8h, v7.8h\n" - "smlal v17.4s, v27.4h, v6.4h\n" - "smlal2 v25.4s, v27.8h, v6.8h\n" - "smlal v16.4s, v27.4h, v4.4h\n" - "smlal2 v21.4s, v27.8h, v4.8h\n" - "smlal v23.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" "smlal v13.4s, v28.4h, v1.4h\n" - "smlal2 v19.4s, v28.8h, v1.8h\n" - "smlal v23.4s, v29.4h, v8.4h\n" - "smlal2 v20.4s, v29.8h, v8.8h\n" - "ldr d29, [x24, x17]\n" - "ssubl v29.8h, v29.8b, v14.8b\n" - "smlal v17.4s, v28.4h, v0.4h\n" - "smlal2 v25.4s, v28.8h, v0.8h\n" - "ldr d28, [x23, x17]\n" - "ssubl v28.8h, v28.8b, v14.8b\n" + "smlal2 v26.4s, v28.8h, v1.8h\n" + "smlal v19.4s, v28.4h, v0.4h\n" + "smlal2 v11.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x15]\n" + "ssubl v28.8h, v28.8b, v22.8b\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v24.4s, v27.8h, v4.8h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x15]\n" + "ssubl v29.8h, v29.8b, v22.8b\n" "smlal v13.4s, v31.4h, v2.4h\n" - "smlal2 v19.4s, v31.8h, v2.8h\n" - "smlal v17.4s, v31.4h, v1.4h\n" - "smlal2 v25.4s, v31.8h, v1.8h\n" - "ldr d31, [x22, x17]\n" - "ssubl v31.8h, v31.8b, v14.8b\n" + "smlal2 v26.4s, v31.8h, v2.8h\n" + "smlal v19.4s, v31.4h, v1.4h\n" + "smlal2 v11.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x15]\n" + "ssubl v31.8h, v31.8b, v22.8b\n" + "smlal v18.4s, v30.4h, v5.4h\n" + "smlal v9.4s, v30.4h, v4.4h\n" "smlal v13.4s, v30.4h, v8.4h\n" - "smlal2 v19.4s, v30.8h, v8.8h\n" - "smlal v17.4s, v30.4h, v7.4h\n" - "smlal2 v25.4s, v30.8h, v7.8h\n" - "smlal v16.4s, v30.4h, v5.4h\n" - "smlal2 v21.4s, v30.8h, v5.8h\n" - "smlal v23.4s, v30.4h, v4.4h\n" - "smlal2 v20.4s, v30.8h, v4.8h\n" - "ldr d30, [x21, x17]\n" - "ssubl v30.8h, v30.8b, v14.8b\n" + "smlal2 v26.4s, v30.8h, v8.8h\n" + "smlal v19.4s, v30.4h, v7.4h\n" + "smlal2 v11.4s, v30.8h, v7.8h\n" + "smlal2 v24.4s, v30.8h, v5.8h\n" + "smlal2 v23.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x15]\n" + "ssubl v30.8h, v30.8b, v22.8b\n" + "smlal v18.4s, v29.4h, v0.4h\n" + "smlal v9.4s, v28.4h, v2.4h\n" "smlal v13.4s, v29.4h, v3.4h\n" - "smlal2 v19.4s, v29.8h, v3.8h\n" - "smlal v16.4s, v29.4h, v0.4h\n" - "smlal2 v21.4s, v29.8h, v0.8h\n" - "ldr d29, [x20, x17]\n" - "ssubl v29.8h, v29.8b, v14.8b\n" - "smlal v17.4s, v28.4h, v5.4h\n" - "smlal2 v25.4s, v28.8h, v5.8h\n" - "smlal v23.4s, v28.4h, v2.4h\n" - "smlal2 v20.4s, v28.8h, v2.8h\n" - "ldr d28, [x19, x17]\n" - "add x17, x17, #0x8\n" + "smlal2 v26.4s, v29.8h, v3.8h\n" + "smlal2 
v24.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, x15]\n" + "smlal2 v23.4s, v28.8h, v2.8h\n" + "ssubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v3.4h\n" + "smlal v9.4s, v30.4h, v5.4h\n" + "smlal v19.4s, v28.4h, v5.4h\n" + "smlal2 v11.4s, v28.8h, v5.8h\n" + "ldr d28, [x19, x15]\n" + "ssubl v28.8h, v28.8b, v22.8b\n" + "smlal2 v24.4s, v31.8h, v3.8h\n" + "smlal2 v23.4s, v30.8h, v5.8h\n" + "add x15, x15, #0x8\n" + "smlal v18.4s, v29.4h, v7.4h\n" + "smlal v9.4s, v29.4h, v6.4h\n" + "smlal2 v24.4s, v29.8h, v7.8h\n" + "smlal2 v23.4s, v29.8h, v6.8h\n" "smlal v13.4s, v31.4h, v6.4h\n" - "ssubl v28.8h, v28.8b, v14.8b\n" - "smlal2 v19.4s, v31.8h, v6.8h\n" - "smlal v16.4s, v31.4h, v3.4h\n" - "smlal2 v21.4s, v31.8h, v3.8h\n" - "smlal v17.4s, v30.4h, v8.4h\n" - "smlal2 v25.4s, v30.8h, v8.8h\n" - "smlal v23.4s, v30.4h, v5.4h\n" - "smlal2 v20.4s, v30.8h, v5.8h\n" - "smlal v16.4s, v29.4h, v7.4h\n" - "smlal2 v21.4s, v29.8h, v7.8h\n" - "smlal v23.4s, v29.4h, v6.4h\n" - "smlal2 v20.4s, v29.8h, v6.8h\n" - "smlal v16.4s, v28.4h, v8.4h\n" - "smlal2 v21.4s, v28.8h, v8.8h\n" - "smlal v23.4s, v28.4h, v7.4h\n" - "smlal2 v20.4s, v28.8h, v7.8h\n" - "sqrdmulh v13.4s, v13.4s, v26.4s\n" - "sqrdmulh v19.4s, v19.4s, v11.4s\n" - "sqrdmulh v17.4s, v17.4s, v26.4s\n" - "sqrdmulh v25.4s, v25.4s, v11.4s\n" - "and v22.16b, v13.16b, v10.16b\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "and v28.16b, v19.16b, v18.16b\n" - "and v3.16b, v17.16b, v10.16b\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "and v6.16b, v25.16b, v18.16b\n" - "sqrdmulh v16.4s, v16.4s, v26.4s\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqrdmulh v21.4s, v21.4s, v11.4s\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sqadd v13.4s, v13.4s, v22.4s\n" - "sqrdmulh v23.4s, v23.4s, v26.4s\n" - "and v0.16b, v16.16b, v10.16b\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "srshl v13.4s, v13.4s, v10.4s\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "sqadd v17.4s, v17.4s, v3.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "and v29.16b, v21.16b, v18.16b\n" + "smlal v19.4s, v30.4h, v8.4h\n" + "sqrdmulh v13.4s, v13.4s, v21.4s\n" + "smlal v18.4s, v28.4h, v8.4h\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "sqrdmulh v19.4s, v19.4s, v21.4s\n" + "smlal2 v26.4s, v31.8h, v6.8h\n" + "smlal2 v11.4s, v30.8h, v8.8h\n" + "sqrdmulh v18.4s, v18.4s, v21.4s\n" + "smlal2 v24.4s, v28.8h, v8.8h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" + "sqrdmulh v9.4s, v9.4s, v21.4s\n" + "and v7.16b, v13.16b, v25.16b\n" + "sqrdmulh v26.4s, v26.4s, v10.4s\n" + "and v4.16b, v19.16b, v25.16b\n" + "sqrdmulh v11.4s, v11.4s, v10.4s\n" + "and v21.16b, v18.16b, v25.16b\n" + "sqrdmulh v24.4s, v24.4s, v10.4s\n" + "and v20.16b, v9.16b, v25.16b\n" + "sqrdmulh v23.4s, v23.4s, v10.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v29.16b, v26.16b, v16.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v10.16b, v11.16b, v16.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v31.16b, v24.16b, v16.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v30.16b, v23.16b, v16.16b\n" + "sqadd v13.4s, v13.4s, v7.4s\n" "sshr v29.4s, v29.4s, #0x1f\n" - "add v13.4s, v13.4s, v15.4s\n" - "srshl v19.4s, v19.4s, v18.4s\n" - "srshl v17.4s, v17.4s, v10.4s\n" - "srshl v25.4s, v25.4s, v18.4s\n" - "smin v13.4s, v13.4s, v12.4s\n" - "add v19.4s, v19.4s, v15.4s\n" - "add v17.4s, v17.4s, v15.4s\n" - "smax v13.4s, v13.4s, v24.4s\n" - "smin v19.4s, v19.4s, v12.4s\n" - "smin v17.4s, v17.4s, v12.4s\n" - "add v25.4s, v25.4s, v15.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smin v25.4s, v25.4s, v12.4s\n" - "uzp1 v13.16b, v13.16b, v19.16b\n" - "sqadd v16.4s, v16.4s, v0.4s\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sshr v10.4s, 
v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v20.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v25.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "srshl v19.4s, v19.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v10.4s\n" + "srshl v18.4s, v18.4s, v25.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "srshl v9.4s, v9.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "srshl v26.4s, v26.4s, v16.4s\n" + "sqxtn v13.4h, v13.4s\n" + "srshl v11.4s, v11.4s, v16.4s\n" + "sqxtn v19.4h, v19.4s\n" + "srshl v24.4s, v24.4s, v16.4s\n" + "sqxtn v18.4h, v18.4s\n" + "srshl v23.4s, v23.4s, v16.4s\n" + "sqxtn v9.4h, v9.4s\n" + "sqxtn2 v13.8h, v26.4s\n" + "sqxtn2 v19.8h, v11.4s\n" + "sqxtn2 v18.8h, v24.4s\n" + "sqxtn2 v9.8h, v23.4s\n" + "sqadd v13.8h, v13.8h, v14.8h\n" + "sqadd v19.8h, v19.8h, v14.8h\n" + "sqadd v18.8h, v18.8h, v14.8h\n" + "sqadd v9.8h, v9.8h, v14.8h\n" + "smax v13.8h, v13.8h, v17.8h\n" + "smax v19.8h, v19.8h, v17.8h\n" + "smax v18.8h, v18.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smin v13.8h, v13.8h, v15.8h\n" + "smin v19.8h, v19.8h, v15.8h\n" + "smin v18.8h, v18.8h, v15.8h\n" + "smin v9.8h, v9.8h, v15.8h\n" "uzp1 v13.16b, v13.16b, v13.16b\n" - "str d13, [x10, x15]\n" - "smax v25.4s, v25.4s, v24.4s\n" - "sqadd v21.4s, v21.4s, v29.4s\n" - "srshl v16.4s, v16.4s, v10.4s\n" - "and v3.16b, v23.16b, v10.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "uzp1 v17.16b, v17.16b, v25.16b\n" - "add v16.4s, v16.4s, v15.4s\n" - "srshl v21.4s, v21.4s, v18.4s\n" - "uzp1 v17.16b, v17.16b, v17.16b\n" - "str d17, [x9, x15]\n" - "smin v16.4s, v16.4s, v12.4s\n" - "sqrdmulh v20.4s, v20.4s, v11.4s\n" - "add v21.4s, v21.4s, v15.4s\n" - "sqadd v23.4s, v23.4s, v3.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smin v21.4s, v21.4s, v12.4s\n" - "and v25.16b, v20.16b, v18.16b\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "smax v21.4s, v21.4s, v24.4s\n" - "srshl v23.4s, v23.4s, v10.4s\n" - "uzp1 v16.16b, v16.16b, v21.16b\n" - "add v23.4s, v23.4s, v15.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "str d16, [x28, x15]\n" - "smin v23.4s, v23.4s, v12.4s\n" - "sqadd v20.4s, v20.4s, v25.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" - "srshl v20.4s, v20.4s, v18.4s\n" - "add v20.4s, v20.4s, v15.4s\n" - "smin v20.4s, v20.4s, v12.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "uzp1 v23.16b, v23.16b, v20.16b\n" - "uzp1 v23.16b, v23.16b, v23.16b\n" - "str d23, [x27, x15]\n" - "add x15, x15, #0x8\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str d13, [x10, x14]\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "str d19, [x9, x14]\n" + "str d18, [x28, x14]\n" + "str d9, [x27, x14]\n" + "add x14, x14, #0x8\n" "beq 64f\n" - "add x16, x16, #0x48\n" + "add x17, x17, #0x48\n" "3:" // Oddments "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "tbz x8, #2, 5f\n" "ld1 { v13.4s }, [x19], #0x10\n" "tbz x8, #1, 4f\n" - "ld1 { v19.d }[0], [x19], #0x8\n" + "ld1 { v26.d }[0], [x19], #0x8\n" "tbz x8, #0, 7f\n" - "ld1 { v19.s }[2], [x19]\n" + "ld1 { v26.s }[2], [x19]\n" "b 7f\n" "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset "tbz x8, #0, 7f\n" - "ld1 { v19.s }[0], [x19]\n" + "ld1 { v26.s }[0], [x19]\n" "b 7f\n" "5:" // Oddments: Load bias: Bit 2: Unset "tbz x8, #1, 6f\n" @@ -609,38 +593,38 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 7f\n" "ld1 { v13.s }[0], [x19]\n" "7:" // Oddments: Load bias: Bit 2: End - "mov v17.16b, v13.16b\n" - "ldr d0, [x16, #0x0]\n" - "mov v25.16b, v19.16b\n" - "ldr d1, [x16, #0x8]\n" - "mov v16.16b, v13.16b\n" - "ldr d2, [x16, #0x10]\n" - 
"mov v21.16b, v19.16b\n" - "ldr d3, [x16, #0x18]\n" - "mov v23.16b, v13.16b\n" - "ldr d4, [x16, #0x20]\n" - "ssubl v0.8h, v0.8b, v9.8b\n" - "mov v20.16b, v19.16b\n" - "ldr d5, [x16, #0x28]\n" - "ssubl v1.8h, v1.8b, v9.8b\n" - "ldr d6, [x16, #0x30]\n" - "ssubl v2.8h, v2.8b, v9.8b\n" - "ldr d7, [x16, #0x38]\n" - "ssubl v3.8h, v3.8b, v9.8b\n" - "ldr d8, [x16, #0x40]\n" - "ssubl v4.8h, v4.8b, v9.8b\n" - "ldp x23, x22, [x14, #0x0]\n" - "ssubl v5.8h, v5.8b, v9.8b\n" - "ldp x21, x20, [x14, #0x10]\n" - "ssubl v6.8h, v6.8b, v9.8b\n" - "ssubl v7.8h, v7.8b, v9.8b\n" - "ldr x19, [x14, #0x20]\n" - "ssubl v8.8h, v8.8b, v9.8b\n" - "add x23, x23, x17\n" - "add x22, x22, x17\n" - "add x21, x21, x17\n" - "add x20, x20, x17\n" - "add x19, x19, x17\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "mov v19.16b, v13.16b\n" + "mov v11.16b, v26.16b\n" + "ldr d2, [x17, #0x10]\n" + "ldr d3, [x17, #0x18]\n" + "mov v18.16b, v13.16b\n" + "mov v24.16b, v26.16b\n" + "ldr d4, [x17, #0x20]\n" + "ldr d5, [x17, #0x28]\n" + "mov v9.16b, v13.16b\n" + "mov v23.16b, v26.16b\n" + "ldr d6, [x17, #0x30]\n" + "ldr d7, [x17, #0x38]\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ldr d8, [x17, #0x40]\n" + "ldp x23, x22, [x12, #0x0]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ldp x21, x20, [x12, #0x10]\n" + "ldr x19, [x12, #0x20]\n" + "ssubl v4.8h, v4.8b, v12.8b\n" + "ssubl v5.8h, v5.8b, v12.8b\n" + "ssubl v6.8h, v6.8b, v12.8b\n" + "ssubl v7.8h, v7.8b, v12.8b\n" + "ssubl v8.8h, v8.8b, v12.8b\n" + "add x23, x23, x15\n" + "add x22, x22, x15\n" + "add x21, x21, x15\n" + "add x20, x20, x15\n" + "add x19, x19, x15\n" "tbz x8, #2, 9f\n" "ld1 { v31.s }[0], [x23], #0x4\n" "ld1 { v30.s }[0], [x22], #0x4\n" @@ -690,33 +674,33 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "ld1 { v28.b }[0], [x20]\n" "ld1 { v27.b }[0], [x19]\n" "11:" // Oddments: Initial loads: Bit 2: End - "ldr x21, [x14, #0x28]\n" - "ssubl v31.8h, v31.8b, v14.8b\n" + "ssubl v31.8h, v31.8b, v22.8b\n" "smlal v13.4s, v31.4h, v4.4h\n" - "ssubl v30.8h, v30.8b, v14.8b\n" - "smlal2 v19.4s, v31.8h, v4.8h\n" - "ssubl v29.8h, v29.8b, v14.8b\n" - "smlal v17.4s, v31.4h, v3.4h\n" - "ssubl v28.8h, v28.8b, v14.8b\n" - "smlal2 v25.4s, v31.8h, v3.8h\n" - "ssubl v27.8h, v27.8b, v14.8b\n" - "smlal v16.4s, v31.4h, v1.4h\n" - "add x21, x21, x17\n" - "smlal2 v21.4s, v31.8h, v1.8h\n" - "smlal v23.4s, v31.4h, v0.4h\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" + "smlal2 v26.4s, v31.8h, v4.8h\n" + "ldr x21, [x12, #0x28]\n" + "smlal v19.4s, v31.4h, v3.4h\n" + "smlal2 v11.4s, v31.8h, v3.8h\n" + "ssubl v30.8h, v30.8b, v22.8b\n" + "add x21, x21, x15\n" + "ssubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v24.4s, v31.8h, v1.8h\n" + "smlal v9.4s, v31.4h, v0.4h\n" + "smlal2 v23.4s, v31.8h, v0.8h\n" + "ssubl v28.8h, v28.8b, v22.8b\n" "smlal v13.4s, v30.4h, v0.4h\n" - "smlal2 v19.4s, v30.8h, v0.8h\n" - "smlal v17.4s, v29.4h, v2.4h\n" - "smlal2 v25.4s, v29.8h, v2.8h\n" + "smlal2 v26.4s, v30.8h, v0.8h\n" + "ssubl v27.8h, v27.8b, v22.8b\n" + "smlal v19.4s, v29.4h, v2.4h\n" + "smlal2 v11.4s, v29.8h, v2.8h\n" "smlal v13.4s, v28.4h, v5.4h\n" - "smlal2 v19.4s, v28.8h, v5.8h\n" - "smlal v17.4s, v28.4h, v4.4h\n" - "smlal2 v25.4s, v28.8h, v4.8h\n" - "smlal v16.4s, v28.4h, v2.4h\n" - "smlal2 v21.4s, v28.8h, v2.8h\n" - "smlal v23.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" + "smlal2 v26.4s, v28.8h, v5.8h\n" + "smlal v19.4s, v28.4h, v4.4h\n" + "smlal2 v11.4s, v28.8h, v4.8h\n" + "smlal v18.4s, v28.4h, v2.4h\n" + "smlal2 
v24.4s, v28.8h, v2.8h\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" "tbz x8, #2, 13f\n" "ld1 { v31.s }[0], [x21], #0x4\n" "tbz x8, #1, 12f\n" @@ -738,19 +722,19 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 15f\n" "ld1 { v31.b }[0], [x21]\n" "15:" // Oddments: Load (3, 0): Bit 2: End + "ssubl v31.8h, v31.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v6.4h\n" + "smlal2 v24.4s, v31.8h, v6.8h\n" + "ldr x20, [x12, #0x30]\n" "smlal v13.4s, v27.4h, v7.4h\n" - "ldr x20, [x14, #0x30]\n" - "ssubl v31.8h, v31.8b, v14.8b\n" - "smlal2 v19.4s, v27.8h, v7.8h\n" - "smlal v17.4s, v27.4h, v6.4h\n" - "add x20, x20, x17\n" - "smlal2 v25.4s, v27.8h, v6.8h\n" - "smlal v23.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" - "smlal v16.4s, v31.4h, v6.4h\n" - "smlal2 v21.4s, v31.8h, v6.8h\n" - "smlal v16.4s, v27.4h, v4.4h\n" - "smlal2 v21.4s, v27.8h, v4.8h\n" + "smlal2 v26.4s, v27.8h, v7.8h\n" + "add x20, x20, x15\n" + "smlal v19.4s, v27.4h, v6.4h\n" + "smlal2 v11.4s, v27.8h, v6.8h\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal2 v24.4s, v27.8h, v4.8h\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" "tbz x8, #2, 17f\n" "ld1 { v29.s }[0], [x20], #0x4\n" "tbz x8, #1, 16f\n" @@ -772,11 +756,11 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 19f\n" "ld1 { v29.b }[0], [x20]\n" "19:" // Oddments: Load (3, 3): Bit 2: End - "ldr x26, [x14, #0x38]\n" - "ssubl v29.8h, v29.8b, v14.8b\n" - "smlal v23.4s, v29.4h, v8.4h\n" - "smlal2 v20.4s, v29.8h, v8.8h\n" - "add x26, x26, x17\n" + "ssubl v29.8h, v29.8b, v22.8b\n" + "ldr x26, [x12, #0x38]\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "add x26, x26, x15\n" "tbz x8, #2, 21f\n" "ld1 { v28.s }[0], [x26], #0x4\n" "tbz x8, #1, 20f\n" @@ -798,13 +782,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 23f\n" "ld1 { v28.b }[0], [x26]\n" "23:" // Oddments: Load (0, 1): Bit 2: End - "ldr x25, [x14, #0x40]\n" - "ssubl v28.8h, v28.8b, v14.8b\n" + "ssubl v28.8h, v28.8b, v22.8b\n" + "ldr x25, [x12, #0x40]\n" "smlal v13.4s, v28.4h, v1.4h\n" - "smlal2 v19.4s, v28.8h, v1.8h\n" - "add x25, x25, x17\n" - "smlal v17.4s, v28.4h, v0.4h\n" - "smlal2 v25.4s, v28.8h, v0.8h\n" + "smlal2 v26.4s, v28.8h, v1.8h\n" + "smlal v19.4s, v28.4h, v0.4h\n" + "smlal2 v11.4s, v28.8h, v0.8h\n" + "add x25, x25, x15\n" "tbz x8, #2, 25f\n" "ld1 { v31.s }[0], [x25], #0x4\n" "tbz x8, #1, 24f\n" @@ -826,13 +810,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 27f\n" "ld1 { v31.b }[0], [x25]\n" "27:" // Oddments: Load (0, 2): Bit 2: End - "ldr x19, [x14, #0x48]\n" - "ssubl v31.8h, v31.8b, v14.8b\n" + "ssubl v31.8h, v31.8b, v22.8b\n" + "ldr x19, [x12, #0x48]\n" "smlal v13.4s, v31.4h, v2.4h\n" - "smlal2 v19.4s, v31.8h, v2.8h\n" - "add x19, x19, x17\n" - "smlal v17.4s, v31.4h, v1.4h\n" - "smlal2 v25.4s, v31.8h, v1.8h\n" + "smlal2 v26.4s, v31.8h, v2.8h\n" + "smlal v19.4s, v31.4h, v1.4h\n" + "smlal2 v11.4s, v31.8h, v1.8h\n" + "add x19, x19, x15\n" "tbz x8, #2, 29f\n" "ld1 { v30.s }[0], [x19], #0x4\n" "tbz x8, #1, 28f\n" @@ -854,17 +838,17 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 31f\n" "ld1 { v30.b }[0], [x19]\n" "31:" // Oddments: Load (2, 2): Bit 2: End - "ldr x24, [x14, #0x50]\n" - "ssubl v30.8h, v30.8b, v14.8b\n" + "ssubl v30.8h, v30.8b, v22.8b\n" + "ldr x24, [x12, #0x50]\n" "smlal v13.4s, v30.4h, v8.4h\n" - "smlal2 v19.4s, v30.8h, v8.8h\n" - "add x24, x24, x17\n" - "smlal v17.4s, v30.4h, v7.4h\n" - "smlal2 v25.4s, v30.8h, 
v7.8h\n" - "smlal v16.4s, v30.4h, v5.4h\n" - "smlal2 v21.4s, v30.8h, v5.8h\n" - "smlal v23.4s, v30.4h, v4.4h\n" - "smlal2 v20.4s, v30.8h, v4.8h\n" + "smlal2 v26.4s, v30.8h, v8.8h\n" + "smlal v19.4s, v30.4h, v7.4h\n" + "smlal2 v11.4s, v30.8h, v7.8h\n" + "add x24, x24, x15\n" + "smlal v18.4s, v30.4h, v5.4h\n" + "smlal2 v24.4s, v30.8h, v5.8h\n" + "smlal v9.4s, v30.4h, v4.4h\n" + "smlal2 v23.4s, v30.8h, v4.8h\n" "tbz x8, #2, 33f\n" "ld1 { v29.s }[0], [x24], #0x4\n" "tbz x8, #1, 32f\n" @@ -886,13 +870,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 35f\n" "ld1 { v29.b }[0], [x24]\n" "35:" // Oddments: Load (1, 0): Bit 2: End - "ldr x23, [x14, #0x58]\n" - "ssubl v29.8h, v29.8b, v14.8b\n" + "ssubl v29.8h, v29.8b, v22.8b\n" + "ldr x23, [x12, #0x58]\n" "smlal v13.4s, v29.4h, v3.4h\n" - "smlal2 v19.4s, v29.8h, v3.8h\n" - "add x23, x23, x17\n" - "smlal v16.4s, v29.4h, v0.4h\n" - "smlal2 v21.4s, v29.8h, v0.8h\n" + "smlal2 v26.4s, v29.8h, v3.8h\n" + "smlal v18.4s, v29.4h, v0.4h\n" + "smlal2 v24.4s, v29.8h, v0.8h\n" + "add x23, x23, x15\n" "tbz x8, #2, 37f\n" "ld1 { v28.s }[0], [x23], #0x4\n" "tbz x8, #1, 36f\n" @@ -914,13 +898,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 39f\n" "ld1 { v28.b }[0], [x23]\n" "39:" // Oddments: Load (1, 3): Bit 2: End - "ldr x22, [x14, #0x60]\n" - "ssubl v28.8h, v28.8b, v14.8b\n" - "smlal v17.4s, v28.4h, v5.4h\n" - "smlal2 v25.4s, v28.8h, v5.8h\n" - "add x22, x22, x17\n" - "smlal v23.4s, v28.4h, v2.4h\n" - "smlal2 v20.4s, v28.8h, v2.8h\n" + "ssubl v28.8h, v28.8b, v22.8b\n" + "ldr x22, [x12, #0x60]\n" + "smlal v19.4s, v28.4h, v5.4h\n" + "smlal2 v11.4s, v28.8h, v5.8h\n" + "smlal v9.4s, v28.4h, v2.4h\n" + "smlal2 v23.4s, v28.8h, v2.8h\n" + "add x22, x22, x15\n" "tbz x8, #2, 41f\n" "ld1 { v31.s }[0], [x22], #0x4\n" "tbz x8, #1, 40f\n" @@ -942,13 +926,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 43f\n" "ld1 { v31.b }[0], [x22]\n" "43:" // Oddments: Load (2, 0): Bit 2: End - "ldr x21, [x14, #0x68]\n" - "ssubl v31.8h, v31.8b, v14.8b\n" + "ssubl v31.8h, v31.8b, v22.8b\n" + "ldr x21, [x12, #0x68]\n" "smlal v13.4s, v31.4h, v6.4h\n" - "smlal2 v19.4s, v31.8h, v6.8h\n" - "add x21, x21, x17\n" - "smlal v16.4s, v31.4h, v3.4h\n" - "smlal2 v21.4s, v31.8h, v3.8h\n" + "smlal2 v26.4s, v31.8h, v6.8h\n" + "smlal v18.4s, v31.4h, v3.4h\n" + "smlal2 v24.4s, v31.8h, v3.8h\n" + "add x21, x21, x15\n" "tbz x8, #2, 45f\n" "ld1 { v30.s }[0], [x21], #0x4\n" "tbz x8, #1, 44f\n" @@ -970,13 +954,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 47f\n" "ld1 { v30.b }[0], [x21]\n" "47:" // Oddments: Load (2, 3): Bit 2: End - "ldr x20, [x14, #0x70]\n" - "ssubl v30.8h, v30.8b, v14.8b\n" - "smlal v17.4s, v30.4h, v8.4h\n" - "smlal2 v25.4s, v30.8h, v8.8h\n" - "add x20, x20, x17\n" - "smlal v23.4s, v30.4h, v5.4h\n" - "smlal2 v20.4s, v30.8h, v5.8h\n" + "ssubl v30.8h, v30.8b, v22.8b\n" + "ldr x20, [x12, #0x70]\n" + "smlal v19.4s, v30.4h, v8.4h\n" + "smlal2 v11.4s, v30.8h, v8.8h\n" + "smlal v9.4s, v30.4h, v5.4h\n" + "smlal2 v23.4s, v30.8h, v5.8h\n" + "add x20, x20, x15\n" "tbz x8, #2, 49f\n" "ld1 { v29.s }[0], [x20], #0x4\n" "tbz x8, #1, 48f\n" @@ -998,13 +982,13 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 51f\n" "ld1 { v29.b }[0], [x20]\n" "51:" // Oddments: Load (3, 1): Bit 2: End - "ldr x19, [x14, #0x78]\n" - "ssubl v29.8h, v29.8b, v14.8b\n" - "smlal v16.4s, v29.4h, v7.4h\n" - "smlal2 v21.4s, v29.8h, v7.8h\n" - "add x19, x19, x17\n" - "smlal v23.4s, v29.4h, v6.4h\n" - "smlal2 
v20.4s, v29.8h, v6.8h\n" + "ssubl v29.8h, v29.8b, v22.8b\n" + "ldr x19, [x12, #0x78]\n" + "smlal v18.4s, v29.4h, v7.4h\n" + "smlal2 v24.4s, v29.8h, v7.8h\n" + "smlal v9.4s, v29.4h, v6.4h\n" + "smlal2 v23.4s, v29.8h, v6.8h\n" + "add x19, x19, x15\n" "tbz x8, #2, 53f\n" "ld1 { v28.s }[0], [x19], #0x4\n" "tbz x8, #1, 52f\n" @@ -1026,160 +1010,150 @@ void a64_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 55f\n" "ld1 { v28.b }[0], [x19]\n" "55:" // Oddments: Load (3, 2): Bit 2: End - "ssubl v28.8h, v28.8b, v14.8b\n" - "smlal v16.4s, v28.4h, v8.4h\n" - "smlal2 v21.4s, v28.8h, v8.8h\n" - "smlal v23.4s, v28.4h, v7.4h\n" - "smlal2 v20.4s, v28.8h, v7.8h\n" + "ssubl v28.8h, v28.8b, v22.8b\n" + "smlal v18.4s, v28.4h, v8.4h\n" + "smlal2 v24.4s, v28.8h, v8.8h\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" "tbz x8, #2, 57f\n" - "ld1 { v26.4s }, [x13], #0x10\n" - "ld1 { v10.4s }, [x11], #0x10\n" + "ld1 { v21.4s }, [x13], #0x10\n" + "ld1 { v25.4s }, [x11], #0x10\n" "tbz x8, #1, 56f\n" - "ld1 { v11.d }[0], [x13], #0x8\n" - "ld1 { v18.d }[0], [x11], #0x8\n" + "ld1 { v10.d }[0], [x13], #0x8\n" + "ld1 { v16.d }[0], [x11], #0x8\n" "tbz x8, #0, 59f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v18.s }[2], [x11]\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v16.s }[2], [x11]\n" "b 59f\n" "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset "tbz x8, #0, 59f\n" - "ld1 { v11.s }[0], [x13]\n" - "ld1 { v18.s }[0], [x11]\n" + "ld1 { v10.s }[0], [x13]\n" + "ld1 { v16.s }[0], [x11]\n" "b 59f\n" "57:" // Oddments: Load requant params: Bit 2: Unset "tbz x8, #1, 58f\n" - "ld1 { v26.d }[0], [x13], #0x8\n" - "ld1 { v10.d }[0], [x11], #0x8\n" + "ld1 { v21.d }[0], [x13], #0x8\n" + "ld1 { v25.d }[0], [x11], #0x8\n" "tbz x8, #0, 59f\n" - "ld1 { v26.s }[2], [x13]\n" - "ld1 { v10.s }[2], [x11]\n" + "ld1 { v21.s }[2], [x13]\n" + "ld1 { v25.s }[2], [x11]\n" "b 59f\n" "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset "tbz x8, #0, 59f\n" - "ld1 { v26.s }[0], [x13]\n" - "ld1 { v10.s }[0], [x11]\n" + "ld1 { v21.s }[0], [x13]\n" + "ld1 { v25.s }[0], [x11]\n" "59:" // Oddments: Load requant params: Bit 2: End - "sqrdmulh v13.4s, v13.4s, v26.4s\n" - "add x10, x10, x15\n" - "sqrdmulh v19.4s, v19.4s, v11.4s\n" - "add x9, x9, x15\n" - "sqrdmulh v17.4s, v17.4s, v26.4s\n" - "add x28, x28, x15\n" - "sqrdmulh v25.4s, v25.4s, v11.4s\n" - "add x27, x27, x15\n" - "sqrdmulh v16.4s, v16.4s, v26.4s\n" - "and v22.16b, v13.16b, v10.16b\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "and v28.16b, v19.16b, v18.16b\n" - "and v3.16b, v17.16b, v10.16b\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "and v6.16b, v25.16b, v18.16b\n" - "and v0.16b, v16.16b, v10.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqrdmulh v21.4s, v21.4s, v11.4s\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sqadd v13.4s, v13.4s, v22.4s\n" - "sqrdmulh v23.4s, v23.4s, v26.4s\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "sqrdmulh v20.4s, v20.4s, v11.4s\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "sqadd v17.4s, v17.4s, v3.4s\n" - "srshl v13.4s, v13.4s, v10.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "srshl v19.4s, v19.4s, v18.4s\n" - "srshl v17.4s, v17.4s, v10.4s\n" - "add v13.4s, v13.4s, v15.4s\n" - "srshl v25.4s, v25.4s, v18.4s\n" - "add v19.4s, v19.4s, v15.4s\n" - "smin v13.4s, v13.4s, v12.4s\n" - "add v17.4s, v17.4s, v15.4s\n" - "smin v19.4s, v19.4s, v12.4s\n" - "smax v13.4s, v13.4s, v24.4s\n" - "smin v17.4s, v17.4s, v12.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" - "add v25.4s, v25.4s, v15.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "uzp1 v13.16b, v13.16b, v19.16b\n" - "smin v25.4s, 
v25.4s, v12.4s\n" - "uzp1 v13.16b, v13.16b, v13.16b\n" - "sqadd v16.4s, v16.4s, v0.4s\n" - "smax v25.4s, v25.4s, v24.4s\n" - "and v29.16b, v21.16b, v18.16b\n" + "sqrdmulh v13.4s, v13.4s, v21.4s\n" + "sqrdmulh v19.4s, v19.4s, v21.4s\n" + "add x10, x10, x14\n" + "add x9, x9, x14\n" + "sqrdmulh v18.4s, v18.4s, v21.4s\n" + "sqrdmulh v9.4s, v9.4s, v21.4s\n" + "add x28, x28, x14\n" + "add x27, x27, x14\n" + "and v7.16b, v13.16b, v25.16b\n" + "sqrdmulh v26.4s, v26.4s, v10.4s\n" + "and v4.16b, v19.16b, v25.16b\n" + "sqrdmulh v11.4s, v11.4s, v10.4s\n" + "and v21.16b, v18.16b, v25.16b\n" + "sqrdmulh v24.4s, v24.4s, v10.4s\n" + "and v20.16b, v9.16b, v25.16b\n" + "sqrdmulh v23.4s, v23.4s, v10.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v29.16b, v26.16b, v16.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v10.16b, v11.16b, v16.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v31.16b, v24.16b, v16.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v30.16b, v23.16b, v16.16b\n" + "sqadd v13.4s, v13.4s, v7.4s\n" "sshr v29.4s, v29.4s, #0x1f\n" - "uzp1 v17.16b, v17.16b, v25.16b\n" - "srshl v16.4s, v16.4s, v10.4s\n" - "and v3.16b, v23.16b, v10.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "uzp1 v17.16b, v17.16b, v17.16b\n" - "add v16.4s, v16.4s, v15.4s\n" - "sqadd v21.4s, v21.4s, v29.4s\n" - "and v25.16b, v20.16b, v18.16b\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "smin v16.4s, v16.4s, v12.4s\n" - "srshl v21.4s, v21.4s, v18.4s\n" - "sqadd v23.4s, v23.4s, v3.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "add v21.4s, v21.4s, v15.4s\n" - "srshl v23.4s, v23.4s, v10.4s\n" - "sqadd v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v12.4s\n" - "add v23.4s, v23.4s, v15.4s\n" - "srshl v20.4s, v20.4s, v18.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smin v23.4s, v23.4s, v12.4s\n" - "uzp1 v16.16b, v16.16b, v21.16b\n" - "add v20.4s, v20.4s, v15.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "smax v23.4s, v23.4s, v24.4s\n" - "smin v20.4s, v20.4s, v12.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "uzp1 v23.16b, v23.16b, v20.16b\n" - "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v20.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v25.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "srshl v19.4s, v19.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v10.4s\n" + "srshl v18.4s, v18.4s, v25.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "srshl v9.4s, v9.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "srshl v26.4s, v26.4s, v16.4s\n" + "sqxtn v13.4h, v13.4s\n" + "srshl v11.4s, v11.4s, v16.4s\n" + "sqxtn v19.4h, v19.4s\n" + "srshl v24.4s, v24.4s, v16.4s\n" + "sqxtn v18.4h, v18.4s\n" + "srshl v23.4s, v23.4s, v16.4s\n" + "sqxtn v9.4h, v9.4s\n" + "sqxtn2 v13.8h, v26.4s\n" + "sqxtn2 v19.8h, v11.4s\n" + "sqxtn2 v18.8h, v24.4s\n" + "sqxtn2 v9.8h, v23.4s\n" + "sqadd v13.8h, v13.8h, v14.8h\n" + "sqadd v19.8h, v19.8h, v14.8h\n" + "sqadd v18.8h, v18.8h, v14.8h\n" + "sqadd v9.8h, v9.8h, v14.8h\n" + "smax v13.8h, v13.8h, v17.8h\n" + "smax v19.8h, v19.8h, v17.8h\n" + "smax v18.8h, v18.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smin v13.8h, v13.8h, v15.8h\n" + "smin v19.8h, v19.8h, v15.8h\n" + "smin v18.8h, v18.8h, v15.8h\n" + "smin v9.8h, v9.8h, v15.8h\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" "tbz x8, #2, 61f\n" "st1 { v13.s }[0], [x10], #0x4\n" - "st1 { v17.s }[0], [x9], #0x4\n" - "st1 { v16.s }[0], [x28], #0x4\n" - "st1 { 
v23.s }[0], [x27], #0x4\n" + "st1 { v19.s }[0], [x9], #0x4\n" + "st1 { v18.s }[0], [x28], #0x4\n" + "st1 { v9.s }[0], [x27], #0x4\n" "tbz x8, #1, 60f\n" "st1 { v13.h }[2], [x10], #0x2\n" - "st1 { v17.h }[2], [x9], #0x2\n" - "st1 { v16.h }[2], [x28], #0x2\n" - "st1 { v23.h }[2], [x27], #0x2\n" + "st1 { v19.h }[2], [x9], #0x2\n" + "st1 { v18.h }[2], [x28], #0x2\n" + "st1 { v9.h }[2], [x27], #0x2\n" "tbz x8, #0, 63f\n" "st1 { v13.b }[6], [x10], #0x1\n" - "st1 { v17.b }[6], [x9], #0x1\n" - "st1 { v16.b }[6], [x28], #0x1\n" - "st1 { v23.b }[6], [x27], #0x1\n" + "st1 { v19.b }[6], [x9], #0x1\n" + "st1 { v18.b }[6], [x28], #0x1\n" + "st1 { v9.b }[6], [x27], #0x1\n" "b 63f\n" "60:" // Oddments: Bit 2: Bit 1: Unset "tbz x8, #0, 63f\n" "st1 { v13.b }[4], [x10], #0x1\n" - "st1 { v17.b }[4], [x9], #0x1\n" - "st1 { v16.b }[4], [x28], #0x1\n" - "st1 { v23.b }[4], [x27], #0x1\n" + "st1 { v19.b }[4], [x9], #0x1\n" + "st1 { v18.b }[4], [x28], #0x1\n" + "st1 { v9.b }[4], [x27], #0x1\n" "b 63f\n" "61:" // Oddments: Bit 2: Unset "tbz x8, #1, 62f\n" "st1 { v13.h }[0], [x10], #0x2\n" - "st1 { v17.h }[0], [x9], #0x2\n" - "st1 { v16.h }[0], [x28], #0x2\n" - "st1 { v23.h }[0], [x27], #0x2\n" + "st1 { v19.h }[0], [x9], #0x2\n" + "st1 { v18.h }[0], [x28], #0x2\n" + "st1 { v9.h }[0], [x27], #0x2\n" "tbz x8, #0, 63f\n" "st1 { v13.b }[2], [x10], #0x1\n" - "st1 { v17.b }[2], [x9], #0x1\n" - "st1 { v16.b }[2], [x28], #0x1\n" - "st1 { v23.b }[2], [x27], #0x1\n" + "st1 { v19.b }[2], [x9], #0x1\n" + "st1 { v18.b }[2], [x28], #0x1\n" + "st1 { v9.b }[2], [x27], #0x1\n" "b 63f\n" "62:" // Oddments: Bit 2: Unset: Bit 1: Unset "tbz x8, #0, 63f\n" "st1 { v13.b }[0], [x10], #0x1\n" - "st1 { v17.b }[0], [x9], #0x1\n" - "st1 { v16.b }[0], [x28], #0x1\n" - "st1 { v23.b }[0], [x27], #0x1\n" + "st1 { v19.b }[0], [x9], #0x1\n" + "st1 { v18.b }[0], [x28], #0x1\n" + "st1 { v9.b }[0], [x27], #0x1\n" "63:" // Oddments: Bit 2: End - "64:" // End - : : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp index b20759eec4..6032f8f2fb 100644 --- 
a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,37 +36,24 @@ namespace depthwise { void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); -struct a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst +class a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 2; constexpr static unsigned int stride_cols = 2; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 5; - - constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size; + a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {} - kern_type kernel = a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } - a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp index 3b3d9c8946..5499392d6d 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -46,7 +46,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( struct Params { long unsigned int n_channels; - const int8_t *weights; + const void *weights; const int32_t *bias; const arm_gemm::Requantize32 *requant; const int32_t *const requant_muls; @@ -57,7 +57,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( Params( long unsigned int n_channels, const int8_t *const *inptrs_raw, - const int8_t *const weights, + const void *const weights, const int32_t *const bias, const arm_gemm::Requantize32 &qp, const int32_t *const requant_muls, @@ -100,75 +100,75 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( requant_muls, requant_shifts, outptrs); __asm__ __volatile__( - "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n" - "mov x5, #0x0\n" - "ldr x6, [%x[params], %[offsetof_Params_weights]]\n" - "mov x7, #0x0\n" - "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" - "add x8, %x[params], %[offsetof_Params_inptrs]\n" - "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n" - "lsr x16, x4, #0x3\n" - "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n" - "add x19, x22, %[offsetof_Requantize32_a_offset]\n" - "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" - "add x20, x22, %[offsetof_Requantize32_b_offset]\n" - "ld1r { v12.16b }, [x19]\n" - "add x19, x22, %[offsetof_Requantize32_c_offset]\n" - "ld1r { v13.16b }, [x20]\n" - "add x20, x22, %[offsetof_Requantize32_minval]\n" - "ld1r { v11.4s }, [x19]\n" - "add x19, x22, %[offsetof_Requantize32_maxval]\n" - "ld1r { v19.4s }, [x20]\n" - "ld1r { v14.4s }, [x19]\n" - "ldp x14, x13, [x21, #0x0]\n" - "ldp x12, x11, [x21, #0x10]\n" + "ldr x19, [%x[params], %[offsetof_Params_requant]]\n" + "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n" + "add x24, x19, %[offsetof_Requantize32_a_offset]\n" + "add x23, x19, %[offsetof_Requantize32_b_offset]\n" + "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x21, x19, %[offsetof_Requantize32_c_offset]\n" + "add x20, x19, %[offsetof_Requantize32_minval]\n" + "ldr x17, [%x[params], %[offsetof_Params_weights]]\n" + "add x19, x19, %[offsetof_Requantize32_maxval]\n" + "ld1r { v12.16b }, [x24]\n" + "ld1r { v13.16b }, [x23]\n" + "lsr x16, x8, #0x3\n" + "ld1r { v11.8h }, [x21]\n" + "ld1r { v17.8h }, [x20]\n" + "mov x15, #0x0\n" + "mov x14, #0x0\n" + "ld1r { v14.8h }, [x19]\n" + "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n" + "add x12, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "ldp x10, x9, [x22, #0x0]\n" + "ldp x28, x27, [x22, #0x10]\n" "cbz x16, 3f\n" - "subs x16, x16, #0x1\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "ldr q15, [x19, #0x0]\n" - "mov v20.16b, v15.16b\n" + "subs x16, x16, #0x1\n" + "mov v9.16b, v15.16b\n" "ldr q10, [x19, #0x10]\n" "add x19, x19, #0x20\n" - "mov v16.16b, v15.16b\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" - "mov v17.16b, v15.16b\n" - "ldr d0, [x6, #0x0]\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v16.16b, v10.16b\n" + "mov v22.16b, v15.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v21.16b, v10.16b\n" + "mov v23.16b, v15.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v18.16b, v10.16b\n" "ssubl v0.8h, v0.8b, v13.8b\n" - "mov v23.16b, v10.16b\n" - "ldr d1, [x6, #0x8]\n" - "mov v22.16b, v10.16b\n" - "ldr d2, [x6, #0x10]\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" "ssubl v1.8h, v1.8b, v13.8b\n" - "mov 
v18.16b, v10.16b\n" - "ldr d3, [x6, #0x18]\n" - "ldr d4, [x6, #0x20]\n" "ssubl v2.8h, v2.8b, v13.8b\n" - "ldr d5, [x6, #0x28]\n" + "ldp x26, x25, [x12, #0x0]\n" + "ldp x24, x23, [x12, #0x10]\n" "ssubl v3.8h, v3.8b, v13.8b\n" - "ldr d6, [x6, #0x30]\n" - "ldr d7, [x6, #0x38]\n" "ssubl v4.8h, v4.8b, v13.8b\n" - "ldr d8, [x6, #0x40]\n" + "ldp x22, x21, [x12, #0x20]\n" + "ldp x20, x19, [x12, #0x30]\n" "ssubl v5.8h, v5.8b, v13.8b\n" - "ldp x26, x25, [x8, #0x0]\n" "ssubl v6.8h, v6.8b, v13.8b\n" - "ldp x24, x23, [x8, #0x10]\n" + "ldr d31, [x26, x15]\n" + "ldr d30, [x25, x15]\n" "ssubl v7.8h, v7.8b, v13.8b\n" "ssubl v8.8h, v8.8b, v13.8b\n" - "ldp x22, x21, [x8, #0x20]\n" - "ldp x20, x19, [x8, #0x30]\n" - "ldr d31, [x26, x5]\n" + "ldr d29, [x24, x15]\n" + "ldr d28, [x23, x15]\n" "ssubl v31.8h, v31.8b, v12.8b\n" - "ldr d30, [x25, x5]\n" - "ldr d29, [x24, x5]\n" "ssubl v30.8h, v30.8b, v12.8b\n" - "ldr d28, [x23, x5]\n" - "ldr d27, [x22, x5]\n" + "ldr d27, [x22, x15]\n" + "ldr d26, [x21, x15]\n" "ssubl v29.8h, v29.8b, v12.8b\n" - "ldr d26, [x21, x5]\n" "ssubl v28.8h, v28.8b, v12.8b\n" - "ldr d25, [x20, x5]\n" - "ldr d24, [x19, x5]\n" + "ldr d25, [x20, x15]\n" + "ldr d24, [x19, x15]\n" "ssubl v27.8h, v27.8b, v12.8b\n" "ssubl v26.8h, v26.8b, v12.8b\n" "ssubl v25.8h, v25.8b, v12.8b\n" @@ -176,259 +176,251 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "beq 2f\n" "1:" // Loop "smlal v15.4s, v31.4h, v8.4h\n" - "ldr x23, [x8, #0x40]\n" - "add x6, x6, #0x48\n" "smlal2 v10.4s, v31.8h, v8.8h\n" - "ldr x22, [x8, #0x48]\n" - "subs x16, x16, #0x1\n" - "smlal v20.4s, v31.4h, v6.4h\n" - "ldr x21, [x8, #0x50]\n" - "smlal2 v23.4s, v31.8h, v6.8h\n" - "ldr x20, [x8, #0x58]\n" - "smlal v16.4s, v31.4h, v2.4h\n" - "ldr x19, [x8, #0x60]\n" - "smlal2 v22.4s, v31.8h, v2.8h\n" - "ldr x10, [x8, #0x68]\n" - "smlal v17.4s, v31.4h, v0.4h\n" - "ldr x9, [x8, #0x70]\n" - "smlal2 v18.4s, v31.8h, v0.8h\n" - "ldr x28, [x8, #0x78]\n" + "ldr x24, [x12, #0x40]\n" + "ldr x23, [x12, #0x48]\n" + "smlal v9.4s, v31.4h, v6.4h\n" + "smlal2 v16.4s, v31.8h, v6.8h\n" + "ldr x21, [x12, #0x50]\n" + "ldr x19, [x12, #0x58]\n" "smlal v15.4s, v30.4h, v0.4h\n" - "ldr x27, [x8, #0x80]\n" "smlal2 v10.4s, v30.8h, v0.8h\n" - "ldr x26, [x8, #0x88]\n" - "smlal v20.4s, v28.4h, v1.4h\n" - "ldr x25, [x8, #0x90]\n" - "smlal2 v23.4s, v28.8h, v1.8h\n" - "ldr d28, [x22, x5]\n" + "ldr x22, [x12, #0x78]\n" + "ldr x20, [x12, #0x60]\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "ldr d28, [x23, x15]\n" "ssubl v28.8h, v28.8b, v12.8b\n" "smlal v15.4s, v29.4h, v1.4h\n" - "ldr x24, [x8, #0x98]\n" "smlal2 v10.4s, v29.8h, v1.8h\n" - "ldr d29, [x23, x5]\n" + "ldr d29, [x24, x15]\n" "ssubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v27.4h, v2.4h\n" - "ldr x23, [x8, #0xa0]\n" - "smlal2 v23.4s, v27.8h, v2.8h\n" - "ldr d27, [x21, x5]\n" + "smlal v9.4s, v27.4h, v2.4h\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x15]\n" "ssubl v27.8h, v27.8b, v12.8b\n" "smlal v15.4s, v26.4h, v3.4h\n" - "ldr x22, [x8, #0xa8]\n" "smlal2 v10.4s, v26.8h, v3.8h\n" - "ldr d26, [x20, x5]\n" + "ldr d26, [x19, x15]\n" "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v9.4s, v24.4h, v0.4h\n" + "smlal2 v16.4s, v24.8h, v0.8h\n" + "ldr x21, [x12, #0x80]\n" + "ldr x19, [x12, #0x68]\n" "smlal v15.4s, v25.4h, v4.4h\n" - "ldr x21, [x8, #0xb0]\n" "smlal2 v10.4s, v25.8h, v4.8h\n" - "ldr d25, [x19, x5]\n" + "ldr d25, [x20, x15]\n" "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v9.4s, v29.4h, v4.4h\n" + "smlal2 v16.4s, v29.8h, v4.8h\n" + "ldr x20, [x12, #0x88]\n" + "ldr d29, [x19, 
x15]\n" "smlal v15.4s, v24.4h, v2.4h\n" - "ldr x20, [x8, #0xb8]\n" "smlal2 v10.4s, v24.8h, v2.8h\n" - "ldr x19, [x8, #0xc0]\n" - "smlal v20.4s, v24.4h, v0.4h\n" - "ldr q21, [x17, #0x0]\n" - "smlal2 v23.4s, v24.8h, v0.8h\n" - "ldr d24, [x9, x5]\n" - "ssubl v24.8h, v24.8b, v12.8b\n" - "smlal v20.4s, v29.4h, v4.4h\n" - "ldr q30, [x15, #0x0]\n" - "smlal2 v23.4s, v29.8h, v4.8h\n" - "ldr d29, [x10, x5]\n" + "ldr x19, [x12, #0x70]\n" "ssubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v28.4h, v5.4h\n" - "ldr q31, [x17, #0x10]\n" - "smlal2 v23.4s, v28.8h, v5.8h\n" - "ldr d28, [x27, x5]\n" - "add x17, x17, #0x20\n" + "smlal v9.4s, v28.4h, v5.4h\n" + "smlal2 v16.4s, v28.8h, v5.8h\n" + "ldr d28, [x21, x15]\n" + "ssubl v28.8h, v28.8b, v12.8b\n" + "smlal v22.4s, v31.4h, v2.4h\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "ldr x24, [x12, #0x98]\n" + "ldr d24, [x19, x15]\n" "smlal v15.4s, v27.4h, v5.4h\n" - "ldr q9, [x15, #0x10]\n" - "add x15, x15, #0x20\n" "smlal2 v10.4s, v27.8h, v5.8h\n" - "ssubl v28.8h, v28.8b, v12.8b\n" - "smlal v20.4s, v27.4h, v3.4h\n" - "smlal2 v23.4s, v27.8h, v3.8h\n" - "ldr d27, [x28, x5]\n" + "ssubl v24.8h, v24.8b, v12.8b\n" + "ldr x23, [x12, #0x90]\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "ldr d27, [x22, x15]\n" "ssubl v27.8h, v27.8b, v12.8b\n" - "smlal v16.4s, v26.4h, v3.4h\n" - "smlal2 v22.4s, v26.8h, v3.8h\n" - "ldr d26, [x26, x5]\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal v22.4s, v26.4h, v3.4h\n" + "ldr x22, [x12, #0xa8]\n" + "ldr x19, [x12, #0xa0]\n" + "smlal2 v21.4s, v26.8h, v3.8h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr d26, [x20, x15]\n" "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v23.4s, v27.4h, v4.4h\n" + "smlal v22.4s, v25.4h, v0.4h\n" + "ldr x21, [x12, #0xb0]\n" + "ldr x20, [x12, #0xb8]\n" + "smlal2 v21.4s, v25.8h, v0.8h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "ldr d27, [x19, x15]\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "smlal v23.4s, v28.4h, v1.4h\n" "smlal v15.4s, v25.4h, v6.4h\n" + "ldr x19, [x12, #0xc0]\n" + "ldr q19, [x13, #0x0]\n" "smlal2 v10.4s, v25.8h, v6.8h\n" - "smlal v16.4s, v25.4h, v0.4h\n" - "smlal2 v22.4s, v25.8h, v0.8h\n" - "ldr d25, [x25, x5]\n" + "smlal v22.4s, v29.4h, v4.4h\n" + "ldr d25, [x23, x15]\n" "ssubl v25.8h, v25.8b, v12.8b\n" - "smlal v16.4s, v29.4h, v4.4h\n" - "smlal2 v22.4s, v29.8h, v4.8h\n" - "ldr d29, [x24, x5]\n" + "smlal2 v21.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x15]\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v23.4s, v26.4h, v5.4h\n" "smlal v15.4s, v24.4h, v7.4h\n" + "ldr q0, [x11, #0x0]\n" + "ldr q4, [x13, #0x10]\n" "smlal2 v10.4s, v24.8h, v7.8h\n" - "smlal v16.4s, v24.4h, v1.4h\n" - "smlal2 v22.4s, v24.8h, v1.8h\n" - "ldr d24, [x22, x5]\n" - "ssubl v24.8h, v24.8b, v12.8b\n" - "smlal v17.4s, v27.4h, v4.4h\n" - "smlal2 v18.4s, v27.8h, v4.8h\n" - "ldr d27, [x23, x5]\n" - "ssubl v27.8h, v27.8b, v12.8b\n" - "smlal v20.4s, v28.4h, v7.4h\n" - "smlal2 v23.4s, v28.8h, v7.8h\n" - "smlal v17.4s, v28.4h, v1.4h\n" - "smlal2 v18.4s, v28.8h, v1.8h\n" - "smlal v16.4s, v25.4h, v6.4h\n" - "smlal2 v22.4s, v25.8h, v6.8h\n" - "ldr d25, [x20, x5]\n" - "ssubl v25.8h, v25.8b, v12.8b\n" - "smlal v17.4s, v26.4h, v5.4h\n" + "smlal v22.4s, v24.4h, v1.4h\n" + "sqrdmulh v15.4s, v15.4s, v19.4s\n" + "ldr q31, [x11, #0x10]\n" + "smlal2 v21.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x15]\n" "smlal2 v18.4s, v26.8h, v5.8h\n" - "ldr d26, [x21, x5]\n" - "ssubl v26.8h, v26.8b, v12.8b\n" - "smlal v20.4s, v29.4h, v8.4h\n" - "smlal2 v23.4s, v29.8h, v8.8h\n" - "smlal v17.4s, v29.4h, v2.4h\n" + "ssubl v24.8h, 
v24.8b, v12.8b\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "ldr d26, [x21, x15]\n" "smlal2 v18.4s, v29.8h, v2.8h\n" - "ldr d29, [x19, x5]\n" - "add x5, x5, #0x8\n" - "smlal v16.4s, v27.4h, v7.4h\n" - "ssubl v29.8h, v29.8b, v12.8b\n" - "smlal2 v22.4s, v27.8h, v7.8h\n" - "smlal v17.4s, v24.4h, v3.4h\n" - "smlal v16.4s, v24.4h, v5.4h\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v22.4s, v25.4h, v6.4h\n" + "smlal v23.4s, v24.4h, v3.4h\n" + "and v30.16b, v15.16b, v0.16b\n" + "add x17, x17, #0x48\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "smlal2 v16.4s, v28.8h, v7.8h\n" + "sqrdmulh v10.4s, v10.4s, v4.4s\n" + "subs x16, x16, #0x1\n" + "smlal2 v21.4s, v25.8h, v6.8h\n" + "ldr d25, [x20, x15]\n" "smlal2 v18.4s, v24.8h, v3.8h\n" - "sqrdmulh v15.4s, v15.4s, v21.4s\n" - "smlal2 v22.4s, v24.8h, v5.8h\n" - "smlal v17.4s, v26.4h, v7.4h\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v22.4s, v27.4h, v7.4h\n" + "smlal v23.4s, v26.4h, v7.4h\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "add x13, x13, #0x20\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v16.4s, v29.8h, v8.8h\n" + "ldr d29, [x19, x15]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal2 v21.4s, v27.8h, v7.8h\n" "smlal2 v18.4s, v26.8h, v7.8h\n" - "smlal v16.4s, v25.4h, v8.4h\n" - "smlal2 v22.4s, v25.8h, v8.8h\n" - "smlal v17.4s, v25.4h, v6.4h\n" + "sqrdmulh v9.4s, v9.4s, v19.4s\n" + "add x15, x15, #0x8\n" + "smlal v22.4s, v24.4h, v5.4h\n" + "smlal v23.4s, v25.4h, v6.4h\n" + "and v28.16b, v9.16b, v0.16b\n" + "add x11, x11, #0x20\n" + "smlal2 v21.4s, v24.8h, v5.8h\n" "smlal2 v18.4s, v25.8h, v6.8h\n" - "and v26.16b, v15.16b, v30.16b\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "smlal v17.4s, v29.4h, v8.4h\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "smlal v22.4s, v25.4h, v8.4h\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "sqrdmulh v22.4s, v22.4s, v19.4s\n" + "smlal2 v21.4s, v25.8h, v8.8h\n" "smlal2 v18.4s, v29.8h, v8.8h\n" - "sqrdmulh v10.4s, v10.4s, v31.4s\n" - "sqrdmulh v20.4s, v20.4s, v21.4s\n" - "sqrdmulh v23.4s, v23.4s, v31.4s\n" - "sqrdmulh v16.4s, v16.4s, v21.4s\n" - "sqadd v15.4s, v15.4s, v26.4s\n" - "and v8.16b, v10.16b, v9.16b\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "srshl v15.4s, v15.4s, v30.4s\n" - "and v4.16b, v20.16b, v30.16b\n" + "sqrdmulh v23.4s, v23.4s, v19.4s\n" + "and v29.16b, v22.16b, v0.16b\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "and v20.16b, v23.16b, v0.16b\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "and v19.16b, v10.16b, v31.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "and v4.16b, v16.16b, v31.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v5.16b, v21.16b, v31.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v26.16b, v18.16b, v31.16b\n" + "sqadd v15.4s, v15.4s, v30.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v28.4s\n" "sshr v4.4s, v4.4s, #0x1f\n" - "and v2.16b, v23.16b, v9.16b\n" - "and v1.16b, v16.16b, v30.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "add v15.4s, v15.4s, v11.4s\n" - "sqadd v10.4s, v10.4s, v8.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqrdmulh v22.4s, v22.4s, v31.4s\n" - "sqadd v20.4s, v20.4s, v4.4s\n" - "smin v15.4s, v15.4s, v14.4s\n" - "srshl v10.4s, v10.4s, v9.4s\n" - "sqadd v23.4s, v23.4s, v2.4s\n" - "smax v15.4s, v15.4s, v19.4s\n" - "srshl v20.4s, v20.4s, v30.4s\n" - "add v10.4s, v10.4s, v11.4s\n" - "srshl v23.4s, v23.4s, v9.4s\n" - "sqadd v16.4s, v16.4s, v1.4s\n" - "smin v10.4s, v10.4s, v14.4s\n" - "add v20.4s, v20.4s, v11.4s\n" - "add v23.4s, v23.4s, v11.4s\n" - "smax v10.4s, v10.4s, v19.4s\n" - "smin v20.4s, v20.4s, v14.4s\n" - "smin v23.4s, v23.4s, v14.4s\n" - "uzp1 v15.16b, v15.16b, v10.16b\n" - "smax v20.4s, v20.4s, v19.4s\n" + "sqadd 
v22.4s, v22.4s, v29.4s\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v20.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v19.4s\n" + "srshl v9.4s, v9.4s, v0.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "sqadd v18.4s, v18.4s, v26.4s\n" + "srshl v10.4s, v10.4s, v31.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v16.4s, v16.4s, v31.4s\n" + "sqxtn v9.4h, v9.4s\n" + "srshl v21.4s, v21.4s, v31.4s\n" + "sqxtn v22.4h, v22.4s\n" + "srshl v18.4s, v18.4s, v31.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v15.8h, v10.4s\n" + "sqxtn2 v9.8h, v16.4s\n" + "sqxtn2 v22.8h, v21.4s\n" + "sqxtn2 v23.8h, v18.4s\n" + "sqadd v15.8h, v15.8h, v11.8h\n" + "sqadd v9.8h, v9.8h, v11.8h\n" + "sqadd v22.8h, v22.8h, v11.8h\n" + "sqadd v23.8h, v23.8h, v11.8h\n" + "smax v15.8h, v15.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smax v22.8h, v22.8h, v17.8h\n" + "smax v23.8h, v23.8h, v17.8h\n" + "smin v15.8h, v15.8h, v14.8h\n" + "smin v9.8h, v9.8h, v14.8h\n" + "smin v22.8h, v22.8h, v14.8h\n" + "smin v23.8h, v23.8h, v14.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" - "str d15, [x14, x7]\n" - "smax v23.4s, v23.4s, v19.4s\n" - "srshl v16.4s, v16.4s, v30.4s\n" - "and v24.16b, v22.16b, v9.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "uzp1 v20.16b, v20.16b, v23.16b\n" - "add v16.4s, v16.4s, v11.4s\n" - "sqrdmulh v17.4s, v17.4s, v21.4s\n" - "uzp1 v20.16b, v20.16b, v20.16b\n" - "str d20, [x13, x7]\n" - "smin v16.4s, v16.4s, v14.4s\n" - "sqrdmulh v18.4s, v18.4s, v31.4s\n" - "sqadd v22.4s, v22.4s, v24.4s\n" - "and v2.16b, v17.16b, v30.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "smax v16.4s, v16.4s, v19.4s\n" - "srshl v22.4s, v22.4s, v9.4s\n" - "and v31.16b, v18.16b, v9.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "add v22.4s, v22.4s, v11.4s\n" - "sqadd v17.4s, v17.4s, v2.4s\n" - "smin v22.4s, v22.4s, v14.4s\n" - "srshl v17.4s, v17.4s, v30.4s\n" - "sqadd v18.4s, v18.4s, v31.4s\n" - "smax v22.4s, v22.4s, v19.4s\n" - "uzp1 v16.16b, v16.16b, v22.16b\n" - "add v17.4s, v17.4s, v11.4s\n" - "srshl v18.4s, v18.4s, v9.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "str d16, [x12, x7]\n" - "smin v17.4s, v17.4s, v14.4s\n" - "add v18.4s, v18.4s, v11.4s\n" - "smax v17.4s, v17.4s, v19.4s\n" - "smin v18.4s, v18.4s, v14.4s\n" - "smax v18.4s, v18.4s, v19.4s\n" - "uzp1 v17.16b, v17.16b, v18.16b\n" - "uzp1 v17.16b, v17.16b, v17.16b\n" - "str d17, [x11, x7]\n" - "add x7, x7, #0x8\n" + "str d15, [x10, x14]\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str d9, [x9, x14]\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d22, [x28, x14]\n" + "str d23, [x27, x14]\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "ldr q15, [x19, #0x0]\n" - "mov v20.16b, v15.16b\n" + "add x14, x14, #0x8\n" "ldr q10, [x19, #0x10]\n" "add x19, x19, #0x20\n" - "mov v16.16b, v15.16b\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" - "mov v17.16b, v15.16b\n" - "ldr d0, [x6, #0x0]\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v9.16b, v15.16b\n" + "mov v16.16b, v10.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v22.16b, v15.16b\n" + "mov v21.16b, v10.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v23.16b, v15.16b\n" + "mov v18.16b, v10.16b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" "ssubl v0.8h, v0.8b, v13.8b\n" - "mov v23.16b, v10.16b\n" - "ldr d1, [x6, #0x8]\n" - "mov v22.16b, v10.16b\n" - "ldr d2, [x6, #0x10]\n" 
"ssubl v1.8h, v1.8b, v13.8b\n" - "mov v18.16b, v10.16b\n" - "ldr d3, [x6, #0x18]\n" - "ldr d4, [x6, #0x20]\n" + "ldp x26, x25, [x12, #0x0]\n" + "ldp x24, x23, [x12, #0x10]\n" "ssubl v2.8h, v2.8b, v13.8b\n" - "ldr d5, [x6, #0x28]\n" "ssubl v3.8h, v3.8b, v13.8b\n" - "ldr d6, [x6, #0x30]\n" - "ldr d7, [x6, #0x38]\n" + "ldp x22, x21, [x12, #0x20]\n" + "ldp x20, x19, [x12, #0x30]\n" "ssubl v4.8h, v4.8b, v13.8b\n" - "ldr d8, [x6, #0x40]\n" "ssubl v5.8h, v5.8b, v13.8b\n" - "ldp x26, x25, [x8, #0x0]\n" + "ldr d31, [x26, x15]\n" + "ldr d30, [x25, x15]\n" "ssubl v6.8h, v6.8b, v13.8b\n" - "ldp x24, x23, [x8, #0x10]\n" "ssubl v7.8h, v7.8b, v13.8b\n" + "ldr d29, [x24, x15]\n" + "ldr d28, [x23, x15]\n" "ssubl v8.8h, v8.8b, v13.8b\n" - "ldp x22, x21, [x8, #0x20]\n" - "ldp x20, x19, [x8, #0x30]\n" - "ldr d31, [x26, x5]\n" "ssubl v31.8h, v31.8b, v12.8b\n" - "ldr d30, [x25, x5]\n" - "ldr d29, [x24, x5]\n" + "ldr d27, [x22, x15]\n" + "ldr d26, [x21, x15]\n" "ssubl v30.8h, v30.8b, v12.8b\n" - "ldr d28, [x23, x5]\n" - "ldr d27, [x22, x5]\n" "ssubl v29.8h, v29.8b, v12.8b\n" - "ldr d26, [x21, x5]\n" + "ldr d25, [x20, x15]\n" + "ldr d24, [x19, x15]\n" "ssubl v28.8h, v28.8b, v12.8b\n" - "ldr d25, [x20, x5]\n" - "ldr d24, [x19, x5]\n" "ssubl v27.8h, v27.8b, v12.8b\n" "ssubl v26.8h, v26.8b, v12.8b\n" "ssubl v25.8h, v25.8b, v12.8b\n" @@ -436,275 +428,267 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "bgt 1b\n" "2:" // Tail "smlal v15.4s, v31.4h, v8.4h\n" - "ldr x23, [x8, #0x40]\n" - "tst x4, #0x7\n" "smlal2 v10.4s, v31.8h, v8.8h\n" - "ldr x22, [x8, #0x48]\n" - "smlal v20.4s, v31.4h, v6.4h\n" - "ldr x21, [x8, #0x50]\n" - "smlal2 v23.4s, v31.8h, v6.8h\n" - "ldr x20, [x8, #0x58]\n" - "smlal v16.4s, v31.4h, v2.4h\n" - "ldr x19, [x8, #0x60]\n" - "smlal2 v22.4s, v31.8h, v2.8h\n" - "ldr x10, [x8, #0x68]\n" - "smlal v17.4s, v31.4h, v0.4h\n" - "ldr x9, [x8, #0x70]\n" - "smlal2 v18.4s, v31.8h, v0.8h\n" - "ldr x28, [x8, #0x78]\n" + "ldr x24, [x12, #0x40]\n" + "ldr x23, [x12, #0x48]\n" + "smlal v9.4s, v31.4h, v6.4h\n" + "smlal2 v16.4s, v31.8h, v6.8h\n" + "ldr x21, [x12, #0x50]\n" + "ldr x19, [x12, #0x58]\n" "smlal v15.4s, v30.4h, v0.4h\n" - "ldr x27, [x8, #0x80]\n" "smlal2 v10.4s, v30.8h, v0.8h\n" - "ldr x26, [x8, #0x88]\n" - "smlal v20.4s, v28.4h, v1.4h\n" - "ldr x25, [x8, #0x90]\n" - "smlal2 v23.4s, v28.8h, v1.8h\n" - "ldr d28, [x22, x5]\n" + "ldr x22, [x12, #0x78]\n" + "ldr x20, [x12, #0x60]\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "ldr d28, [x23, x15]\n" "ssubl v28.8h, v28.8b, v12.8b\n" "smlal v15.4s, v29.4h, v1.4h\n" - "ldr x24, [x8, #0x98]\n" "smlal2 v10.4s, v29.8h, v1.8h\n" - "ldr d29, [x23, x5]\n" + "ldr d29, [x24, x15]\n" "ssubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v27.4h, v2.4h\n" - "ldr x23, [x8, #0xa0]\n" - "smlal2 v23.4s, v27.8h, v2.8h\n" - "ldr d27, [x21, x5]\n" + "smlal v9.4s, v27.4h, v2.4h\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x15]\n" "ssubl v27.8h, v27.8b, v12.8b\n" "smlal v15.4s, v26.4h, v3.4h\n" - "ldr x22, [x8, #0xa8]\n" "smlal2 v10.4s, v26.8h, v3.8h\n" - "ldr d26, [x20, x5]\n" + "ldr d26, [x19, x15]\n" "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v9.4s, v24.4h, v0.4h\n" + "smlal2 v16.4s, v24.8h, v0.8h\n" + "ldr x21, [x12, #0x80]\n" + "ldr x19, [x12, #0x68]\n" "smlal v15.4s, v25.4h, v4.4h\n" - "ldr x21, [x8, #0xb0]\n" "smlal2 v10.4s, v25.8h, v4.8h\n" - "ldr d25, [x19, x5]\n" + "ldr d25, [x20, x15]\n" "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v9.4s, v29.4h, v4.4h\n" + "smlal2 v16.4s, v29.8h, v4.8h\n" + "ldr x20, [x12, #0x88]\n" + "ldr d29, [x19, 
x15]\n" "smlal v15.4s, v24.4h, v2.4h\n" - "ldr x20, [x8, #0xb8]\n" "smlal2 v10.4s, v24.8h, v2.8h\n" - "ldr x19, [x8, #0xc0]\n" - "smlal v20.4s, v24.4h, v0.4h\n" - "ldr q21, [x17, #0x0]\n" - "smlal2 v23.4s, v24.8h, v0.8h\n" - "ldr d24, [x9, x5]\n" - "ssubl v24.8h, v24.8b, v12.8b\n" - "smlal v20.4s, v29.4h, v4.4h\n" - "ldr q30, [x15, #0x0]\n" - "smlal2 v23.4s, v29.8h, v4.8h\n" - "ldr d29, [x10, x5]\n" + "ldr x19, [x12, #0x70]\n" "ssubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v28.4h, v5.4h\n" - "ldr q31, [x17, #0x10]\n" - "smlal2 v23.4s, v28.8h, v5.8h\n" - "ldr d28, [x27, x5]\n" - "add x17, x17, #0x20\n" + "smlal v9.4s, v28.4h, v5.4h\n" + "smlal2 v16.4s, v28.8h, v5.8h\n" + "ldr d28, [x21, x15]\n" + "ssubl v28.8h, v28.8b, v12.8b\n" + "smlal v22.4s, v31.4h, v2.4h\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "ldr x24, [x12, #0x98]\n" + "ldr d24, [x19, x15]\n" "smlal v15.4s, v27.4h, v5.4h\n" - "ldr q9, [x15, #0x10]\n" - "add x15, x15, #0x20\n" "smlal2 v10.4s, v27.8h, v5.8h\n" - "ssubl v28.8h, v28.8b, v12.8b\n" - "smlal v20.4s, v27.4h, v3.4h\n" - "smlal2 v23.4s, v27.8h, v3.8h\n" - "ldr d27, [x28, x5]\n" + "ssubl v24.8h, v24.8b, v12.8b\n" + "ldr x23, [x12, #0x90]\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "ldr d27, [x22, x15]\n" "ssubl v27.8h, v27.8b, v12.8b\n" - "smlal v16.4s, v26.4h, v3.4h\n" - "smlal2 v22.4s, v26.8h, v3.8h\n" - "ldr d26, [x26, x5]\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal v22.4s, v26.4h, v3.4h\n" + "ldr x22, [x12, #0xa8]\n" + "ldr x19, [x12, #0xa0]\n" + "smlal2 v21.4s, v26.8h, v3.8h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr d26, [x20, x15]\n" "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v23.4s, v27.4h, v4.4h\n" + "smlal v22.4s, v25.4h, v0.4h\n" + "ldr x21, [x12, #0xb0]\n" + "ldr x20, [x12, #0xb8]\n" + "smlal2 v21.4s, v25.8h, v0.8h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "ldr d27, [x19, x15]\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "smlal v23.4s, v28.4h, v1.4h\n" "smlal v15.4s, v25.4h, v6.4h\n" + "ldr x19, [x12, #0xc0]\n" + "ldr q19, [x13, #0x0]\n" "smlal2 v10.4s, v25.8h, v6.8h\n" - "smlal v16.4s, v25.4h, v0.4h\n" - "smlal2 v22.4s, v25.8h, v0.8h\n" - "ldr d25, [x25, x5]\n" + "smlal v22.4s, v29.4h, v4.4h\n" + "ldr d25, [x23, x15]\n" "ssubl v25.8h, v25.8b, v12.8b\n" - "smlal v16.4s, v29.4h, v4.4h\n" - "smlal2 v22.4s, v29.8h, v4.8h\n" - "ldr d29, [x24, x5]\n" + "smlal2 v21.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x15]\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v23.4s, v26.4h, v5.4h\n" "smlal v15.4s, v24.4h, v7.4h\n" + "ldr q0, [x11, #0x0]\n" + "ldr q4, [x13, #0x10]\n" "smlal2 v10.4s, v24.8h, v7.8h\n" - "smlal v16.4s, v24.4h, v1.4h\n" - "smlal2 v22.4s, v24.8h, v1.8h\n" - "ldr d24, [x22, x5]\n" - "ssubl v24.8h, v24.8b, v12.8b\n" - "smlal v17.4s, v27.4h, v4.4h\n" - "smlal2 v18.4s, v27.8h, v4.8h\n" - "ldr d27, [x23, x5]\n" - "ssubl v27.8h, v27.8b, v12.8b\n" - "smlal v20.4s, v28.4h, v7.4h\n" - "smlal2 v23.4s, v28.8h, v7.8h\n" - "smlal v17.4s, v28.4h, v1.4h\n" - "smlal2 v18.4s, v28.8h, v1.8h\n" - "smlal v16.4s, v25.4h, v6.4h\n" - "smlal2 v22.4s, v25.8h, v6.8h\n" - "ldr d25, [x20, x5]\n" - "ssubl v25.8h, v25.8b, v12.8b\n" - "smlal v17.4s, v26.4h, v5.4h\n" + "smlal v22.4s, v24.4h, v1.4h\n" + "sqrdmulh v15.4s, v15.4s, v19.4s\n" + "ldr q31, [x11, #0x10]\n" + "smlal2 v21.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x15]\n" "smlal2 v18.4s, v26.8h, v5.8h\n" - "ldr d26, [x21, x5]\n" - "ssubl v26.8h, v26.8b, v12.8b\n" - "smlal v20.4s, v29.4h, v8.4h\n" - "smlal2 v23.4s, v29.8h, v8.8h\n" - "smlal v17.4s, v29.4h, v2.4h\n" + "ssubl v24.8h, 
v24.8b, v12.8b\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "ldr d26, [x21, x15]\n" "smlal2 v18.4s, v29.8h, v2.8h\n" - "ldr d29, [x19, x5]\n" - "add x5, x5, #0x8\n" - "smlal v16.4s, v27.4h, v7.4h\n" - "ssubl v29.8h, v29.8b, v12.8b\n" - "smlal2 v22.4s, v27.8h, v7.8h\n" - "smlal v17.4s, v24.4h, v3.4h\n" - "smlal v16.4s, v24.4h, v5.4h\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v22.4s, v25.4h, v6.4h\n" + "smlal v23.4s, v24.4h, v3.4h\n" + "and v30.16b, v15.16b, v0.16b\n" + "tst x8, #0x7\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "smlal2 v16.4s, v28.8h, v7.8h\n" + "sqdmulh v10.4s, v10.4s, v4.4s\n" + "add x13, x13, #0x20\n" + "smlal2 v21.4s, v25.8h, v6.8h\n" + "ldr d25, [x20, x15]\n" "smlal2 v18.4s, v24.8h, v3.8h\n" - "sqrdmulh v15.4s, v15.4s, v21.4s\n" - "smlal2 v22.4s, v24.8h, v5.8h\n" - "smlal v17.4s, v26.4h, v7.4h\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v22.4s, v27.4h, v7.4h\n" + "smlal v23.4s, v26.4h, v7.4h\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "add x11, x11, #0x20\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v16.4s, v29.8h, v8.8h\n" + "ldr d29, [x19, x15]\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal2 v21.4s, v27.8h, v7.8h\n" "smlal2 v18.4s, v26.8h, v7.8h\n" - "smlal v16.4s, v25.4h, v8.4h\n" - "smlal2 v22.4s, v25.8h, v8.8h\n" - "smlal v17.4s, v25.4h, v6.4h\n" + "sqdmulh v9.4s, v9.4s, v19.4s\n" + "add x15, x15, #0x8\n" + "smlal v22.4s, v24.4h, v5.4h\n" + "smlal v23.4s, v25.4h, v6.4h\n" + "and v28.16b, v9.16b, v0.16b\n" + "smlal2 v21.4s, v24.8h, v5.8h\n" "smlal2 v18.4s, v25.8h, v6.8h\n" - "and v26.16b, v15.16b, v30.16b\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "smlal v17.4s, v29.4h, v8.4h\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "smlal v22.4s, v25.4h, v8.4h\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "sqdmulh v22.4s, v22.4s, v19.4s\n" + "smlal2 v21.4s, v25.8h, v8.8h\n" "smlal2 v18.4s, v29.8h, v8.8h\n" - "sqrdmulh v10.4s, v10.4s, v31.4s\n" - "sqrdmulh v20.4s, v20.4s, v21.4s\n" - "sqrdmulh v23.4s, v23.4s, v31.4s\n" - "sqrdmulh v16.4s, v16.4s, v21.4s\n" - "sqadd v15.4s, v15.4s, v26.4s\n" - "and v8.16b, v10.16b, v9.16b\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "srshl v15.4s, v15.4s, v30.4s\n" - "and v4.16b, v20.16b, v30.16b\n" + "sqdmulh v23.4s, v23.4s, v19.4s\n" + "and v29.16b, v22.16b, v0.16b\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "and v20.16b, v23.16b, v0.16b\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "and v19.16b, v10.16b, v31.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "and v4.16b, v16.16b, v31.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v5.16b, v21.16b, v31.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v26.16b, v18.16b, v31.16b\n" + "sqadd v15.4s, v15.4s, v30.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v28.4s\n" "sshr v4.4s, v4.4s, #0x1f\n" - "and v2.16b, v23.16b, v9.16b\n" - "and v1.16b, v16.16b, v30.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "add v15.4s, v15.4s, v11.4s\n" - "sqadd v10.4s, v10.4s, v8.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqrdmulh v22.4s, v22.4s, v31.4s\n" - "sqadd v20.4s, v20.4s, v4.4s\n" - "smin v15.4s, v15.4s, v14.4s\n" - "srshl v10.4s, v10.4s, v9.4s\n" - "sqadd v23.4s, v23.4s, v2.4s\n" - "smax v15.4s, v15.4s, v19.4s\n" - "srshl v20.4s, v20.4s, v30.4s\n" - "add v10.4s, v10.4s, v11.4s\n" - "srshl v23.4s, v23.4s, v9.4s\n" - "sqadd v16.4s, v16.4s, v1.4s\n" - "smin v10.4s, v10.4s, v14.4s\n" - "add v20.4s, v20.4s, v11.4s\n" - "add v23.4s, v23.4s, v11.4s\n" - "smax v10.4s, v10.4s, v19.4s\n" - "smin v20.4s, v20.4s, v14.4s\n" - "smin v23.4s, v23.4s, v14.4s\n" - "uzp1 v15.16b, v15.16b, v10.16b\n" - "smax v20.4s, v20.4s, v19.4s\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "sshr 
v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v20.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v19.4s\n" + "srshl v9.4s, v9.4s, v0.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "sqadd v18.4s, v18.4s, v26.4s\n" + "srshl v10.4s, v10.4s, v31.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v16.4s, v16.4s, v31.4s\n" + "sqxtn v9.4h, v9.4s\n" + "srshl v21.4s, v21.4s, v31.4s\n" + "sqxtn v22.4h, v22.4s\n" + "srshl v18.4s, v18.4s, v31.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v15.8h, v10.4s\n" + "sqxtn2 v9.8h, v16.4s\n" + "sqxtn2 v22.8h, v21.4s\n" + "sqxtn2 v23.8h, v18.4s\n" + "sqadd v15.8h, v15.8h, v11.8h\n" + "sqadd v9.8h, v9.8h, v11.8h\n" + "sqadd v22.8h, v22.8h, v11.8h\n" + "sqadd v23.8h, v23.8h, v11.8h\n" + "smax v15.8h, v15.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smax v22.8h, v22.8h, v17.8h\n" + "smax v23.8h, v23.8h, v17.8h\n" + "smin v15.8h, v15.8h, v14.8h\n" + "smin v9.8h, v9.8h, v14.8h\n" + "smin v22.8h, v22.8h, v14.8h\n" + "smin v23.8h, v23.8h, v14.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" - "str d15, [x14, x7]\n" - "smax v23.4s, v23.4s, v19.4s\n" - "srshl v16.4s, v16.4s, v30.4s\n" - "and v24.16b, v22.16b, v9.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "uzp1 v20.16b, v20.16b, v23.16b\n" - "add v16.4s, v16.4s, v11.4s\n" - "sqrdmulh v17.4s, v17.4s, v21.4s\n" - "uzp1 v20.16b, v20.16b, v20.16b\n" - "str d20, [x13, x7]\n" - "smin v16.4s, v16.4s, v14.4s\n" - "sqrdmulh v18.4s, v18.4s, v31.4s\n" - "sqadd v22.4s, v22.4s, v24.4s\n" - "and v2.16b, v17.16b, v30.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "smax v16.4s, v16.4s, v19.4s\n" - "srshl v22.4s, v22.4s, v9.4s\n" - "and v31.16b, v18.16b, v9.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "add v22.4s, v22.4s, v11.4s\n" - "sqadd v17.4s, v17.4s, v2.4s\n" - "smin v22.4s, v22.4s, v14.4s\n" - "srshl v17.4s, v17.4s, v30.4s\n" - "sqadd v18.4s, v18.4s, v31.4s\n" - "smax v22.4s, v22.4s, v19.4s\n" - "uzp1 v16.16b, v16.16b, v22.16b\n" - "add v17.4s, v17.4s, v11.4s\n" - "srshl v18.4s, v18.4s, v9.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "str d16, [x12, x7]\n" - "smin v17.4s, v17.4s, v14.4s\n" - "add v18.4s, v18.4s, v11.4s\n" - "smax v17.4s, v17.4s, v19.4s\n" - "smin v18.4s, v18.4s, v14.4s\n" - "smax v18.4s, v18.4s, v19.4s\n" - "uzp1 v17.16b, v17.16b, v18.16b\n" - "uzp1 v17.16b, v17.16b, v17.16b\n" - "str d17, [x11, x7]\n" - "add x7, x7, #0x8\n" + "str d15, [x10, x14]\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str d9, [x9, x14]\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d22, [x28, x14]\n" + "str d23, [x27, x14]\n" + "add x14, x14, #0x8\n" "beq 88f\n" - "add x6, x6, #0x48\n" + "add x17, x17, #0x48\n" "3:" // Oddments "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" - "tbz x4, #2, 5f\n" + "tbz x8, #2, 5f\n" "ld1 { v15.4s }, [x19], #0x10\n" - "tbz x4, #1, 4f\n" + "tbz x8, #1, 4f\n" "ld1 { v10.d }[0], [x19], #0x8\n" - "tbz x4, #0, 7f\n" + "tbz x8, #0, 7f\n" "ld1 { v10.s }[2], [x19]\n" "b 7f\n" "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset - "tbz x4, #0, 7f\n" + "tbz x8, #0, 7f\n" "ld1 { v10.s }[0], [x19]\n" "b 7f\n" "5:" // Oddments: Load bias: Bit 2: Unset - "tbz x4, #1, 6f\n" + "tbz x8, #1, 6f\n" "ld1 { v15.d }[0], [x19], #0x8\n" - "tbz x4, #0, 7f\n" + "tbz x8, #0, 7f\n" "ld1 { v15.s }[2], [x19]\n" "b 7f\n" "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 7f\n" + "tbz x8, #0, 7f\n" "ld1 { v15.s }[0], [x19]\n" "7:" // Oddments: Load bias: Bit 2: End - 
"mov v20.16b, v15.16b\n" - "ldr d0, [x6, #0x0]\n" - "mov v23.16b, v10.16b\n" - "ldr d1, [x6, #0x8]\n" - "mov v16.16b, v15.16b\n" - "ldr d2, [x6, #0x10]\n" - "mov v22.16b, v10.16b\n" - "ldr d3, [x6, #0x18]\n" - "mov v17.16b, v15.16b\n" - "ldr d4, [x6, #0x20]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "mov v9.16b, v15.16b\n" + "mov v16.16b, v10.16b\n" + "ldr d2, [x17, #0x10]\n" + "ldr d3, [x17, #0x18]\n" + "mov v22.16b, v15.16b\n" + "mov v21.16b, v10.16b\n" + "ldr d4, [x17, #0x20]\n" + "ldr d5, [x17, #0x28]\n" + "mov v23.16b, v15.16b\n" "mov v18.16b, v10.16b\n" - "ldr d5, [x6, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "ldr d7, [x17, #0x38]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" "ssubl v1.8h, v1.8b, v13.8b\n" - "ldr d6, [x6, #0x30]\n" + "ldr d8, [x17, #0x40]\n" + "ldp x26, x25, [x12, #0x0]\n" "ssubl v2.8h, v2.8b, v13.8b\n" - "ldr d7, [x6, #0x38]\n" "ssubl v3.8h, v3.8b, v13.8b\n" - "ldr d8, [x6, #0x40]\n" + "ldp x24, x23, [x12, #0x10]\n" + "ldp x22, x21, [x12, #0x20]\n" "ssubl v4.8h, v4.8b, v13.8b\n" - "ldp x26, x25, [x8, #0x0]\n" "ssubl v5.8h, v5.8b, v13.8b\n" - "ldp x24, x23, [x8, #0x10]\n" + "ldp x20, x19, [x12, #0x30]\n" "ssubl v6.8h, v6.8b, v13.8b\n" "ssubl v7.8h, v7.8b, v13.8b\n" - "ldp x22, x21, [x8, #0x20]\n" "ssubl v8.8h, v8.8b, v13.8b\n" - "ldp x20, x19, [x8, #0x30]\n" - "add x26, x26, x5\n" - "add x25, x25, x5\n" - "add x24, x24, x5\n" - "add x23, x23, x5\n" - "add x22, x22, x5\n" - "add x21, x21, x5\n" - "add x20, x20, x5\n" - "add x19, x19, x5\n" - "tbz x4, #2, 9f\n" + "add x26, x26, x15\n" + "add x25, x25, x15\n" + "add x24, x24, x15\n" + "add x23, x23, x15\n" + "add x22, x22, x15\n" + "add x21, x21, x15\n" + "add x20, x20, x15\n" + "add x19, x19, x15\n" + "tbz x8, #2, 9f\n" "ld1 { v31.s }[0], [x26], #0x4\n" "ld1 { v30.s }[0], [x25], #0x4\n" "ld1 { v29.s }[0], [x24], #0x4\n" @@ -713,7 +697,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v26.s }[0], [x21], #0x4\n" "ld1 { v25.s }[0], [x20], #0x4\n" "ld1 { v24.s }[0], [x19], #0x4\n" - "tbz x4, #1, 8f\n" + "tbz x8, #1, 8f\n" "ld1 { v31.h }[2], [x26], #0x2\n" "ld1 { v30.h }[2], [x25], #0x2\n" "ld1 { v29.h }[2], [x24], #0x2\n" @@ -722,7 +706,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v26.h }[2], [x21], #0x2\n" "ld1 { v25.h }[2], [x20], #0x2\n" "ld1 { v24.h }[2], [x19], #0x2\n" - "tbz x4, #0, 11f\n" + "tbz x8, #0, 11f\n" "ld1 { v31.b }[6], [x26]\n" "ld1 { v30.b }[6], [x25]\n" "ld1 { v29.b }[6], [x24]\n" @@ -733,7 +717,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v24.b }[6], [x19]\n" "b 11f\n" "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset - "tbz x4, #0, 11f\n" + "tbz x8, #0, 11f\n" "ld1 { v31.b }[4], [x26]\n" "ld1 { v30.b }[4], [x25]\n" "ld1 { v29.b }[4], [x24]\n" @@ -744,7 +728,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v24.b }[4], [x19]\n" "b 11f\n" "9:" // Oddments: Initial loads: Bit 2: Unset - "tbz x4, #1, 10f\n" + "tbz x8, #1, 10f\n" "ld1 { v31.h }[0], [x26], #0x2\n" "ld1 { v30.h }[0], [x25], #0x2\n" "ld1 { v29.h }[0], [x24], #0x2\n" @@ -753,7 +737,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v26.h }[0], [x21], #0x2\n" "ld1 { v25.h }[0], [x20], #0x2\n" "ld1 { v24.h }[0], [x19], #0x2\n" - "tbz x4, #0, 11f\n" + "tbz x8, #0, 11f\n" "ld1 { v31.b }[2], [x26]\n" "ld1 { v30.b }[2], [x25]\n" "ld1 { v29.b }[2], [x24]\n" @@ -764,7 +748,7 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v24.b }[2], [x19]\n" "b 11f\n" "10:" // Oddments: Initial loads: Bit 2: Unset: 
Bit 1: Unset - "tbz x4, #0, 11f\n" + "tbz x8, #0, 11f\n" "ld1 { v31.b }[0], [x26]\n" "ld1 { v30.b }[0], [x25]\n" "ld1 { v29.b }[0], [x24]\n" @@ -774,646 +758,636 @@ void a64_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v25.b }[0], [x20]\n" "ld1 { v24.b }[0], [x19]\n" "11:" // Oddments: Initial loads: Bit 2: End - "ldr x23, [x8, #0x40]\n" "ssubl v31.8h, v31.8b, v12.8b\n" "smlal v15.4s, v31.4h, v8.4h\n" - "ssubl v30.8h, v30.8b, v12.8b\n" "smlal2 v10.4s, v31.8h, v8.8h\n" - "ssubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v31.4h, v6.4h\n" - "ssubl v28.8h, v28.8b, v12.8b\n" - "smlal2 v23.4s, v31.8h, v6.8h\n" - "ssubl v27.8h, v27.8b, v12.8b\n" - "smlal v16.4s, v31.4h, v2.4h\n" - "ssubl v26.8h, v26.8b, v12.8b\n" - "smlal2 v22.4s, v31.8h, v2.8h\n" - "ssubl v25.8h, v25.8b, v12.8b\n" - "smlal v17.4s, v31.4h, v0.4h\n" - "ssubl v24.8h, v24.8b, v12.8b\n" - "smlal2 v18.4s, v31.8h, v0.8h\n" - "add x23, x23, x5\n" + "ldr x24, [x12, #0x40]\n" + "ssubl v30.8h, v30.8b, v12.8b\n" "smlal v15.4s, v30.4h, v0.4h\n" "smlal2 v10.4s, v30.8h, v0.8h\n" - "smlal v20.4s, v28.4h, v1.4h\n" - "smlal2 v23.4s, v28.8h, v1.8h\n" + "add x24, x24, x15\n" + "ssubl v29.8h, v29.8b, v12.8b\n" + "smlal v9.4s, v31.4h, v6.4h\n" + "smlal2 v16.4s, v31.8h, v6.8h\n" "smlal v15.4s, v29.4h, v1.4h\n" "smlal2 v10.4s, v29.8h, v1.8h\n" - "smlal v20.4s, v27.4h, v2.4h\n" - "smlal2 v23.4s, v27.8h, v2.8h\n" + "ssubl v28.8h, v28.8b, v12.8b\n" + "ssubl v26.8h, v26.8b, v12.8b\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" "smlal v15.4s, v26.4h, v3.4h\n" "smlal2 v10.4s, v26.8h, v3.8h\n" - "smlal v20.4s, v24.4h, v0.4h\n" - "smlal2 v23.4s, v24.8h, v0.8h\n" + "ssubl v27.8h, v27.8b, v12.8b\n" + "ssubl v25.8h, v25.8b, v12.8b\n" + "smlal v9.4s, v27.4h, v2.4h\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" "smlal v15.4s, v25.4h, v4.4h\n" "smlal2 v10.4s, v25.8h, v4.8h\n" + "ssubl v24.8h, v24.8b, v12.8b\n" + "smlal v22.4s, v31.4h, v2.4h\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" "smlal v15.4s, v24.4h, v2.4h\n" "smlal2 v10.4s, v24.8h, v2.8h\n" - "tbz x4, #2, 13f\n" - "ld1 { v29.s }[0], [x23], #0x4\n" - "tbz x4, #1, 12f\n" - "ld1 { v29.h }[2], [x23], #0x2\n" - "tbz x4, #0, 15f\n" - "ld1 { v29.b }[6], [x23]\n" + "smlal v9.4s, v24.4h, v0.4h\n" + "smlal2 v16.4s, v24.8h, v0.8h\n" + "tbz x8, #2, 13f\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "tbz x8, #1, 12f\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[6], [x24]\n" "b 15f\n" "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 15f\n" - "ld1 { v29.b }[4], [x23]\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[4], [x24]\n" "b 15f\n" "13:" // Oddments: Load (1, 3): Bit 2: Unset - "tbz x4, #1, 14f\n" - "ld1 { v29.h }[0], [x23], #0x2\n" - "tbz x4, #0, 15f\n" - "ld1 { v29.b }[2], [x23]\n" + "tbz x8, #1, 14f\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[2], [x24]\n" "b 15f\n" "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 15f\n" - "ld1 { v29.b }[0], [x23]\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[0], [x24]\n" "15:" // Oddments: Load (1, 3): Bit 2: End - "ldr x22, [x8, #0x48]\n" "ssubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v29.4h, v4.4h\n" - "smlal2 v23.4s, v29.8h, v4.8h\n" - "add x22, x22, x5\n" - "tbz x4, #2, 17f\n" - "ld1 { v28.s }[0], [x22], #0x4\n" - "tbz x4, #1, 16f\n" - "ld1 { v28.h }[2], [x22], #0x2\n" - "tbz x4, #0, 19f\n" - "ld1 { v28.b }[6], [x22]\n" + "ldr x23, [x12, #0x48]\n" + "smlal v9.4s, v29.4h, v4.4h\n" + "smlal2 v16.4s, 
v29.8h, v4.8h\n" + "add x23, x23, x15\n" + "tbz x8, #2, 17f\n" + "ld1 { v28.s }[0], [x23], #0x4\n" + "tbz x8, #1, 16f\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[6], [x23]\n" "b 19f\n" "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 19f\n" - "ld1 { v28.b }[4], [x22]\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[4], [x23]\n" "b 19f\n" "17:" // Oddments: Load (1, 4): Bit 2: Unset - "tbz x4, #1, 18f\n" - "ld1 { v28.h }[0], [x22], #0x2\n" - "tbz x4, #0, 19f\n" - "ld1 { v28.b }[2], [x22]\n" + "tbz x8, #1, 18f\n" + "ld1 { v28.h }[0], [x23], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[2], [x23]\n" "b 19f\n" "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 19f\n" - "ld1 { v28.b }[0], [x22]\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[0], [x23]\n" "19:" // Oddments: Load (1, 4): Bit 2: End - "ldr x21, [x8, #0x50]\n" "ssubl v28.8h, v28.8b, v12.8b\n" - "smlal v20.4s, v28.4h, v5.4h\n" - "smlal2 v23.4s, v28.8h, v5.8h\n" - "add x21, x21, x5\n" - "tbz x4, #2, 21f\n" + "ldr x21, [x12, #0x50]\n" + "smlal v9.4s, v28.4h, v5.4h\n" + "smlal2 v16.4s, v28.8h, v5.8h\n" + "add x21, x21, x15\n" + "tbz x8, #2, 21f\n" "ld1 { v27.s }[0], [x21], #0x4\n" - "tbz x4, #1, 20f\n" + "tbz x8, #1, 20f\n" "ld1 { v27.h }[2], [x21], #0x2\n" - "tbz x4, #0, 23f\n" + "tbz x8, #0, 23f\n" "ld1 { v27.b }[6], [x21]\n" "b 23f\n" "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 23f\n" + "tbz x8, #0, 23f\n" "ld1 { v27.b }[4], [x21]\n" "b 23f\n" "21:" // Oddments: Load (1, 2): Bit 2: Unset - "tbz x4, #1, 22f\n" + "tbz x8, #1, 22f\n" "ld1 { v27.h }[0], [x21], #0x2\n" - "tbz x4, #0, 23f\n" + "tbz x8, #0, 23f\n" "ld1 { v27.b }[2], [x21]\n" "b 23f\n" "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 23f\n" + "tbz x8, #0, 23f\n" "ld1 { v27.b }[0], [x21]\n" "23:" // Oddments: Load (1, 2): Bit 2: End - "ldr x20, [x8, #0x58]\n" "ssubl v27.8h, v27.8b, v12.8b\n" + "ldr x19, [x12, #0x58]\n" "smlal v15.4s, v27.4h, v5.4h\n" "smlal2 v10.4s, v27.8h, v5.8h\n" - "add x20, x20, x5\n" - "smlal v20.4s, v27.4h, v3.4h\n" - "smlal2 v23.4s, v27.8h, v3.8h\n" - "tbz x4, #2, 25f\n" - "ld1 { v26.s }[0], [x20], #0x4\n" - "tbz x4, #1, 24f\n" - "ld1 { v26.h }[2], [x20], #0x2\n" - "tbz x4, #0, 27f\n" - "ld1 { v26.b }[6], [x20]\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 25f\n" + "ld1 { v26.s }[0], [x19], #0x4\n" + "tbz x8, #1, 24f\n" + "ld1 { v26.h }[2], [x19], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v26.b }[6], [x19]\n" "b 27f\n" "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 27f\n" - "ld1 { v26.b }[4], [x20]\n" + "tbz x8, #0, 27f\n" + "ld1 { v26.b }[4], [x19]\n" "b 27f\n" "25:" // Oddments: Load (3, 0): Bit 2: Unset - "tbz x4, #1, 26f\n" - "ld1 { v26.h }[0], [x20], #0x2\n" - "tbz x4, #0, 27f\n" - "ld1 { v26.b }[2], [x20]\n" + "tbz x8, #1, 26f\n" + "ld1 { v26.h }[0], [x19], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v26.b }[2], [x19]\n" "b 27f\n" "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 27f\n" - "ld1 { v26.b }[0], [x20]\n" + "tbz x8, #0, 27f\n" + "ld1 { v26.b }[0], [x19]\n" "27:" // Oddments: Load (3, 0): Bit 2: End - "ldr x19, [x8, #0x60]\n" "ssubl v26.8h, v26.8b, v12.8b\n" - "smlal v16.4s, v26.4h, v3.4h\n" - "smlal2 v22.4s, v26.8h, v3.8h\n" - "add x19, x19, x5\n" - "tbz x4, #2, 29f\n" - "ld1 { v25.s }[0], [x19], #0x4\n" - "tbz x4, #1, 28f\n" - "ld1 { v25.h }[2], [x19], #0x2\n" - "tbz x4, #0, 31f\n" - "ld1 { v25.b }[6], [x19]\n" + "ldr x20, 
[x12, #0x60]\n" + "smlal v22.4s, v26.4h, v3.4h\n" + "smlal2 v21.4s, v26.8h, v3.8h\n" + "add x20, x20, x15\n" + "tbz x8, #2, 29f\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "tbz x8, #1, 28f\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[6], [x20]\n" "b 31f\n" "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 31f\n" - "ld1 { v25.b }[4], [x19]\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[4], [x20]\n" "b 31f\n" "29:" // Oddments: Load (2, 0): Bit 2: Unset - "tbz x4, #1, 30f\n" - "ld1 { v25.h }[0], [x19], #0x2\n" - "tbz x4, #0, 31f\n" - "ld1 { v25.b }[2], [x19]\n" + "tbz x8, #1, 30f\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[2], [x20]\n" "b 31f\n" "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 31f\n" - "ld1 { v25.b }[0], [x19]\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[0], [x20]\n" "31:" // Oddments: Load (2, 0): Bit 2: End - "ldr x10, [x8, #0x68]\n" "ssubl v25.8h, v25.8b, v12.8b\n" + "ldr x19, [x12, #0x68]\n" "smlal v15.4s, v25.4h, v6.4h\n" "smlal2 v10.4s, v25.8h, v6.8h\n" - "add x10, x10, x5\n" - "smlal v16.4s, v25.4h, v0.4h\n" - "smlal2 v22.4s, v25.8h, v0.8h\n" - "tbz x4, #2, 33f\n" - "ld1 { v29.s }[0], [x10], #0x4\n" - "tbz x4, #1, 32f\n" - "ld1 { v29.h }[2], [x10], #0x2\n" - "tbz x4, #0, 35f\n" - "ld1 { v29.b }[6], [x10]\n" + "smlal v22.4s, v25.4h, v0.4h\n" + "smlal2 v21.4s, v25.8h, v0.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 33f\n" + "ld1 { v29.s }[0], [x19], #0x4\n" + "tbz x8, #1, 32f\n" + "ld1 { v29.h }[2], [x19], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[6], [x19]\n" "b 35f\n" "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 35f\n" - "ld1 { v29.b }[4], [x10]\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[4], [x19]\n" "b 35f\n" "33:" // Oddments: Load (3, 1): Bit 2: Unset - "tbz x4, #1, 34f\n" - "ld1 { v29.h }[0], [x10], #0x2\n" - "tbz x4, #0, 35f\n" - "ld1 { v29.b }[2], [x10]\n" + "tbz x8, #1, 34f\n" + "ld1 { v29.h }[0], [x19], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[2], [x19]\n" "b 35f\n" "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 35f\n" - "ld1 { v29.b }[0], [x10]\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[0], [x19]\n" "35:" // Oddments: Load (3, 1): Bit 2: End - "ldr x9, [x8, #0x70]\n" "ssubl v29.8h, v29.8b, v12.8b\n" - "smlal v16.4s, v29.4h, v4.4h\n" - "smlal2 v22.4s, v29.8h, v4.8h\n" - "add x9, x9, x5\n" - "tbz x4, #2, 37f\n" - "ld1 { v24.s }[0], [x9], #0x4\n" - "tbz x4, #1, 36f\n" - "ld1 { v24.h }[2], [x9], #0x2\n" - "tbz x4, #0, 39f\n" - "ld1 { v24.b }[6], [x9]\n" + "ldr x19, [x12, #0x70]\n" + "smlal v22.4s, v29.4h, v4.4h\n" + "smlal2 v21.4s, v29.8h, v4.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 37f\n" + "ld1 { v24.s }[0], [x19], #0x4\n" + "tbz x8, #1, 36f\n" + "ld1 { v24.h }[2], [x19], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[6], [x19]\n" "b 39f\n" "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 39f\n" - "ld1 { v24.b }[4], [x9]\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[4], [x19]\n" "b 39f\n" "37:" // Oddments: Load (2, 1): Bit 2: Unset - "tbz x4, #1, 38f\n" - "ld1 { v24.h }[0], [x9], #0x2\n" - "tbz x4, #0, 39f\n" - "ld1 { v24.b }[2], [x9]\n" + "tbz x8, #1, 38f\n" + "ld1 { v24.h }[0], [x19], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[2], [x19]\n" "b 39f\n" "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 39f\n" - "ld1 { v24.b }[0], [x9]\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[0], [x19]\n" "39:" // Oddments: Load (2, 1): Bit 2: End - "ldr x28, [x8, #0x78]\n" "ssubl 
v24.8h, v24.8b, v12.8b\n" + "ldr x22, [x12, #0x78]\n" "smlal v15.4s, v24.4h, v7.4h\n" "smlal2 v10.4s, v24.8h, v7.8h\n" - "add x28, x28, x5\n" - "smlal v16.4s, v24.4h, v1.4h\n" - "smlal2 v22.4s, v24.8h, v1.8h\n" - "tbz x4, #2, 41f\n" - "ld1 { v27.s }[0], [x28], #0x4\n" - "tbz x4, #1, 40f\n" - "ld1 { v27.h }[2], [x28], #0x2\n" - "tbz x4, #0, 43f\n" - "ld1 { v27.b }[6], [x28]\n" + "smlal v22.4s, v24.4h, v1.4h\n" + "smlal2 v21.4s, v24.8h, v1.8h\n" + "add x22, x22, x15\n" + "tbz x8, #2, 41f\n" + "ld1 { v27.s }[0], [x22], #0x4\n" + "tbz x8, #1, 40f\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[6], [x22]\n" "b 43f\n" "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 43f\n" - "ld1 { v27.b }[4], [x28]\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[4], [x22]\n" "b 43f\n" "41:" // Oddments: Load (3, 3): Bit 2: Unset - "tbz x4, #1, 42f\n" - "ld1 { v27.h }[0], [x28], #0x2\n" - "tbz x4, #0, 43f\n" - "ld1 { v27.b }[2], [x28]\n" + "tbz x8, #1, 42f\n" + "ld1 { v27.h }[0], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[2], [x22]\n" "b 43f\n" "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 43f\n" - "ld1 { v27.b }[0], [x28]\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[0], [x22]\n" "43:" // Oddments: Load (3, 3): Bit 2: End - "ldr x27, [x8, #0x80]\n" "ssubl v27.8h, v27.8b, v12.8b\n" - "smlal v17.4s, v27.4h, v4.4h\n" + "ldr x21, [x12, #0x80]\n" + "smlal v23.4s, v27.4h, v4.4h\n" "smlal2 v18.4s, v27.8h, v4.8h\n" - "add x27, x27, x5\n" - "tbz x4, #2, 45f\n" - "ld1 { v28.s }[0], [x27], #0x4\n" - "tbz x4, #1, 44f\n" - "ld1 { v28.h }[2], [x27], #0x2\n" - "tbz x4, #0, 47f\n" - "ld1 { v28.b }[6], [x27]\n" + "add x21, x21, x15\n" + "tbz x8, #2, 45f\n" + "ld1 { v28.s }[0], [x21], #0x4\n" + "tbz x8, #1, 44f\n" + "ld1 { v28.h }[2], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[6], [x21]\n" "b 47f\n" "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 47f\n" - "ld1 { v28.b }[4], [x27]\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[4], [x21]\n" "b 47f\n" "45:" // Oddments: Load (2, 3): Bit 2: Unset - "tbz x4, #1, 46f\n" - "ld1 { v28.h }[0], [x27], #0x2\n" - "tbz x4, #0, 47f\n" - "ld1 { v28.b }[2], [x27]\n" + "tbz x8, #1, 46f\n" + "ld1 { v28.h }[0], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[2], [x21]\n" "b 47f\n" "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 47f\n" - "ld1 { v28.b }[0], [x27]\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[0], [x21]\n" "47:" // Oddments: Load (2, 3): Bit 2: End - "ldr x26, [x8, #0x88]\n" "ssubl v28.8h, v28.8b, v12.8b\n" - "smlal v20.4s, v28.4h, v7.4h\n" - "smlal2 v23.4s, v28.8h, v7.8h\n" - "add x26, x26, x5\n" - "smlal v17.4s, v28.4h, v1.4h\n" + "ldr x20, [x12, #0x88]\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "smlal2 v16.4s, v28.8h, v7.8h\n" + "smlal v23.4s, v28.4h, v1.4h\n" "smlal2 v18.4s, v28.8h, v1.8h\n" - "tbz x4, #2, 49f\n" - "ld1 { v26.s }[0], [x26], #0x4\n" - "tbz x4, #1, 48f\n" - "ld1 { v26.h }[2], [x26], #0x2\n" - "tbz x4, #0, 51f\n" - "ld1 { v26.b }[6], [x26]\n" + "add x20, x20, x15\n" + "tbz x8, #2, 49f\n" + "ld1 { v26.s }[0], [x20], #0x4\n" + "tbz x8, #1, 48f\n" + "ld1 { v26.h }[2], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[6], [x20]\n" "b 51f\n" "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 51f\n" - "ld1 { v26.b }[4], [x26]\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[4], [x20]\n" "b 51f\n" "49:" // Oddments: Load (3, 4): Bit 2: Unset - "tbz x4, #1, 50f\n" - "ld1 { v26.h }[0], [x26], #0x2\n" - "tbz x4, #0, 51f\n" - "ld1 { v26.b 
}[2], [x26]\n" + "tbz x8, #1, 50f\n" + "ld1 { v26.h }[0], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[2], [x20]\n" "b 51f\n" "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 51f\n" - "ld1 { v26.b }[0], [x26]\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[0], [x20]\n" "51:" // Oddments: Load (3, 4): Bit 2: End - "ldr x25, [x8, #0x90]\n" "ssubl v26.8h, v26.8b, v12.8b\n" - "smlal v17.4s, v26.4h, v5.4h\n" + "ldr x23, [x12, #0x90]\n" + "smlal v23.4s, v26.4h, v5.4h\n" "smlal2 v18.4s, v26.8h, v5.8h\n" - "add x25, x25, x5\n" - "tbz x4, #2, 53f\n" - "ld1 { v25.s }[0], [x25], #0x4\n" - "tbz x4, #1, 52f\n" - "ld1 { v25.h }[2], [x25], #0x2\n" - "tbz x4, #0, 55f\n" - "ld1 { v25.b }[6], [x25]\n" + "add x23, x23, x15\n" + "tbz x8, #2, 53f\n" + "ld1 { v25.s }[0], [x23], #0x4\n" + "tbz x8, #1, 52f\n" + "ld1 { v25.h }[2], [x23], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[6], [x23]\n" "b 55f\n" "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 55f\n" - "ld1 { v25.b }[4], [x25]\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[4], [x23]\n" "b 55f\n" "53:" // Oddments: Load (4, 0): Bit 2: Unset - "tbz x4, #1, 54f\n" - "ld1 { v25.h }[0], [x25], #0x2\n" - "tbz x4, #0, 55f\n" - "ld1 { v25.b }[2], [x25]\n" + "tbz x8, #1, 54f\n" + "ld1 { v25.h }[0], [x23], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[2], [x23]\n" "b 55f\n" "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 55f\n" - "ld1 { v25.b }[0], [x25]\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[0], [x23]\n" "55:" // Oddments: Load (4, 0): Bit 2: End - "ldr x24, [x8, #0x98]\n" "ssubl v25.8h, v25.8b, v12.8b\n" - "smlal v16.4s, v25.4h, v6.4h\n" - "smlal2 v22.4s, v25.8h, v6.8h\n" - "add x24, x24, x5\n" - "tbz x4, #2, 57f\n" + "ldr x24, [x12, #0x98]\n" + "smlal v22.4s, v25.4h, v6.4h\n" + "smlal2 v21.4s, v25.8h, v6.8h\n" + "add x24, x24, x15\n" + "tbz x8, #2, 57f\n" "ld1 { v29.s }[0], [x24], #0x4\n" - "tbz x4, #1, 56f\n" + "tbz x8, #1, 56f\n" "ld1 { v29.h }[2], [x24], #0x2\n" - "tbz x4, #0, 59f\n" + "tbz x8, #0, 59f\n" "ld1 { v29.b }[6], [x24]\n" "b 59f\n" "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 59f\n" + "tbz x8, #0, 59f\n" "ld1 { v29.b }[4], [x24]\n" "b 59f\n" "57:" // Oddments: Load (2, 4): Bit 2: Unset - "tbz x4, #1, 58f\n" + "tbz x8, #1, 58f\n" "ld1 { v29.h }[0], [x24], #0x2\n" - "tbz x4, #0, 59f\n" + "tbz x8, #0, 59f\n" "ld1 { v29.b }[2], [x24]\n" "b 59f\n" "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 59f\n" + "tbz x8, #0, 59f\n" "ld1 { v29.b }[0], [x24]\n" "59:" // Oddments: Load (2, 4): Bit 2: End - "ldr x23, [x8, #0xa0]\n" "ssubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v29.4h, v8.4h\n" - "smlal2 v23.4s, v29.8h, v8.8h\n" - "add x23, x23, x5\n" - "smlal v17.4s, v29.4h, v2.4h\n" + "ldr x19, [x12, #0xa0]\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v16.4s, v29.8h, v8.8h\n" + "smlal v23.4s, v29.4h, v2.4h\n" "smlal2 v18.4s, v29.8h, v2.8h\n" - "tbz x4, #2, 61f\n" - "ld1 { v27.s }[0], [x23], #0x4\n" - "tbz x4, #1, 60f\n" - "ld1 { v27.h }[2], [x23], #0x2\n" - "tbz x4, #0, 63f\n" - "ld1 { v27.b }[6], [x23]\n" + "add x19, x19, x15\n" + "tbz x8, #2, 61f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x8, #1, 60f\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[6], [x19]\n" "b 63f\n" "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 63f\n" - "ld1 { v27.b }[4], [x23]\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[4], [x19]\n" "b 63f\n" "61:" // Oddments: Load (4, 1): Bit 2: Unset - "tbz x4, #1, 62f\n" - "ld1 { 
v27.h }[0], [x23], #0x2\n" - "tbz x4, #0, 63f\n" - "ld1 { v27.b }[2], [x23]\n" + "tbz x8, #1, 62f\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[2], [x19]\n" "b 63f\n" "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 63f\n" - "ld1 { v27.b }[0], [x23]\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[0], [x19]\n" "63:" // Oddments: Load (4, 1): Bit 2: End - "ldr x22, [x8, #0xa8]\n" "ssubl v27.8h, v27.8b, v12.8b\n" - "smlal v16.4s, v27.4h, v7.4h\n" - "smlal2 v22.4s, v27.8h, v7.8h\n" - "add x22, x22, x5\n" - "tbz x4, #2, 65f\n" + "ldr x22, [x12, #0xa8]\n" + "smlal v22.4s, v27.4h, v7.4h\n" + "smlal2 v21.4s, v27.8h, v7.8h\n" + "add x22, x22, x15\n" + "tbz x8, #2, 65f\n" "ld1 { v24.s }[0], [x22], #0x4\n" - "tbz x4, #1, 64f\n" + "tbz x8, #1, 64f\n" "ld1 { v24.h }[2], [x22], #0x2\n" - "tbz x4, #0, 67f\n" + "tbz x8, #0, 67f\n" "ld1 { v24.b }[6], [x22]\n" "b 67f\n" "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 67f\n" + "tbz x8, #0, 67f\n" "ld1 { v24.b }[4], [x22]\n" "b 67f\n" "65:" // Oddments: Load (3, 2): Bit 2: Unset - "tbz x4, #1, 66f\n" + "tbz x8, #1, 66f\n" "ld1 { v24.h }[0], [x22], #0x2\n" - "tbz x4, #0, 67f\n" + "tbz x8, #0, 67f\n" "ld1 { v24.b }[2], [x22]\n" "b 67f\n" "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 67f\n" + "tbz x8, #0, 67f\n" "ld1 { v24.b }[0], [x22]\n" "67:" // Oddments: Load (3, 2): Bit 2: End - "ldr x21, [x8, #0xb0]\n" "ssubl v24.8h, v24.8b, v12.8b\n" - "smlal v16.4s, v24.4h, v5.4h\n" - "smlal2 v22.4s, v24.8h, v5.8h\n" - "add x21, x21, x5\n" - "smlal v17.4s, v24.4h, v3.4h\n" + "ldr x21, [x12, #0xb0]\n" + "smlal v22.4s, v24.4h, v5.4h\n" + "smlal2 v21.4s, v24.8h, v5.8h\n" + "smlal v23.4s, v24.4h, v3.4h\n" "smlal2 v18.4s, v24.8h, v3.8h\n" - "tbz x4, #2, 69f\n" + "add x21, x21, x15\n" + "tbz x8, #2, 69f\n" "ld1 { v26.s }[0], [x21], #0x4\n" - "tbz x4, #1, 68f\n" + "tbz x8, #1, 68f\n" "ld1 { v26.h }[2], [x21], #0x2\n" - "tbz x4, #0, 71f\n" + "tbz x8, #0, 71f\n" "ld1 { v26.b }[6], [x21]\n" "b 71f\n" "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 71f\n" + "tbz x8, #0, 71f\n" "ld1 { v26.b }[4], [x21]\n" "b 71f\n" "69:" // Oddments: Load (4, 3): Bit 2: Unset - "tbz x4, #1, 70f\n" + "tbz x8, #1, 70f\n" "ld1 { v26.h }[0], [x21], #0x2\n" - "tbz x4, #0, 71f\n" + "tbz x8, #0, 71f\n" "ld1 { v26.b }[2], [x21]\n" "b 71f\n" "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 71f\n" + "tbz x8, #0, 71f\n" "ld1 { v26.b }[0], [x21]\n" "71:" // Oddments: Load (4, 3): Bit 2: End - "ldr x20, [x8, #0xb8]\n" "ssubl v26.8h, v26.8b, v12.8b\n" - "smlal v17.4s, v26.4h, v7.4h\n" + "ldr x20, [x12, #0xb8]\n" + "smlal v23.4s, v26.4h, v7.4h\n" "smlal2 v18.4s, v26.8h, v7.8h\n" - "add x20, x20, x5\n" - "tbz x4, #2, 73f\n" + "add x20, x20, x15\n" + "tbz x8, #2, 73f\n" "ld1 { v25.s }[0], [x20], #0x4\n" - "tbz x4, #1, 72f\n" + "tbz x8, #1, 72f\n" "ld1 { v25.h }[2], [x20], #0x2\n" - "tbz x4, #0, 75f\n" + "tbz x8, #0, 75f\n" "ld1 { v25.b }[6], [x20]\n" "b 75f\n" "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 75f\n" + "tbz x8, #0, 75f\n" "ld1 { v25.b }[4], [x20]\n" "b 75f\n" "73:" // Oddments: Load (4, 2): Bit 2: Unset - "tbz x4, #1, 74f\n" + "tbz x8, #1, 74f\n" "ld1 { v25.h }[0], [x20], #0x2\n" - "tbz x4, #0, 75f\n" + "tbz x8, #0, 75f\n" "ld1 { v25.b }[2], [x20]\n" "b 75f\n" "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 75f\n" + "tbz x8, #0, 75f\n" "ld1 { v25.b }[0], [x20]\n" "75:" // Oddments: Load (4, 2): Bit 2: End - "ldr x19, 
[x8, #0xc0]\n" "ssubl v25.8h, v25.8b, v12.8b\n" - "smlal v16.4s, v25.4h, v8.4h\n" - "smlal2 v22.4s, v25.8h, v8.8h\n" - "add x19, x19, x5\n" - "smlal v17.4s, v25.4h, v6.4h\n" + "ldr x19, [x12, #0xc0]\n" + "smlal v22.4s, v25.4h, v8.4h\n" + "smlal2 v21.4s, v25.8h, v8.8h\n" + "smlal v23.4s, v25.4h, v6.4h\n" "smlal2 v18.4s, v25.8h, v6.8h\n" - "tbz x4, #2, 77f\n" + "add x19, x19, x15\n" + "tbz x8, #2, 77f\n" "ld1 { v29.s }[0], [x19], #0x4\n" - "tbz x4, #1, 76f\n" + "tbz x8, #1, 76f\n" "ld1 { v29.h }[2], [x19], #0x2\n" - "tbz x4, #0, 79f\n" + "tbz x8, #0, 79f\n" "ld1 { v29.b }[6], [x19]\n" "b 79f\n" "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 79f\n" + "tbz x8, #0, 79f\n" "ld1 { v29.b }[4], [x19]\n" "b 79f\n" "77:" // Oddments: Load (4, 4): Bit 2: Unset - "tbz x4, #1, 78f\n" + "tbz x8, #1, 78f\n" "ld1 { v29.h }[0], [x19], #0x2\n" - "tbz x4, #0, 79f\n" + "tbz x8, #0, 79f\n" "ld1 { v29.b }[2], [x19]\n" "b 79f\n" "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 79f\n" + "tbz x8, #0, 79f\n" "ld1 { v29.b }[0], [x19]\n" "79:" // Oddments: Load (4, 4): Bit 2: End "ssubl v29.8h, v29.8b, v12.8b\n" - "smlal v17.4s, v29.4h, v8.4h\n" + "smlal v23.4s, v29.4h, v8.4h\n" "smlal2 v18.4s, v29.8h, v8.8h\n" - "tbz x4, #2, 81f\n" - "ld1 { v21.4s }, [x17], #0x10\n" - "ld1 { v30.4s }, [x15], #0x10\n" - "tbz x4, #1, 80f\n" - "ld1 { v31.d }[0], [x17], #0x8\n" - "ld1 { v9.d }[0], [x15], #0x8\n" - "tbz x4, #0, 83f\n" - "ld1 { v31.s }[2], [x17]\n" - "ld1 { v9.s }[2], [x15]\n" + "tbz x8, #2, 81f\n" + "ld1 { v19.4s }, [x13], #0x10\n" + "ld1 { v0.4s }, [x11], #0x10\n" + "tbz x8, #1, 80f\n" + "ld1 { v4.d }[0], [x13], #0x8\n" + "ld1 { v31.d }[0], [x11], #0x8\n" + "tbz x8, #0, 83f\n" + "ld1 { v4.s }[2], [x13]\n" + "ld1 { v31.s }[2], [x11]\n" "b 83f\n" "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset - "tbz x4, #0, 83f\n" - "ld1 { v31.s }[0], [x17]\n" - "ld1 { v9.s }[0], [x15]\n" + "tbz x8, #0, 83f\n" + "ld1 { v4.s }[0], [x13]\n" + "ld1 { v31.s }[0], [x11]\n" "b 83f\n" "81:" // Oddments: Load requant params: Bit 2: Unset - "tbz x4, #1, 82f\n" - "ld1 { v21.d }[0], [x17], #0x8\n" - "ld1 { v30.d }[0], [x15], #0x8\n" - "tbz x4, #0, 83f\n" - "ld1 { v21.s }[2], [x17]\n" - "ld1 { v30.s }[2], [x15]\n" + "tbz x8, #1, 82f\n" + "ld1 { v19.d }[0], [x13], #0x8\n" + "ld1 { v0.d }[0], [x11], #0x8\n" + "tbz x8, #0, 83f\n" + "ld1 { v19.s }[2], [x13]\n" + "ld1 { v0.s }[2], [x11]\n" "b 83f\n" "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 83f\n" - "ld1 { v21.s }[0], [x17]\n" - "ld1 { v30.s }[0], [x15]\n" + "tbz x8, #0, 83f\n" + "ld1 { v19.s }[0], [x13]\n" + "ld1 { v0.s }[0], [x11]\n" "83:" // Oddments: Load requant params: Bit 2: End - "sqrdmulh v15.4s, v15.4s, v21.4s\n" - "add x14, x14, x7\n" - "sqrdmulh v10.4s, v10.4s, v31.4s\n" - "add x13, x13, x7\n" - "sqrdmulh v20.4s, v20.4s, v21.4s\n" - "add x12, x12, x7\n" - "sqrdmulh v23.4s, v23.4s, v31.4s\n" - "add x11, x11, x7\n" - "sqrdmulh v16.4s, v16.4s, v21.4s\n" - "and v26.16b, v15.16b, v30.16b\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "and v8.16b, v10.16b, v9.16b\n" - "and v4.16b, v20.16b, v30.16b\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "and v2.16b, v23.16b, v9.16b\n" - "and v1.16b, v16.16b, v30.16b\n" + "sqdmulh v15.4s, v15.4s, v19.4s\n" + "sqdmulh v9.4s, v9.4s, v19.4s\n" + "add x10, x10, x14\n" + "add x9, x9, x14\n" + "sqdmulh v22.4s, v22.4s, v19.4s\n" + "sqdmulh v23.4s, v23.4s, v19.4s\n" + "add x28, x28, x14\n" + "add x27, x27, x14\n" + "and v30.16b, v15.16b, v0.16b\n" + "sqdmulh v10.4s, v10.4s, v4.4s\n" + "and 
v28.16b, v9.16b, v0.16b\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "and v29.16b, v22.16b, v0.16b\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "and v20.16b, v23.16b, v0.16b\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "and v19.16b, v10.16b, v31.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "and v4.16b, v16.16b, v31.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v5.16b, v21.16b, v31.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v26.16b, v18.16b, v31.16b\n" + "sqadd v15.4s, v15.4s, v30.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v28.4s\n" "sshr v4.4s, v4.4s, #0x1f\n" - "sqrdmulh v22.4s, v22.4s, v31.4s\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v26.4s\n" - "sqrdmulh v17.4s, v17.4s, v21.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqrdmulh v18.4s, v18.4s, v31.4s\n" - "sqadd v10.4s, v10.4s, v8.4s\n" - "sqadd v20.4s, v20.4s, v4.4s\n" - "srshl v15.4s, v15.4s, v30.4s\n" - "sqadd v23.4s, v23.4s, v2.4s\n" - "srshl v10.4s, v10.4s, v9.4s\n" - "srshl v20.4s, v20.4s, v30.4s\n" - "add v15.4s, v15.4s, v11.4s\n" - "srshl v23.4s, v23.4s, v9.4s\n" - "add v10.4s, v10.4s, v11.4s\n" - "smin v15.4s, v15.4s, v14.4s\n" - "add v20.4s, v20.4s, v11.4s\n" - "smin v10.4s, v10.4s, v14.4s\n" - "smax v15.4s, v15.4s, v19.4s\n" - "smin v20.4s, v20.4s, v14.4s\n" - "smax v10.4s, v10.4s, v19.4s\n" - "add v23.4s, v23.4s, v11.4s\n" - "smax v20.4s, v20.4s, v19.4s\n" - "uzp1 v15.16b, v15.16b, v10.16b\n" - "smin v23.4s, v23.4s, v14.4s\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v20.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v19.4s\n" + "srshl v9.4s, v9.4s, v0.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "sqadd v18.4s, v18.4s, v26.4s\n" + "srshl v10.4s, v10.4s, v31.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v16.4s, v16.4s, v31.4s\n" + "sqxtn v9.4h, v9.4s\n" + "srshl v21.4s, v21.4s, v31.4s\n" + "sqxtn v22.4h, v22.4s\n" + "srshl v18.4s, v18.4s, v31.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v15.8h, v10.4s\n" + "sqxtn2 v9.8h, v16.4s\n" + "sqxtn2 v22.8h, v21.4s\n" + "sqxtn2 v23.8h, v18.4s\n" + "sqadd v15.8h, v15.8h, v11.8h\n" + "sqadd v9.8h, v9.8h, v11.8h\n" + "sqadd v22.8h, v22.8h, v11.8h\n" + "sqadd v23.8h, v23.8h, v11.8h\n" + "smax v15.8h, v15.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smax v22.8h, v22.8h, v17.8h\n" + "smax v23.8h, v23.8h, v17.8h\n" + "smin v15.8h, v15.8h, v14.8h\n" + "smin v9.8h, v9.8h, v14.8h\n" + "smin v22.8h, v22.8h, v14.8h\n" + "smin v23.8h, v23.8h, v14.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" - "sqadd v16.4s, v16.4s, v1.4s\n" - "smax v23.4s, v23.4s, v19.4s\n" - "and v24.16b, v22.16b, v9.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "uzp1 v20.16b, v20.16b, v23.16b\n" - "srshl v16.4s, v16.4s, v30.4s\n" - "and v2.16b, v17.16b, v30.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "uzp1 v20.16b, v20.16b, v20.16b\n" - "add v16.4s, v16.4s, v11.4s\n" - "sqadd v22.4s, v22.4s, v24.4s\n" - "and v31.16b, v18.16b, v9.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "smin v16.4s, v16.4s, v14.4s\n" - "srshl v22.4s, v22.4s, v9.4s\n" - "sqadd v17.4s, v17.4s, v2.4s\n" - "smax v16.4s, v16.4s, v19.4s\n" - "add v22.4s, v22.4s, v11.4s\n" - "srshl v17.4s, v17.4s, v30.4s\n" - "sqadd v18.4s, v18.4s, v31.4s\n" - "smin v22.4s, v22.4s, v14.4s\n" - "add v17.4s, v17.4s, v11.4s\n" - "srshl v18.4s, v18.4s, v9.4s\n" - "smax v22.4s, v22.4s, v19.4s\n" - "smin v17.4s, v17.4s, v14.4s\n" - "uzp1 v16.16b, v16.16b, 
v22.16b\n" - "add v18.4s, v18.4s, v11.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "smax v17.4s, v17.4s, v19.4s\n" - "smin v18.4s, v18.4s, v14.4s\n" - "smax v18.4s, v18.4s, v19.4s\n" - "uzp1 v17.16b, v17.16b, v18.16b\n" - "uzp1 v17.16b, v17.16b, v17.16b\n" - "tbz x4, #2, 85f\n" - "st1 { v15.s }[0], [x14], #0x4\n" - "st1 { v20.s }[0], [x13], #0x4\n" - "st1 { v16.s }[0], [x12], #0x4\n" - "st1 { v17.s }[0], [x11], #0x4\n" - "tbz x4, #1, 84f\n" - "st1 { v15.h }[2], [x14], #0x2\n" - "st1 { v20.h }[2], [x13], #0x2\n" - "st1 { v16.h }[2], [x12], #0x2\n" - "st1 { v17.h }[2], [x11], #0x2\n" - "tbz x4, #0, 87f\n" - "st1 { v15.b }[6], [x14], #0x1\n" - "st1 { v20.b }[6], [x13], #0x1\n" - "st1 { v16.b }[6], [x12], #0x1\n" - "st1 { v17.b }[6], [x11], #0x1\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "tbz x8, #2, 85f\n" + "st1 { v15.s }[0], [x10], #0x4\n" + "st1 { v9.s }[0], [x9], #0x4\n" + "st1 { v22.s }[0], [x28], #0x4\n" + "st1 { v23.s }[0], [x27], #0x4\n" + "tbz x8, #1, 84f\n" + "st1 { v15.h }[2], [x10], #0x2\n" + "st1 { v9.h }[2], [x9], #0x2\n" + "st1 { v22.h }[2], [x28], #0x2\n" + "st1 { v23.h }[2], [x27], #0x2\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[6], [x10], #0x1\n" + "st1 { v9.b }[6], [x9], #0x1\n" + "st1 { v22.b }[6], [x28], #0x1\n" + "st1 { v23.b }[6], [x27], #0x1\n" "b 87f\n" "84:" // Oddments: Bit 2: Bit 1: Unset - "tbz x4, #0, 87f\n" - "st1 { v15.b }[4], [x14], #0x1\n" - "st1 { v20.b }[4], [x13], #0x1\n" - "st1 { v16.b }[4], [x12], #0x1\n" - "st1 { v17.b }[4], [x11], #0x1\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[4], [x10], #0x1\n" + "st1 { v9.b }[4], [x9], #0x1\n" + "st1 { v22.b }[4], [x28], #0x1\n" + "st1 { v23.b }[4], [x27], #0x1\n" "b 87f\n" "85:" // Oddments: Bit 2: Unset - "tbz x4, #1, 86f\n" - "st1 { v15.h }[0], [x14], #0x2\n" - "st1 { v20.h }[0], [x13], #0x2\n" - "st1 { v16.h }[0], [x12], #0x2\n" - "st1 { v17.h }[0], [x11], #0x2\n" - "tbz x4, #0, 87f\n" - "st1 { v15.b }[2], [x14], #0x1\n" - "st1 { v20.b }[2], [x13], #0x1\n" - "st1 { v16.b }[2], [x12], #0x1\n" - "st1 { v17.b }[2], [x11], #0x1\n" + "tbz x8, #1, 86f\n" + "st1 { v15.h }[0], [x10], #0x2\n" + "st1 { v9.h }[0], [x9], #0x2\n" + "st1 { v22.h }[0], [x28], #0x2\n" + "st1 { v23.h }[0], [x27], #0x2\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[2], [x10], #0x1\n" + "st1 { v9.b }[2], [x9], #0x1\n" + "st1 { v22.b }[2], [x28], #0x1\n" + "st1 { v23.b }[2], [x27], #0x1\n" "b 87f\n" "86:" // Oddments: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 87f\n" - "st1 { v15.b }[0], [x14], #0x1\n" - "st1 { v20.b }[0], [x13], #0x1\n" - "st1 { v16.b }[0], [x12], #0x1\n" - "st1 { v17.b }[0], [x11], #0x1\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[0], [x10], #0x1\n" + "st1 { v9.b }[0], [x9], #0x1\n" + "st1 { v22.b }[0], [x28], #0x1\n" + "st1 { v23.b }[0], [x27], #0x1\n" "87:" // Oddments: Bit 2: End - "88:" // End - : : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), 
[offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
-    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
   );
 }
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index a998fa16d6..52031e18da 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,37 +36,24 @@ namespace depthwise {
 
 void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
 
-struct a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst
+class a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
 {
-  typedef int32_t bias_type;
-  typedef int8_t input_type;
-  typedef int8_t weight_type;
-  typedef int8_t return_type;
-
-  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
-  typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *);
-  typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t);
-  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+  using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
 
+  public:
   constexpr static unsigned int kernel_rows = 5;
   constexpr static unsigned int kernel_cols = 5;
 
   constexpr static unsigned int stride_rows = 1;
   constexpr static unsigned int stride_cols = 1;
 
-  constexpr static unsigned int output_rows = 2;
-  constexpr static unsigned int output_cols = 2;
-
-  constexpr static unsigned int input_rows = 6;
-  constexpr static unsigned int input_cols = 6;
-
-  constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_5x5_mla::pack_parameters;
-  constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_5x5_mla::get_packed_size;
+  a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
 
-  kern_type kernel = a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
 
-  a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+  Parent::KernelType kernel = 
a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp index ab64f53f66..bd71b65908 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -46,7 +46,7 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( struct Params { long unsigned int n_channels; - const int8_t *weights; + const void *weights; const int32_t *bias; const arm_gemm::Requantize32 *requant; const int32_t *const requant_muls; @@ -57,7 +57,7 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( Params( long unsigned int n_channels, const int8_t *const *inptrs_raw, - const int8_t *const weights, + const void *const weights, const int32_t *const bias, const arm_gemm::Requantize32 &qp, const int32_t *const requant_muls, @@ -111,2096 +111,2070 @@ void a64_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( requant_muls, requant_shifts, outptrs); __asm__ __volatile__( - "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n" - "mov x10, #0x0\n" - "ldr x3, [%x[params], %[offsetof_Params_weights]]\n" - "mov x1, #0x0\n" - "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" - "add x25, %x[params], %[offsetof_Params_inptrs]\n" - "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n" - "lsr x19, x4, #0x3\n" - "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n" - "add x13, x22, %[offsetof_Requantize32_a_offset]\n" - "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" - "add x20, x22, %[offsetof_Requantize32_b_offset]\n" - "ld1r { v7.16b }, [x13]\n" - "add x8, x22, %[offsetof_Requantize32_c_offset]\n" - "ld1r { v13.16b }, [x20]\n" - "add x20, x22, %[offsetof_Requantize32_minval]\n" - "ld1r { v19.4s }, [x8]\n" - "add x8, x22, %[offsetof_Requantize32_maxval]\n" - "ld1r { v16.4s }, [x20]\n" - "ld1r { v12.4s }, [x8]\n" - "ldp x17, x16, [x21, #0x0]\n" - "ldp x6, x8, [x21, #0x10]\n" - "cbz x19, 3f\n" - "subs x19, x19, #0x1\n" - "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" - "ldr q15, [x12, #0x0]\n" - "mov v18.16b, v15.16b\n" - "ldr q20, [x12, #0x10]\n" - "add x12, x12, #0x20\n" - "mov v11.16b, v15.16b\n" - "str x12, [%x[params], %[offsetof_Params_bias]]\n" + "ldr x10, [%x[params], %[offsetof_Params_requant]]\n" + "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n" + "add x17, x10, %[offsetof_Requantize32_a_offset]\n" + "add x9, x10, %[offsetof_Requantize32_b_offset]\n" + "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x4, x10, %[offsetof_Requantize32_c_offset]\n" + "add x14, x10, %[offsetof_Requantize32_minval]\n" + "ldr x23, [%x[params], %[offsetof_Params_weights]]\n" + "add x5, x10, %[offsetof_Requantize32_maxval]\n" + "ld1r { v9.16b }, [x17]\n" + "ld1r { v14.16b }, [x9]\n" + "lsr x3, x0, #0x3\n" + "ld1r { v18.8h }, [x4]\n" + "ld1r { v11.8h }, [x14]\n" + "mov x24, #0x0\n" + "mov x22, #0x0\n" + "ld1r { v13.8h }, [x5]\n" + "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n" + "add x20, %x[params], 
%[offsetof_Params_inptrs]\n" + "ldr x1, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "ldp x16, x8, [x25, #0x0]\n" + "ldp x4, x7, [x25, #0x10]\n" + "cbz x3, 3f\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x19, #0x0]\n" + "subs x3, x3, #0x1\n" + "mov v17.16b, v15.16b\n" + "ldr q16, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr d0, [x23, #0x0]\n" + "ldr d1, [x23, #0x8]\n" + "ldr d2, [x23, #0x10]\n" + "mov v8.16b, v16.16b\n" "mov v10.16b, v15.16b\n" - "ldr d0, [x3, #0x0]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" - "mov v5.16b, v20.16b\n" - "ldr d1, [x3, #0x8]\n" - "mov v8.16b, v20.16b\n" - "ldr d2, [x3, #0x10]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" - "mov v9.16b, v20.16b\n" - "ldr d3, [x3, #0x18]\n" - "ldr d4, [x3, #0x20]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" - "ldp x28, x27, [x25, #0x0]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" - "ldp x26, x13, [x25, #0x10]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" - "ldp x24, x23, [x25, #0x20]\n" - "ldp x22, x21, [x25, #0x30]\n" - "ldp x20, x0, [x25, #0x40]\n" - "ldr d31, [x28, x10]\n" - "ssubl v31.8h, v31.8b, v7.8b\n" - "ldr d30, [x27, x10]\n" - "ldr d29, [x26, x10]\n" - "ssubl v30.8h, v30.8b, v7.8b\n" - "ldr d28, [x13, x10]\n" - "ldr d27, [x24, x10]\n" - "ssubl v29.8h, v29.8b, v7.8b\n" - "ldr d23, [x23, x10]\n" - "ssubl v28.8h, v28.8b, v7.8b\n" - "ldr d25, [x22, x10]\n" - "ldr d24, [x21, x10]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "ldr d26, [x20, x10]\n" - "ssubl v23.8h, v23.8b, v7.8b\n" - "ldr d22, [x0, x10]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "ssubl v26.8h, v26.8b, v7.8b\n" - "ssubl v22.8h, v22.8b, v7.8b\n" + "ldr d3, [x23, #0x18]\n" + "ldr d4, [x23, #0x20]\n" + "mov v7.16b, v16.16b\n" + "mov v6.16b, v15.16b\n" + "ldp x28, x6, [x20, #0x0]\n" + "ldp x26, x25, [x20, #0x10]\n" + "mov v5.16b, v16.16b\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "ldp x5, x2, [x20, #0x20]\n" + "ldp x27, x21, [x20, #0x30]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "ldp x12, x19, [x20, #0x40]\n" + "ldr d31, [x28, x24]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "ldr d30, [x6, x24]\n" + "ldr d29, [x26, x24]\n" + "ssubl v31.8h, v31.8b, v9.8b\n" + "ssubl v30.8h, v30.8b, v9.8b\n" + "ldr d28, [x25, x24]\n" + "ldr d27, [x5, x24]\n" + "ssubl v29.8h, v29.8b, v9.8b\n" + "ssubl v28.8h, v28.8b, v9.8b\n" + "ldr d23, [x2, x24]\n" + "ldr d25, [x27, x24]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "ssubl v23.8h, v23.8b, v9.8b\n" + "ldr d24, [x21, x24]\n" + "ldr d26, [x12, x24]\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "ldr d22, [x19, x24]\n" + "ssubl v26.8h, v26.8b, v9.8b\n" + "ssubl v22.8h, v22.8b, v9.8b\n" "beq 2f\n" "1:" // Loop "smlal v15.4s, v31.4h, v0.4h\n" - "ldr x20, [x25, #0x50]\n" - "subs x19, x19, #0x1\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ldr x28, [x25, #0x58]\n" - "smlal v18.4s, v30.4h, v0.4h\n" - "ldr x0, [x25, #0x60]\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "ldr d31, [x20, x10]\n" - "ssubl v31.8h, v31.8b, v7.8b\n" - "smlal v11.4s, v29.4h, v0.4h\n" - "ldr x7, [x25, #0x68]\n" - "smlal2 v8.4s, v29.8h, v0.8h\n" - "ldr x26, [x25, #0x70]\n" - "smlal v10.4s, v28.4h, v0.4h\n" - "ldr x23, [x25, #0x78]\n" - "smlal2 v9.4s, v28.8h, v0.8h\n" - "ldr d0, [x3, #0x28]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "ldr x19, [x20, #0x50]\n" + "ldr d31, [x19, x24]\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal v10.4s, v29.4h, v0.4h\n" + "ldr x15, [x20, #0x58]\n" + "ssubl v31.8h, v31.8b, v9.8b\n" + 
"smlal v6.4s, v28.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "ldr x19, [x20, #0x60]\n" + "ldr x27, [x20, #0x68]\n" + "smlal2 v7.4s, v29.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "ldr x20, [x25, #0x80]\n" - "smlal2 v20.4s, v30.8h, v1.8h\n" - "ldr d30, [x28, x10]\n" - "ssubl v30.8h, v30.8b, v7.8b\n" - "smlal v18.4s, v27.4h, v1.4h\n" - "ldr x22, [x25, #0x88]\n" - "smlal2 v5.4s, v27.8h, v1.8h\n" - "ldr x13, [x25, #0x90]\n" - "smlal v11.4s, v28.4h, v1.4h\n" - "ldr x21, [x25, #0x98]\n" - "smlal2 v8.4s, v28.8h, v1.8h\n" - "ldr x14, [x25, #0xa0]\n" - "smlal v10.4s, v23.4h, v1.4h\n" - "ldr x11, [x25, #0xa8]\n" - "smlal2 v9.4s, v23.8h, v1.8h\n" - "ldr d1, [x3, #0x30]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" + "ldr x5, [x20, #0x70]\n" + "ldr x11, [x20, #0x78]\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "ldr d30, [x15, x24]\n" + "ssubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v1.4h\n" + "smlal v10.4s, v28.4h, v1.4h\n" + "ldr d0, [x23, #0x28]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v27.8h, v1.8h\n" + "ldr x12, [x20, #0x80]\n" + "ldr x26, [x20, #0x88]\n" + "smlal2 v7.4s, v28.8h, v1.8h\n" "smlal v15.4s, v27.4h, v2.4h\n" - "ldr x24, [x25, #0xb0]\n" - "smlal2 v20.4s, v27.8h, v2.8h\n" - "ldr d27, [x0, x10]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal v18.4s, v25.4h, v2.4h\n" - "ldr x0, [x25, #0xb8]\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "ldr x15, [x25, #0xc0]\n" - "smlal v11.4s, v23.4h, v2.4h\n" - "ldr x9, [x25, #0xc8]\n" - "smlal2 v8.4s, v23.8h, v2.8h\n" - "ldr x27, [x25, #0xd0]\n" - "smlal v10.4s, v31.4h, v2.4h\n" - "ldr x28, [x25, #0xd8]\n" - "smlal2 v9.4s, v31.8h, v2.8h\n" - "ldr d2, [x3, #0x38]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" + "ldr x14, [x20, #0x90]\n" + "ldr x15, [x20, #0x98]\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "ldr d27, [x19, x24]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal v10.4s, v23.4h, v2.4h\n" + "ldr d1, [x23, #0x30]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "ldr x21, [x20, #0xa0]\n" + "ldr x2, [x20, #0xa8]\n" + "smlal2 v7.4s, v23.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" - "ldr q6, [x2, #0x0]\n" - "smlal2 v20.4s, v25.8h, v3.8h\n" - "ldr d25, [x7, x10]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "ldr x12, [x25, #0xe0]\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "ldr q21, [x5, #0x0]\n" - "smlal v11.4s, v31.4h, v3.4h\n" - "ldr q17, [x2, #0x10]\n" - "add x2, x2, #0x20\n" - "smlal2 v8.4s, v31.8h, v3.8h\n" - "ldr q14, [x5, #0x10]\n" - "add x5, x5, #0x20\n" - "smlal v10.4s, v30.4h, v3.4h\n" - "smlal2 v9.4s, v30.8h, v3.8h\n" - "ldr d3, [x3, #0x40]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" + "ldr x13, [x20, #0xb0]\n" + "ldr x9, [x20, #0xb8]\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "ldr d25, [x27, x24]\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v10.4s, v31.4h, v3.4h\n" + "ldr d2, [x23, #0x38]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "ldr x19, [x20, #0xc0]\n" + "ldr x28, [x20, #0xc8]\n" + "smlal2 v7.4s, v31.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v20.4s, v24.8h, v4.8h\n" - "ldr d24, [x26, x10]\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v18.4s, v27.4h, v4.4h\n" - "ldr x7, [x25, #0xe8]\n" - "smlal2 v5.4s, v27.8h, v4.8h\n" - "ldr d27, [x23, x10]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal v11.4s, v30.4h, 
v4.4h\n" - "ldr x26, [x25, #0xf0]\n" - "smlal2 v8.4s, v30.8h, v4.8h\n" - "smlal v10.4s, v26.4h, v4.4h\n" - "smlal2 v9.4s, v26.8h, v4.8h\n" - "ldr d4, [x3, #0x48]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" + "ldr x6, [x20, #0xd0]\n" + "ldr x27, [x20, #0xd8]\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "ldr d24, [x5, x24]\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal v10.4s, v30.4h, v4.4h\n" + "ldr d3, [x23, #0x40]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v27.8h, v4.8h\n" + "ldr d27, [x11, x24]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v7.4s, v30.8h, v4.8h\n" "smlal v15.4s, v29.4h, v0.4h\n" - "smlal2 v20.4s, v29.8h, v0.8h\n" - "smlal v18.4s, v28.4h, v0.4h\n" - "smlal2 v5.4s, v28.8h, v0.8h\n" - "smlal v11.4s, v22.4h, v0.4h\n" - "smlal2 v8.4s, v22.8h, v0.8h\n" - "smlal v10.4s, v25.4h, v0.4h\n" - "smlal2 v9.4s, v25.8h, v0.8h\n" - "ldr d0, [x3, #0x50]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" + "ldr x11, [x20, #0xe0]\n" + "ldr x17, [x20, #0xe8]\n" + "smlal2 v16.4s, v29.8h, v0.8h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x23, #0x48]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal v10.4s, v22.4h, v0.4h\n" + "ldr x5, [x20, #0xf0]\n" + "ldr q12, [x10, #0x0]\n" + "smlal v6.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v28.8h, v0.8h\n" + "ldr q19, [x1, #0x0]\n" + "ldr q20, [x10, #0x10]\n" + "smlal2 v7.4s, v22.8h, v0.8h\n" "smlal v15.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "ldr d28, [x22, x10]\n" - "ssubl v28.8h, v28.8b, v7.8b\n" - "smlal v18.4s, v23.4h, v1.4h\n" - "ldr x23, [x25, #0xf8]\n" - "smlal2 v5.4s, v23.8h, v1.8h\n" - "smlal v11.4s, v25.4h, v1.4h\n" - "smlal2 v8.4s, v25.8h, v1.8h\n" - "smlal v10.4s, v24.4h, v1.4h\n" - "smlal2 v9.4s, v24.8h, v1.8h\n" - "ldr d1, [x3, #0x58]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" + "ldr q29, [x1, #0x10]\n" + "subs x3, x3, #0x1\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "ldr d28, [x26, x24]\n" + "ldr d0, [x23, #0x50]\n" + "smlal v17.4s, v23.4h, v1.4h\n" + "smlal v10.4s, v25.4h, v1.4h\n" + "ssubl v28.8h, v28.8b, v9.8b\n" + "ldr x25, [x20, #0xf8]\n" + "smlal v6.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "add x10, x10, #0x20\n" + "smlal2 v7.4s, v25.8h, v1.8h\n" "smlal v15.4s, v23.4h, v2.4h\n" - "smlal2 v20.4s, v23.8h, v2.8h\n" - "ldr d23, [x20, x10]\n" - "ssubl v23.8h, v23.8b, v7.8b\n" - "smlal v18.4s, v31.4h, v2.4h\n" - "ldr x22, [x25, #0x100]\n" - "smlal2 v5.4s, v31.8h, v2.8h\n" - "smlal v11.4s, v24.4h, v2.4h\n" - "smlal2 v8.4s, v24.8h, v2.8h\n" - "smlal v10.4s, v27.4h, v2.4h\n" - "smlal2 v9.4s, v27.8h, v2.8h\n" - "ldr d2, [x3, #0x60]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" + "add x1, x1, #0x20\n" + "smlal2 v16.4s, v23.8h, v2.8h\n" + "ldr d23, [x12, x24]\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "ssubl v23.8h, v23.8b, v9.8b\n" + "smlal v17.4s, v31.4h, v2.4h\n" + "smlal v10.4s, v24.4h, v2.4h\n" + "ldr d1, [x23, #0x58]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "ldr x26, [x20, #0x100]\n" + "smlal2 v7.4s, v24.8h, v2.8h\n" "smlal v15.4s, v31.4h, v3.4h\n" - "smlal2 v20.4s, v31.8h, v3.8h\n" - "ldr d31, [x13, x10]\n" - "ssubl v31.8h, v31.8b, v7.8b\n" - "smlal v18.4s, v30.4h, v3.4h\n" - "ldr x20, [x25, #0x108]\n" - "smlal2 v5.4s, v30.8h, v3.8h\n" - "smlal v11.4s, v27.4h, v3.4h\n" - "smlal2 v8.4s, v27.8h, v3.8h\n" - "smlal v10.4s, v23.4h, v3.4h\n" - "smlal2 v9.4s, v23.8h, v3.8h\n" - "ldr d3, [x3, 
#0x68]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v16.4s, v31.8h, v3.8h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "ldr d31, [x14, x24]\n" + "ssubl v31.8h, v31.8b, v9.8b\n" + "smlal v17.4s, v30.4h, v3.4h\n" + "smlal v10.4s, v27.4h, v3.4h\n" + "ldr d2, [x23, #0x60]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "ldr x12, [x20, #0x108]\n" + "smlal2 v7.4s, v27.8h, v3.8h\n" "smlal v15.4s, v30.4h, v4.4h\n" - "smlal2 v20.4s, v30.8h, v4.8h\n" - "ldr d30, [x21, x10]\n" - "ssubl v30.8h, v30.8b, v7.8b\n" - "smlal v18.4s, v26.4h, v4.4h\n" - "ldr x13, [x25, #0x110]\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "ldr d26, [x14, x10]\n" - "ssubl v26.8h, v26.8b, v7.8b\n" - "smlal v11.4s, v23.4h, v4.4h\n" - "ldr x21, [x25, #0x118]\n" - "smlal2 v8.4s, v23.8h, v4.8h\n" - "smlal v10.4s, v28.4h, v4.4h\n" - "smlal2 v9.4s, v28.8h, v4.8h\n" - "ldr d4, [x3, #0x70]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v16.4s, v30.8h, v4.8h\n" + "ldr d30, [x15, x24]\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "ssubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "smlal v10.4s, v23.4h, v4.4h\n" + "ldr d3, [x23, #0x68]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "ldr d26, [x21, x24]\n" + "ssubl v26.8h, v26.8b, v9.8b\n" + "smlal2 v7.4s, v23.8h, v4.8h\n" "smlal v15.4s, v22.4h, v0.4h\n" - "smlal2 v20.4s, v22.8h, v0.8h\n" - "ldr d22, [x0, x10]\n" - "ssubl v22.8h, v22.8b, v7.8b\n" - "smlal v18.4s, v25.4h, v0.4h\n" - "smlal2 v5.4s, v25.8h, v0.8h\n" - "smlal v11.4s, v31.4h, v0.4h\n" - "smlal2 v8.4s, v31.8h, v0.8h\n" - "smlal v10.4s, v30.4h, v0.4h\n" - "smlal2 v9.4s, v30.8h, v0.8h\n" - "ldr d0, [x3, #0x78]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" + "ldr x14, [x20, #0x110]\n" + "ldr x21, [x20, #0x118]\n" + "smlal2 v16.4s, v22.8h, v0.8h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ldr d4, [x23, #0x70]\n" + "ldr d22, [x9, x24]\n" + "smlal v17.4s, v25.4h, v0.4h\n" + "smlal v10.4s, v31.4h, v0.4h\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "ssubl v22.8h, v22.8b, v9.8b\n" + "smlal2 v7.4s, v31.8h, v0.8h\n" "smlal v15.4s, v25.4h, v1.4h\n" - "smlal2 v20.4s, v25.8h, v1.8h\n" - "ldr d25, [x11, x10]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal v18.4s, v24.4h, v1.4h\n" - "smlal2 v5.4s, v24.8h, v1.8h\n" - "smlal v11.4s, v30.4h, v1.4h\n" - "smlal2 v8.4s, v30.8h, v1.8h\n" - "smlal v10.4s, v26.4h, v1.4h\n" - "smlal2 v9.4s, v26.8h, v1.8h\n" - "ldr d1, [x3, #0x80]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v16.4s, v25.8h, v1.8h\n" + "ldr d25, [x2, x24]\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v1.4h\n" + "smlal v10.4s, v30.4h, v1.4h\n" + "ldr d0, [x23, #0x78]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "smlal2 v7.4s, v30.8h, v1.8h\n" "smlal v15.4s, v24.4h, v2.4h\n" - "smlal2 v20.4s, v24.8h, v2.8h\n" - "ldr d24, [x24, x10]\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v18.4s, v27.4h, v2.4h\n" - "smlal2 v5.4s, v27.8h, v2.8h\n" - "smlal v11.4s, v26.4h, v2.4h\n" - "smlal2 v8.4s, v26.8h, v2.8h\n" - "smlal v10.4s, v25.4h, v2.4h\n" - "smlal2 v9.4s, v25.8h, v2.8h\n" - "ldr d2, [x3, #0x88]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v16.4s, v24.8h, v2.8h\n" + "ldr d24, [x13, x24]\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v2.4h\n" + "smlal v10.4s, v26.4h, v2.4h\n" + "ldr d1, [x23, #0x80]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" 
+ "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "smlal2 v7.4s, v26.8h, v2.8h\n" "smlal v15.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" - "ldr d27, [x15, x10]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal v18.4s, v23.4h, v3.4h\n" - "smlal2 v5.4s, v23.8h, v3.8h\n" - "smlal v11.4s, v25.4h, v3.4h\n" - "smlal2 v8.4s, v25.8h, v3.8h\n" - "smlal v10.4s, v24.4h, v3.4h\n" - "smlal2 v9.4s, v24.8h, v3.8h\n" - "ldr d3, [x3, #0x90]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ldr d27, [x19, x24]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "smlal v17.4s, v23.4h, v3.4h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "ldr d2, [x23, #0x88]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v8.4s, v23.8h, v3.8h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" "smlal v15.4s, v23.4h, v4.4h\n" - "smlal2 v20.4s, v23.8h, v4.8h\n" - "ldr d23, [x9, x10]\n" - "ssubl v23.8h, v23.8b, v7.8b\n" - "smlal v18.4s, v28.4h, v4.4h\n" - "smlal2 v5.4s, v28.8h, v4.8h\n" - "ldr d28, [x12, x10]\n" - "ssubl v28.8h, v28.8b, v7.8b\n" - "smlal v11.4s, v24.4h, v4.4h\n" - "smlal2 v8.4s, v24.8h, v4.8h\n" - "smlal v10.4s, v22.4h, v4.4h\n" - "smlal2 v9.4s, v22.8h, v4.8h\n" - "ldr d4, [x3, #0x98]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v16.4s, v23.8h, v4.8h\n" + "ldr d23, [x28, x24]\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "ssubl v23.8h, v23.8b, v9.8b\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "ldr d3, [x23, #0x90]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v22.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "ldr d28, [x11, x24]\n" + "ssubl v28.8h, v28.8b, v9.8b\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" "smlal v15.4s, v31.4h, v0.4h\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ldr d31, [x27, x10]\n" - "ssubl v31.8h, v31.8b, v7.8b\n" - "smlal v18.4s, v30.4h, v0.4h\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "smlal v11.4s, v27.4h, v0.4h\n" - "smlal2 v8.4s, v27.8h, v0.8h\n" - "smlal v10.4s, v23.4h, v0.4h\n" - "smlal2 v9.4s, v23.8h, v0.8h\n" - "ldr d0, [x3, #0xa0]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "ldr d31, [x6, x24]\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "ssubl v31.8h, v31.8b, v9.8b\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal v10.4s, v27.4h, v0.4h\n" + "ldr d4, [x23, #0x98]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "smlal2 v7.4s, v27.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "smlal2 v20.4s, v30.8h, v1.8h\n" - "ldr d30, [x28, x10]\n" - "ssubl v30.8h, v30.8b, v7.8b\n" - "smlal v18.4s, v26.4h, v1.4h\n" - "smlal2 v5.4s, v26.8h, v1.8h\n" - "smlal v11.4s, v23.4h, v1.4h\n" - "smlal2 v8.4s, v23.8h, v1.8h\n" - "smlal v10.4s, v31.4h, v1.4h\n" - "smlal2 v9.4s, v31.8h, v1.8h\n" - "ldr d1, [x3, #0xa8]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "ldr d30, [x27, x24]\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "ssubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v26.4h, v1.4h\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "ldr d0, [x23, #0xa0]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v31.4h, v1.4h\n" + "smlal2 v8.4s, v26.8h, v1.8h\n" + "smlal2 v7.4s, v23.8h, v1.8h\n" "smlal v15.4s, v26.4h, v2.4h\n" - "smlal2 v20.4s, v26.8h, v2.8h\n" - "ldr d26, [x7, x10]\n" - "ssubl v26.8h, v26.8b, v7.8b\n" - "smlal v18.4s, v25.4h, v2.4h\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "smlal v11.4s, v31.4h, v2.4h\n" - "smlal2 v8.4s, v31.8h, v2.8h\n" - "smlal v10.4s, v30.4h, v2.4h\n" - "smlal2 v9.4s, v30.8h, v2.8h\n" - "ldr d2, 
[x3, #0xb0]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v16.4s, v26.8h, v2.8h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "ldr d26, [x17, x24]\n" + "ssubl v26.8h, v26.8b, v9.8b\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "ldr d1, [x23, #0xa8]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "smlal2 v7.4s, v31.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" - "smlal2 v20.4s, v25.8h, v3.8h\n" - "ldr d25, [x26, x10]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "smlal v11.4s, v30.4h, v3.4h\n" - "smlal2 v8.4s, v30.8h, v3.8h\n" - "smlal v10.4s, v28.4h, v3.4h\n" - "smlal2 v9.4s, v28.8h, v3.8h\n" - "ldr d3, [x3, #0xb8]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "ldr d25, [x5, x24]\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "ldr d2, [x23, #0xb0]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v28.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "smlal2 v7.4s, v30.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v20.4s, v24.8h, v4.8h\n" - "ldr d24, [x23, x10]\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v18.4s, v22.4h, v4.4h\n" - "smlal2 v5.4s, v22.8h, v4.8h\n" - "smlal v11.4s, v28.4h, v4.4h\n" - "smlal2 v8.4s, v28.8h, v4.8h\n" - "smlal v10.4s, v26.4h, v4.4h\n" - "smlal2 v9.4s, v26.8h, v4.8h\n" - "ldr d4, [x3, #0xc0]\n" - "add x3, x3, #0xc8\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "ldr d24, [x25, x24]\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v22.4h, v4.4h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "ldr d3, [x23, #0xb8]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v7.4s, v28.8h, v4.8h\n" "smlal v15.4s, v27.4h, v0.4h\n" - "ssubl v4.8h, v4.8b, v13.8b\n" - "smlal2 v20.4s, v27.8h, v0.8h\n" - "ldr d27, [x22, x10]\n" - "smlal v18.4s, v23.4h, v0.4h\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal2 v5.4s, v23.8h, v0.8h\n" - "smlal v11.4s, v25.4h, v0.4h\n" - "smlal2 v8.4s, v25.8h, v0.8h\n" - "ldr d25, [x20, x10]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal v10.4s, v24.4h, v0.4h\n" - "smlal2 v9.4s, v24.8h, v0.8h\n" + "smlal2 v16.4s, v27.8h, v0.8h\n" + "ldr d27, [x26, x24]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v8.4s, v22.8h, v4.8h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x23, #0xc0]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v17.4s, v23.4h, v0.4h\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "add x23, x23, #0xc8\n" + "smlal v6.4s, v24.4h, v0.4h\n" + "smlal2 v7.4s, v25.8h, v0.8h\n" + "ldr d25, [x12, x24]\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "smlal2 v8.4s, v23.8h, v0.8h\n" + "smlal2 v5.4s, v24.8h, v0.8h\n" "smlal v15.4s, v23.4h, v1.4h\n" - "smlal2 v20.4s, v23.8h, v1.8h\n" - "smlal v18.4s, v31.4h, v1.4h\n" - "smlal2 v5.4s, v31.8h, v1.8h\n" - "smlal v11.4s, v24.4h, v1.4h\n" - "smlal2 v8.4s, v24.8h, v1.8h\n" - "ldr d24, [x13, x10]\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v27.4h, v1.4h\n" - "smlal2 v9.4s, v27.8h, v1.8h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal v6.4s, v27.4h, v1.4h\n" + "smlal2 v7.4s, v24.8h, v1.8h\n" + "ldr d24, [x14, x24]\n" + "smlal2 v16.4s, v23.8h, v1.8h\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "smlal2 v8.4s, v31.8h, v1.8h\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" "smlal v15.4s, v31.4h, v2.4h\n" - "smlal2 v20.4s, v31.8h, v2.8h\n" - "smlal v18.4s, v30.4h, v2.4h\n" - "smlal2 v5.4s, v30.8h, 
v2.8h\n" - "smlal v11.4s, v27.4h, v2.4h\n" - "smlal2 v8.4s, v27.8h, v2.8h\n" - "ldr d27, [x21, x10]\n" - "add x10, x10, #0x8\n" - "smlal v10.4s, v25.4h, v2.4h\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal2 v9.4s, v25.8h, v2.8h\n" + "smlal v17.4s, v30.4h, v2.4h\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v7.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x24]\n" + "smlal2 v16.4s, v31.8h, v2.8h\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v8.4s, v30.8h, v2.8h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "add x24, x24, #0x8\n" "smlal v15.4s, v30.4h, v3.4h\n" - "smlal2 v20.4s, v30.8h, v3.8h\n" - "smlal v18.4s, v28.4h, v3.4h\n" - "smlal2 v5.4s, v28.8h, v3.8h\n" - "smlal v11.4s, v25.4h, v3.4h\n" - "smlal2 v8.4s, v25.8h, v3.8h\n" - "smlal v10.4s, v24.4h, v3.4h\n" - "smlal2 v9.4s, v24.8h, v3.8h\n" + "smlal v17.4s, v28.4h, v3.4h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v16.4s, v30.8h, v3.8h\n" + "smlal2 v8.4s, v28.8h, v3.8h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" "smlal v15.4s, v28.4h, v4.4h\n" - "smlal2 v20.4s, v28.8h, v4.8h\n" - "smlal v18.4s, v26.4h, v4.4h\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "smlal v11.4s, v24.4h, v4.4h\n" - "smlal2 v8.4s, v24.8h, v4.8h\n" - "smlal v10.4s, v27.4h, v4.4h\n" - "smlal2 v9.4s, v27.8h, v4.8h\n" - "sqrdmulh v15.4s, v15.4s, v6.4s\n" - "sqrdmulh v20.4s, v20.4s, v17.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v5.4s, v5.4s, v17.4s\n" - "and v1.16b, v15.16b, v21.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v29.16b, v20.16b, v14.16b\n" - "and v3.16b, v18.16b, v21.16b\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "and v2.16b, v5.16b, v14.16b\n" - "sqrdmulh v11.4s, v11.4s, v6.4s\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqrdmulh v8.4s, v8.4s, v17.4s\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "sqdmulh v15.4s, v15.4s, v12.4s\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "smlal v6.4s, v27.4h, v4.4h\n" + "sqdmulh v17.4s, v17.4s, v12.4s\n" + "smlal2 v16.4s, v28.8h, v4.8h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "sqdmulh v10.4s, v10.4s, v12.4s\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "sqdmulh v6.4s, v6.4s, v12.4s\n" + "and v23.16b, v15.16b, v19.16b\n" + "sqdmulh v16.4s, v16.4s, v20.4s\n" + "and v22.16b, v17.16b, v19.16b\n" + "sqdmulh v8.4s, v8.4s, v20.4s\n" + "and v21.16b, v10.16b, v19.16b\n" + "sqdmulh v7.4s, v7.4s, v20.4s\n" + "and v26.16b, v6.16b, v19.16b\n" + "sqdmulh v5.4s, v5.4s, v20.4s\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "and v4.16b, v16.16b, v29.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v2.16b, v8.16b, v29.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v3.16b, v7.16b, v29.16b\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "and v25.16b, v5.16b, v29.16b\n" + "sqadd v15.4s, v15.4s, v23.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v22.4s\n" "sshr v2.4s, v2.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v1.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "and v0.16b, v11.16b, v21.16b\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "srshl v15.4s, v15.4s, v21.4s\n" - "sqadd v20.4s, v20.4s, v29.4s\n" - "sqadd v18.4s, v18.4s, v3.4s\n" - "sqadd v5.4s, v5.4s, v2.4s\n" - "and v27.16b, v8.16b, v14.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "add v15.4s, v15.4s, v19.4s\n" - "srshl v20.4s, v20.4s, v14.4s\n" - "srshl v18.4s, v18.4s, v21.4s\n" - "srshl v5.4s, v5.4s, v14.4s\n" - "smin v15.4s, v15.4s, v12.4s\n" - "add v20.4s, v20.4s, v19.4s\n" - "add v18.4s, v18.4s, v19.4s\n" - "smax v15.4s, v15.4s, v16.4s\n" - "smin v20.4s, v20.4s, v12.4s\n" - "smin v18.4s, v18.4s, v12.4s\n" - "add v5.4s, 
v5.4s, v19.4s\n" - "smax v20.4s, v20.4s, v16.4s\n" - "smax v18.4s, v18.4s, v16.4s\n" - "smin v5.4s, v5.4s, v12.4s\n" - "uzp1 v15.16b, v15.16b, v20.16b\n" - "sqadd v11.4s, v11.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v21.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqadd v6.4s, v6.4s, v26.4s\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v19.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v17.4s, v17.4s, v19.4s\n" + "sqadd v8.4s, v8.4s, v2.4s\n" + "srshl v10.4s, v10.4s, v19.4s\n" + "sqadd v7.4s, v7.4s, v3.4s\n" + "srshl v6.4s, v6.4s, v19.4s\n" + "sqadd v5.4s, v5.4s, v25.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v8.4s, v8.4s, v29.4s\n" + "sqxtn v17.4h, v17.4s\n" + "srshl v7.4s, v7.4s, v29.4s\n" + "sqxtn v10.4h, v10.4s\n" + "srshl v5.4s, v5.4s, v29.4s\n" + "sqxtn v6.4h, v6.4s\n" + "sqxtn2 v15.8h, v16.4s\n" + "sqxtn2 v17.8h, v8.4s\n" + "sqxtn2 v10.8h, v7.4s\n" + "sqxtn2 v6.8h, v5.4s\n" + "sqadd v15.8h, v15.8h, v18.8h\n" + "sqadd v17.8h, v17.8h, v18.8h\n" + "sqadd v10.8h, v10.8h, v18.8h\n" + "sqadd v6.8h, v6.8h, v18.8h\n" + "smax v15.8h, v15.8h, v11.8h\n" + "smax v17.8h, v17.8h, v11.8h\n" + "smax v10.8h, v10.8h, v11.8h\n" + "smax v6.8h, v6.8h, v11.8h\n" + "smin v15.8h, v15.8h, v13.8h\n" + "smin v17.8h, v17.8h, v13.8h\n" + "smin v10.8h, v10.8h, v13.8h\n" + "smin v6.8h, v6.8h, v13.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" - "str d15, [x17, x1]\n" - "smax v5.4s, v5.4s, v16.4s\n" - "sqadd v8.4s, v8.4s, v27.4s\n" - "srshl v11.4s, v11.4s, v21.4s\n" - "and v30.16b, v10.16b, v21.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "uzp1 v18.16b, v18.16b, v5.16b\n" - "add v11.4s, v11.4s, v19.4s\n" - "srshl v8.4s, v8.4s, v14.4s\n" - "uzp1 v18.16b, v18.16b, v18.16b\n" - "str d18, [x16, x1]\n" - "smin v11.4s, v11.4s, v12.4s\n" - "sqrdmulh v9.4s, v9.4s, v17.4s\n" - "add v8.4s, v8.4s, v19.4s\n" - "sqadd v10.4s, v10.4s, v30.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" - "smin v8.4s, v8.4s, v12.4s\n" - "and v6.16b, v9.16b, v14.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "smax v8.4s, v8.4s, v16.4s\n" - "srshl v10.4s, v10.4s, v21.4s\n" - "uzp1 v11.16b, v11.16b, v8.16b\n" - "add v10.4s, v10.4s, v19.4s\n" - "uzp1 v11.16b, v11.16b, v11.16b\n" - "str d11, [x6, x1]\n" - "smin v10.4s, v10.4s, v12.4s\n" - "sqadd v9.4s, v9.4s, v6.4s\n" - "smax v10.4s, v10.4s, v16.4s\n" - "srshl v9.4s, v9.4s, v14.4s\n" - "add v9.4s, v9.4s, v19.4s\n" - "smin v9.4s, v9.4s, v12.4s\n" - "smax v9.4s, v9.4s, v16.4s\n" - "uzp1 v10.16b, v10.16b, v9.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d15, [x16, x22]\n" "uzp1 v10.16b, v10.16b, v10.16b\n" - "str d10, [x8, x1]\n" - "add x1, x1, #0x8\n" - "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" - "ldr q15, [x12, #0x0]\n" - "mov v18.16b, v15.16b\n" - "ldr q20, [x12, #0x10]\n" - "add x12, x12, #0x20\n" - "mov v11.16b, v15.16b\n" - "str x12, [%x[params], %[offsetof_Params_bias]]\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str d17, [x8, x22]\n" + "str d10, [x4, x22]\n" + "str d6, [x7, x22]\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x19, #0x0]\n" + "add x22, x22, #0x8\n" + "ldr q16, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr d0, [x23, #0x0]\n" + "ldr d1, [x23, #0x8]\n" + "ldr d2, [x23, #0x10]\n" + "mov v17.16b, v15.16b\n" + "mov v8.16b, v16.16b\n" + "ldr d3, [x23, #0x18]\n" + "ldr d4, [x23, #0x20]\n" "mov v10.16b, v15.16b\n" - "ldr d0, [x3, #0x0]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" - "mov v5.16b, v20.16b\n" - "ldr d1, [x3, #0x8]\n" - "mov v8.16b, v20.16b\n" - "ldr d2, [x3, #0x10]\n" - "ssubl 
v1.8h, v1.8b, v13.8b\n" - "mov v9.16b, v20.16b\n" - "ldr d3, [x3, #0x18]\n" - "ldr d4, [x3, #0x20]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" - "ldp x28, x27, [x25, #0x0]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" - "ldp x26, x13, [x25, #0x10]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" - "ldp x24, x23, [x25, #0x20]\n" - "ldp x22, x21, [x25, #0x30]\n" - "ldp x20, x0, [x25, #0x40]\n" - "ldr d31, [x28, x10]\n" - "ssubl v31.8h, v31.8b, v7.8b\n" - "ldr d30, [x27, x10]\n" - "ldr d29, [x26, x10]\n" - "ssubl v30.8h, v30.8b, v7.8b\n" - "ldr d28, [x13, x10]\n" - "ldr d27, [x24, x10]\n" - "ssubl v29.8h, v29.8b, v7.8b\n" - "ldr d23, [x23, x10]\n" - "ssubl v28.8h, v28.8b, v7.8b\n" - "ldr d25, [x22, x10]\n" - "ldr d24, [x21, x10]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "ldr d26, [x20, x10]\n" - "ssubl v23.8h, v23.8b, v7.8b\n" - "ldr d22, [x0, x10]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "ssubl v26.8h, v26.8b, v7.8b\n" - "ssubl v22.8h, v22.8b, v7.8b\n" + "mov v7.16b, v16.16b\n" + "ldp x28, x6, [x20, #0x0]\n" + "ldp x26, x25, [x20, #0x10]\n" + "mov v6.16b, v15.16b\n" + "mov v5.16b, v16.16b\n" + "ldp x5, x2, [x20, #0x20]\n" + "ldp x27, x21, [x20, #0x30]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "ldp x12, x19, [x20, #0x40]\n" + "ldr d31, [x28, x24]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "ldr d30, [x6, x24]\n" + "ldr d29, [x26, x24]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "ssubl v31.8h, v31.8b, v9.8b\n" + "ldr d28, [x25, x24]\n" + "ldr d27, [x5, x24]\n" + "ssubl v30.8h, v30.8b, v9.8b\n" + "ssubl v29.8h, v29.8b, v9.8b\n" + "ldr d23, [x2, x24]\n" + "ldr d25, [x27, x24]\n" + "ssubl v28.8h, v28.8b, v9.8b\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "ldr d24, [x21, x24]\n" + "ldr d26, [x12, x24]\n" + "ssubl v23.8h, v23.8b, v9.8b\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "ldr d22, [x19, x24]\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "ssubl v26.8h, v26.8b, v9.8b\n" + "ssubl v22.8h, v22.8b, v9.8b\n" "bgt 1b\n" "2:" // Tail "smlal v15.4s, v31.4h, v0.4h\n" - "ldr x20, [x25, #0x50]\n" - "tst x4, #0x7\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ldr x28, [x25, #0x58]\n" - "smlal v18.4s, v30.4h, v0.4h\n" - "ldr x0, [x25, #0x60]\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "ldr d31, [x20, x10]\n" - "ssubl v31.8h, v31.8b, v7.8b\n" - "smlal v11.4s, v29.4h, v0.4h\n" - "ldr x7, [x25, #0x68]\n" - "smlal2 v8.4s, v29.8h, v0.8h\n" - "ldr x26, [x25, #0x70]\n" - "smlal v10.4s, v28.4h, v0.4h\n" - "ldr x23, [x25, #0x78]\n" - "smlal2 v9.4s, v28.8h, v0.8h\n" - "ldr d0, [x3, #0x28]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "ldr x19, [x20, #0x50]\n" + "ldr d31, [x19, x24]\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal v10.4s, v29.4h, v0.4h\n" + "ldr x15, [x20, #0x58]\n" + "ssubl v31.8h, v31.8b, v9.8b\n" + "smlal v6.4s, v28.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "ldr x19, [x20, #0x60]\n" + "ldr x27, [x20, #0x68]\n" + "smlal2 v7.4s, v29.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "ldr x20, [x25, #0x80]\n" - "smlal2 v20.4s, v30.8h, v1.8h\n" - "ldr d30, [x28, x10]\n" - "ssubl v30.8h, v30.8b, v7.8b\n" - "smlal v18.4s, v27.4h, v1.4h\n" - "ldr x22, [x25, #0x88]\n" - "smlal2 v5.4s, v27.8h, v1.8h\n" - "ldr x13, [x25, #0x90]\n" - "smlal v11.4s, v28.4h, v1.4h\n" - "ldr x21, [x25, #0x98]\n" - "smlal2 v8.4s, v28.8h, v1.8h\n" - "ldr x14, [x25, #0xa0]\n" - "smlal v10.4s, v23.4h, v1.4h\n" - "ldr x11, [x25, #0xa8]\n" - "smlal2 v9.4s, v23.8h, v1.8h\n" - "ldr d1, [x3, #0x30]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" + "ldr x5, [x20, #0x70]\n" + "ldr x11, [x20, #0x78]\n" + 
"smlal2 v16.4s, v30.8h, v1.8h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "ldr d30, [x15, x24]\n" + "ssubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v1.4h\n" + "smlal v10.4s, v28.4h, v1.4h\n" + "ldr d0, [x23, #0x28]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v27.8h, v1.8h\n" + "ldr x12, [x20, #0x80]\n" + "ldr x26, [x20, #0x88]\n" + "smlal2 v7.4s, v28.8h, v1.8h\n" "smlal v15.4s, v27.4h, v2.4h\n" - "ldr x24, [x25, #0xb0]\n" - "smlal2 v20.4s, v27.8h, v2.8h\n" - "ldr d27, [x0, x10]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal v18.4s, v25.4h, v2.4h\n" - "ldr x0, [x25, #0xb8]\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "ldr x15, [x25, #0xc0]\n" - "smlal v11.4s, v23.4h, v2.4h\n" - "ldr x9, [x25, #0xc8]\n" - "smlal2 v8.4s, v23.8h, v2.8h\n" - "ldr x27, [x25, #0xd0]\n" - "smlal v10.4s, v31.4h, v2.4h\n" - "ldr x28, [x25, #0xd8]\n" - "smlal2 v9.4s, v31.8h, v2.8h\n" - "ldr d2, [x3, #0x38]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" + "ldr x14, [x20, #0x90]\n" + "ldr x15, [x20, #0x98]\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "ldr d27, [x19, x24]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal v10.4s, v23.4h, v2.4h\n" + "ldr d1, [x23, #0x30]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "ldr x21, [x20, #0xa0]\n" + "ldr x2, [x20, #0xa8]\n" + "smlal2 v7.4s, v23.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" - "ldr x12, [x25, #0xe0]\n" - "smlal2 v20.4s, v25.8h, v3.8h\n" - "ldr d25, [x7, x10]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "ldr x7, [x25, #0xe8]\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "ldr q6, [x2, #0x0]\n" - "smlal v11.4s, v31.4h, v3.4h\n" - "ldr q21, [x5, #0x0]\n" - "smlal2 v8.4s, v31.8h, v3.8h\n" - "ldr q17, [x2, #0x10]\n" - "add x2, x2, #0x20\n" - "smlal v10.4s, v30.4h, v3.4h\n" - "ldr q14, [x5, #0x10]\n" - "add x5, x5, #0x20\n" - "smlal2 v9.4s, v30.8h, v3.8h\n" - "ldr d3, [x3, #0x40]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" + "ldr x13, [x20, #0xb0]\n" + "ldr x9, [x20, #0xb8]\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "ldr d25, [x27, x24]\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v10.4s, v31.4h, v3.4h\n" + "ldr d2, [x23, #0x38]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "ldr x19, [x20, #0xc0]\n" + "ldr x28, [x20, #0xc8]\n" + "smlal2 v7.4s, v31.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v20.4s, v24.8h, v4.8h\n" - "ldr d24, [x26, x10]\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v18.4s, v27.4h, v4.4h\n" - "ldr x26, [x25, #0xf0]\n" - "smlal2 v5.4s, v27.8h, v4.8h\n" - "ldr d27, [x23, x10]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal v11.4s, v30.4h, v4.4h\n" - "ldr x23, [x25, #0xf8]\n" - "smlal2 v8.4s, v30.8h, v4.8h\n" - "smlal v10.4s, v26.4h, v4.4h\n" - "smlal2 v9.4s, v26.8h, v4.8h\n" - "ldr d4, [x3, #0x48]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" + "ldr x6, [x20, #0xd0]\n" + "ldr x27, [x20, #0xd8]\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "ldr d24, [x5, x24]\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal v10.4s, v30.4h, v4.4h\n" + "ldr d3, [x23, #0x40]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v27.8h, v4.8h\n" + "ldr d27, [x11, x24]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v7.4s, v30.8h, v4.8h\n" "smlal v15.4s, v29.4h, v0.4h\n" - "smlal2 v20.4s, v29.8h, v0.8h\n" - 
"smlal v18.4s, v28.4h, v0.4h\n" - "smlal2 v5.4s, v28.8h, v0.8h\n" - "smlal v11.4s, v22.4h, v0.4h\n" - "smlal2 v8.4s, v22.8h, v0.8h\n" - "smlal v10.4s, v25.4h, v0.4h\n" - "smlal2 v9.4s, v25.8h, v0.8h\n" - "ldr d0, [x3, #0x50]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" + "ldr x11, [x20, #0xe0]\n" + "ldr x17, [x20, #0xe8]\n" + "smlal2 v16.4s, v29.8h, v0.8h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x23, #0x48]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal v10.4s, v22.4h, v0.4h\n" + "ldr x5, [x20, #0xf0]\n" + "ldr x25, [x20, #0xf8]\n" + "smlal v6.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v28.8h, v0.8h\n" + "ldr q12, [x10, #0x0]\n" + "ldr q19, [x1, #0x0]\n" + "smlal2 v7.4s, v22.8h, v0.8h\n" "smlal v15.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "ldr d28, [x22, x10]\n" - "ssubl v28.8h, v28.8b, v7.8b\n" - "smlal v18.4s, v23.4h, v1.4h\n" - "ldr x22, [x25, #0x100]\n" - "smlal2 v5.4s, v23.8h, v1.8h\n" - "smlal v11.4s, v25.4h, v1.4h\n" - "smlal2 v8.4s, v25.8h, v1.8h\n" - "smlal v10.4s, v24.4h, v1.4h\n" - "smlal2 v9.4s, v24.8h, v1.8h\n" - "ldr d1, [x3, #0x58]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" + "ldr q20, [x10, #0x10]\n" + "ldr q29, [x1, #0x10]\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "ldr d28, [x26, x24]\n" + "ldr d0, [x23, #0x50]\n" + "smlal v17.4s, v23.4h, v1.4h\n" + "smlal v10.4s, v25.4h, v1.4h\n" + "ssubl v28.8h, v28.8b, v9.8b\n" + "ldr x26, [x20, #0x100]\n" + "smlal v6.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "tst x0, #0x7\n" + "smlal2 v7.4s, v25.8h, v1.8h\n" "smlal v15.4s, v23.4h, v2.4h\n" - "smlal2 v20.4s, v23.8h, v2.8h\n" - "ldr d23, [x20, x10]\n" - "ssubl v23.8h, v23.8b, v7.8b\n" - "smlal v18.4s, v31.4h, v2.4h\n" - "ldr x20, [x25, #0x108]\n" - "smlal2 v5.4s, v31.8h, v2.8h\n" - "smlal v11.4s, v24.4h, v2.4h\n" - "smlal2 v8.4s, v24.8h, v2.8h\n" - "smlal v10.4s, v27.4h, v2.4h\n" - "smlal2 v9.4s, v27.8h, v2.8h\n" - "ldr d2, [x3, #0x60]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" + "add x10, x10, #0x20\n" + "add x1, x1, #0x20\n" + "smlal2 v16.4s, v23.8h, v2.8h\n" + "ldr d23, [x12, x24]\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "ssubl v23.8h, v23.8b, v9.8b\n" + "smlal v17.4s, v31.4h, v2.4h\n" + "smlal v10.4s, v24.4h, v2.4h\n" + "ldr d1, [x23, #0x58]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "ldr x12, [x20, #0x108]\n" + "smlal2 v7.4s, v24.8h, v2.8h\n" "smlal v15.4s, v31.4h, v3.4h\n" - "smlal2 v20.4s, v31.8h, v3.8h\n" - "ldr d31, [x13, x10]\n" - "ssubl v31.8h, v31.8b, v7.8b\n" - "smlal v18.4s, v30.4h, v3.4h\n" - "ldr x13, [x25, #0x110]\n" - "smlal2 v5.4s, v30.8h, v3.8h\n" - "smlal v11.4s, v27.4h, v3.4h\n" - "smlal2 v8.4s, v27.8h, v3.8h\n" - "smlal v10.4s, v23.4h, v3.4h\n" - "smlal2 v9.4s, v23.8h, v3.8h\n" - "ldr d3, [x3, #0x68]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v16.4s, v31.8h, v3.8h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "ldr d31, [x14, x24]\n" + "ssubl v31.8h, v31.8b, v9.8b\n" + "smlal v17.4s, v30.4h, v3.4h\n" + "smlal v10.4s, v27.4h, v3.4h\n" + "ldr d2, [x23, #0x60]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "ldr x14, [x20, #0x110]\n" + "smlal2 v7.4s, v27.8h, v3.8h\n" "smlal v15.4s, v30.4h, v4.4h\n" - "smlal2 v20.4s, v30.8h, v4.8h\n" - "ldr d30, [x21, x10]\n" - "ssubl v30.8h, v30.8b, v7.8b\n" - "smlal v18.4s, v26.4h, v4.4h\n" - "ldr x21, [x25, #0x118]\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "ldr d26, [x14, x10]\n" - "ssubl v26.8h, v26.8b, v7.8b\n" - 
"smlal v11.4s, v23.4h, v4.4h\n" - "smlal2 v8.4s, v23.8h, v4.8h\n" - "smlal v10.4s, v28.4h, v4.4h\n" - "smlal2 v9.4s, v28.8h, v4.8h\n" - "ldr d4, [x3, #0x70]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v16.4s, v30.8h, v4.8h\n" + "ldr d30, [x15, x24]\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "ssubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "smlal v10.4s, v23.4h, v4.4h\n" + "ldr d3, [x23, #0x68]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "ldr d26, [x21, x24]\n" + "ssubl v26.8h, v26.8b, v9.8b\n" + "smlal2 v7.4s, v23.8h, v4.8h\n" "smlal v15.4s, v22.4h, v0.4h\n" - "smlal2 v20.4s, v22.8h, v0.8h\n" - "ldr d22, [x0, x10]\n" - "ssubl v22.8h, v22.8b, v7.8b\n" - "smlal v18.4s, v25.4h, v0.4h\n" - "smlal2 v5.4s, v25.8h, v0.8h\n" - "smlal v11.4s, v31.4h, v0.4h\n" - "smlal2 v8.4s, v31.8h, v0.8h\n" - "smlal v10.4s, v30.4h, v0.4h\n" - "smlal2 v9.4s, v30.8h, v0.8h\n" - "ldr d0, [x3, #0x78]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" + "ldr x21, [x20, #0x118]\n" + "smlal2 v16.4s, v22.8h, v0.8h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ldr d4, [x23, #0x70]\n" + "ldr d22, [x9, x24]\n" + "smlal v17.4s, v25.4h, v0.4h\n" + "smlal v10.4s, v31.4h, v0.4h\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "ssubl v22.8h, v22.8b, v9.8b\n" + "smlal2 v7.4s, v31.8h, v0.8h\n" "smlal v15.4s, v25.4h, v1.4h\n" - "smlal2 v20.4s, v25.8h, v1.8h\n" - "ldr d25, [x11, x10]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal v18.4s, v24.4h, v1.4h\n" - "smlal2 v5.4s, v24.8h, v1.8h\n" - "smlal v11.4s, v30.4h, v1.4h\n" - "smlal2 v8.4s, v30.8h, v1.8h\n" - "smlal v10.4s, v26.4h, v1.4h\n" - "smlal2 v9.4s, v26.8h, v1.8h\n" - "ldr d1, [x3, #0x80]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v16.4s, v25.8h, v1.8h\n" + "ldr d25, [x2, x24]\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v1.4h\n" + "smlal v10.4s, v30.4h, v1.4h\n" + "ldr d0, [x23, #0x78]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "smlal2 v7.4s, v30.8h, v1.8h\n" "smlal v15.4s, v24.4h, v2.4h\n" - "smlal2 v20.4s, v24.8h, v2.8h\n" - "ldr d24, [x24, x10]\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v18.4s, v27.4h, v2.4h\n" - "smlal2 v5.4s, v27.8h, v2.8h\n" - "smlal v11.4s, v26.4h, v2.4h\n" - "smlal2 v8.4s, v26.8h, v2.8h\n" - "smlal v10.4s, v25.4h, v2.4h\n" - "smlal2 v9.4s, v25.8h, v2.8h\n" - "ldr d2, [x3, #0x88]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v16.4s, v24.8h, v2.8h\n" + "ldr d24, [x13, x24]\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v2.4h\n" + "smlal v10.4s, v26.4h, v2.4h\n" + "ldr d1, [x23, #0x80]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "smlal2 v7.4s, v26.8h, v2.8h\n" "smlal v15.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" - "ldr d27, [x15, x10]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal v18.4s, v23.4h, v3.4h\n" - "smlal2 v5.4s, v23.8h, v3.8h\n" - "smlal v11.4s, v25.4h, v3.4h\n" - "smlal2 v8.4s, v25.8h, v3.8h\n" - "smlal v10.4s, v24.4h, v3.4h\n" - "smlal2 v9.4s, v24.8h, v3.8h\n" - "ldr d3, [x3, #0x90]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ldr d27, [x19, x24]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "smlal v17.4s, v23.4h, v3.4h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "ldr d2, [x23, #0x88]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v24.4h, 
v3.4h\n" + "smlal2 v8.4s, v23.8h, v3.8h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" "smlal v15.4s, v23.4h, v4.4h\n" - "smlal2 v20.4s, v23.8h, v4.8h\n" - "ldr d23, [x9, x10]\n" - "ssubl v23.8h, v23.8b, v7.8b\n" - "smlal v18.4s, v28.4h, v4.4h\n" - "smlal2 v5.4s, v28.8h, v4.8h\n" - "ldr d28, [x12, x10]\n" - "ssubl v28.8h, v28.8b, v7.8b\n" - "smlal v11.4s, v24.4h, v4.4h\n" - "smlal2 v8.4s, v24.8h, v4.8h\n" - "smlal v10.4s, v22.4h, v4.4h\n" - "smlal2 v9.4s, v22.8h, v4.8h\n" - "ldr d4, [x3, #0x98]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v16.4s, v23.8h, v4.8h\n" + "ldr d23, [x28, x24]\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "ssubl v23.8h, v23.8b, v9.8b\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "ldr d3, [x23, #0x90]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v22.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "ldr d28, [x11, x24]\n" + "ssubl v28.8h, v28.8b, v9.8b\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" "smlal v15.4s, v31.4h, v0.4h\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ldr d31, [x27, x10]\n" - "ssubl v31.8h, v31.8b, v7.8b\n" - "smlal v18.4s, v30.4h, v0.4h\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "smlal v11.4s, v27.4h, v0.4h\n" - "smlal2 v8.4s, v27.8h, v0.8h\n" - "smlal v10.4s, v23.4h, v0.4h\n" - "smlal2 v9.4s, v23.8h, v0.8h\n" - "ldr d0, [x3, #0xa0]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "ldr d31, [x6, x24]\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "ssubl v31.8h, v31.8b, v9.8b\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal v10.4s, v27.4h, v0.4h\n" + "ldr d4, [x23, #0x98]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "smlal2 v7.4s, v27.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "smlal2 v20.4s, v30.8h, v1.8h\n" - "ldr d30, [x28, x10]\n" - "ssubl v30.8h, v30.8b, v7.8b\n" - "smlal v18.4s, v26.4h, v1.4h\n" - "smlal2 v5.4s, v26.8h, v1.8h\n" - "smlal v11.4s, v23.4h, v1.4h\n" - "smlal2 v8.4s, v23.8h, v1.8h\n" - "smlal v10.4s, v31.4h, v1.4h\n" - "smlal2 v9.4s, v31.8h, v1.8h\n" - "ldr d1, [x3, #0xa8]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "ldr d30, [x27, x24]\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "ssubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v26.4h, v1.4h\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "ldr d0, [x23, #0xa0]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v31.4h, v1.4h\n" + "smlal2 v8.4s, v26.8h, v1.8h\n" + "smlal2 v7.4s, v23.8h, v1.8h\n" "smlal v15.4s, v26.4h, v2.4h\n" - "smlal2 v20.4s, v26.8h, v2.8h\n" - "ldr d26, [x7, x10]\n" - "ssubl v26.8h, v26.8b, v7.8b\n" - "smlal v18.4s, v25.4h, v2.4h\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "smlal v11.4s, v31.4h, v2.4h\n" - "smlal2 v8.4s, v31.8h, v2.8h\n" - "smlal v10.4s, v30.4h, v2.4h\n" - "smlal2 v9.4s, v30.8h, v2.8h\n" - "ldr d2, [x3, #0xb0]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v16.4s, v26.8h, v2.8h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "ldr d26, [x17, x24]\n" + "ssubl v26.8h, v26.8b, v9.8b\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "ldr d1, [x23, #0xa8]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "smlal2 v7.4s, v31.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" - "smlal2 v20.4s, v25.8h, v3.8h\n" - "ldr d25, [x26, x10]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "smlal v11.4s, v30.4h, v3.4h\n" - "smlal2 v8.4s, v30.8h, v3.8h\n" - "smlal v10.4s, v28.4h, v3.4h\n" - "smlal2 v9.4s, v28.8h, v3.8h\n" - "ldr d3, [x3, #0xb8]\n" - "ssubl 
v3.8h, v3.8b, v13.8b\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "ldr d25, [x5, x24]\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "ldr d2, [x23, #0xb0]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v28.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "smlal2 v7.4s, v30.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v20.4s, v24.8h, v4.8h\n" - "ldr d24, [x23, x10]\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v18.4s, v22.4h, v4.4h\n" - "smlal2 v5.4s, v22.8h, v4.8h\n" - "smlal v11.4s, v28.4h, v4.4h\n" - "smlal2 v8.4s, v28.8h, v4.8h\n" - "smlal v10.4s, v26.4h, v4.4h\n" - "smlal2 v9.4s, v26.8h, v4.8h\n" - "ldr d4, [x3, #0xc0]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "ldr d24, [x25, x24]\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v22.4h, v4.4h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "ldr d3, [x23, #0xb8]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v7.4s, v28.8h, v4.8h\n" "smlal v15.4s, v27.4h, v0.4h\n" - "smlal2 v20.4s, v27.8h, v0.8h\n" - "ldr d27, [x22, x10]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal v18.4s, v23.4h, v0.4h\n" - "smlal2 v5.4s, v23.8h, v0.8h\n" - "smlal v11.4s, v25.4h, v0.4h\n" - "smlal2 v8.4s, v25.8h, v0.8h\n" - "ldr d25, [x20, x10]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal v10.4s, v24.4h, v0.4h\n" - "smlal2 v9.4s, v24.8h, v0.8h\n" + "smlal2 v16.4s, v27.8h, v0.8h\n" + "ldr d27, [x26, x24]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v8.4s, v22.8h, v4.8h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x23, #0xc0]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v17.4s, v23.4h, v0.4h\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "smlal v6.4s, v24.4h, v0.4h\n" + "smlal2 v7.4s, v25.8h, v0.8h\n" + "ldr d25, [x12, x24]\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "smlal2 v8.4s, v23.8h, v0.8h\n" + "smlal2 v5.4s, v24.8h, v0.8h\n" "smlal v15.4s, v23.4h, v1.4h\n" - "smlal2 v20.4s, v23.8h, v1.8h\n" - "smlal v18.4s, v31.4h, v1.4h\n" - "smlal2 v5.4s, v31.8h, v1.8h\n" - "smlal v11.4s, v24.4h, v1.4h\n" - "smlal2 v8.4s, v24.8h, v1.8h\n" - "ldr d24, [x13, x10]\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v27.4h, v1.4h\n" - "smlal2 v9.4s, v27.8h, v1.8h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal v6.4s, v27.4h, v1.4h\n" + "smlal2 v7.4s, v24.8h, v1.8h\n" + "ldr d24, [x14, x24]\n" + "smlal2 v16.4s, v23.8h, v1.8h\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "smlal2 v8.4s, v31.8h, v1.8h\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" "smlal v15.4s, v31.4h, v2.4h\n" - "smlal2 v20.4s, v31.8h, v2.8h\n" - "smlal v18.4s, v30.4h, v2.4h\n" - "smlal2 v5.4s, v30.8h, v2.8h\n" - "smlal v11.4s, v27.4h, v2.4h\n" - "smlal2 v8.4s, v27.8h, v2.8h\n" - "ldr d27, [x21, x10]\n" - "add x10, x10, #0x8\n" - "smlal v10.4s, v25.4h, v2.4h\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal2 v9.4s, v25.8h, v2.8h\n" + "smlal v17.4s, v30.4h, v2.4h\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v7.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x24]\n" + "smlal2 v16.4s, v31.8h, v2.8h\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v8.4s, v30.8h, v2.8h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "add x24, x24, #0x8\n" "smlal v15.4s, v30.4h, v3.4h\n" - "smlal2 v20.4s, v30.8h, v3.8h\n" - "smlal v18.4s, v28.4h, v3.4h\n" - "smlal2 v5.4s, v28.8h, v3.8h\n" - "smlal v11.4s, v25.4h, v3.4h\n" - "smlal2 v8.4s, v25.8h, v3.8h\n" - "smlal v10.4s, v24.4h, v3.4h\n" - "smlal2 v9.4s, v24.8h, v3.8h\n" + 
"smlal v17.4s, v28.4h, v3.4h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v16.4s, v30.8h, v3.8h\n" + "smlal2 v8.4s, v28.8h, v3.8h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" "smlal v15.4s, v28.4h, v4.4h\n" - "smlal2 v20.4s, v28.8h, v4.8h\n" - "smlal v18.4s, v26.4h, v4.4h\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "smlal v11.4s, v24.4h, v4.4h\n" - "smlal2 v8.4s, v24.8h, v4.8h\n" - "smlal v10.4s, v27.4h, v4.4h\n" - "smlal2 v9.4s, v27.8h, v4.8h\n" - "sqrdmulh v15.4s, v15.4s, v6.4s\n" - "sqrdmulh v20.4s, v20.4s, v17.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v5.4s, v5.4s, v17.4s\n" - "and v1.16b, v15.16b, v21.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v29.16b, v20.16b, v14.16b\n" - "and v3.16b, v18.16b, v21.16b\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "and v2.16b, v5.16b, v14.16b\n" - "sqrdmulh v11.4s, v11.4s, v6.4s\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqrdmulh v8.4s, v8.4s, v17.4s\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "sqdmulh v15.4s, v15.4s, v12.4s\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "smlal v6.4s, v27.4h, v4.4h\n" + "sqdmulh v17.4s, v17.4s, v12.4s\n" + "smlal2 v16.4s, v28.8h, v4.8h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "sqdmulh v10.4s, v10.4s, v12.4s\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "sqdmulh v6.4s, v6.4s, v12.4s\n" + "and v23.16b, v15.16b, v19.16b\n" + "sqdmulh v16.4s, v16.4s, v20.4s\n" + "and v22.16b, v17.16b, v19.16b\n" + "sqdmulh v8.4s, v8.4s, v20.4s\n" + "and v21.16b, v10.16b, v19.16b\n" + "sqdmulh v7.4s, v7.4s, v20.4s\n" + "and v26.16b, v6.16b, v19.16b\n" + "sqdmulh v5.4s, v5.4s, v20.4s\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "and v4.16b, v16.16b, v29.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v2.16b, v8.16b, v29.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v3.16b, v7.16b, v29.16b\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "and v25.16b, v5.16b, v29.16b\n" + "sqadd v15.4s, v15.4s, v23.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v22.4s\n" "sshr v2.4s, v2.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v1.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "and v0.16b, v11.16b, v21.16b\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "srshl v15.4s, v15.4s, v21.4s\n" - "sqadd v20.4s, v20.4s, v29.4s\n" - "sqadd v18.4s, v18.4s, v3.4s\n" - "sqadd v5.4s, v5.4s, v2.4s\n" - "and v27.16b, v8.16b, v14.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "add v15.4s, v15.4s, v19.4s\n" - "srshl v20.4s, v20.4s, v14.4s\n" - "srshl v18.4s, v18.4s, v21.4s\n" - "srshl v5.4s, v5.4s, v14.4s\n" - "smin v15.4s, v15.4s, v12.4s\n" - "add v20.4s, v20.4s, v19.4s\n" - "add v18.4s, v18.4s, v19.4s\n" - "smax v15.4s, v15.4s, v16.4s\n" - "smin v20.4s, v20.4s, v12.4s\n" - "smin v18.4s, v18.4s, v12.4s\n" - "add v5.4s, v5.4s, v19.4s\n" - "smax v20.4s, v20.4s, v16.4s\n" - "smax v18.4s, v18.4s, v16.4s\n" - "smin v5.4s, v5.4s, v12.4s\n" - "uzp1 v15.16b, v15.16b, v20.16b\n" - "sqadd v11.4s, v11.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v21.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqadd v6.4s, v6.4s, v26.4s\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v19.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v17.4s, v17.4s, v19.4s\n" + "sqadd v8.4s, v8.4s, v2.4s\n" + "srshl v10.4s, v10.4s, v19.4s\n" + "sqadd v7.4s, v7.4s, v3.4s\n" + "srshl v6.4s, v6.4s, v19.4s\n" + "sqadd v5.4s, v5.4s, v25.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v8.4s, v8.4s, v29.4s\n" + "sqxtn v17.4h, v17.4s\n" + "srshl v7.4s, v7.4s, v29.4s\n" + "sqxtn v10.4h, v10.4s\n" + "srshl v5.4s, v5.4s, v29.4s\n" + "sqxtn v6.4h, v6.4s\n" 
+ "sqxtn2 v15.8h, v16.4s\n" + "sqxtn2 v17.8h, v8.4s\n" + "sqxtn2 v10.8h, v7.4s\n" + "sqxtn2 v6.8h, v5.4s\n" + "sqadd v15.8h, v15.8h, v18.8h\n" + "sqadd v17.8h, v17.8h, v18.8h\n" + "sqadd v10.8h, v10.8h, v18.8h\n" + "sqadd v6.8h, v6.8h, v18.8h\n" + "smax v15.8h, v15.8h, v11.8h\n" + "smax v17.8h, v17.8h, v11.8h\n" + "smax v10.8h, v10.8h, v11.8h\n" + "smax v6.8h, v6.8h, v11.8h\n" + "smin v15.8h, v15.8h, v13.8h\n" + "smin v17.8h, v17.8h, v13.8h\n" + "smin v10.8h, v10.8h, v13.8h\n" + "smin v6.8h, v6.8h, v13.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" - "str d15, [x17, x1]\n" - "smax v5.4s, v5.4s, v16.4s\n" - "sqadd v8.4s, v8.4s, v27.4s\n" - "srshl v11.4s, v11.4s, v21.4s\n" - "and v30.16b, v10.16b, v21.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "uzp1 v18.16b, v18.16b, v5.16b\n" - "add v11.4s, v11.4s, v19.4s\n" - "srshl v8.4s, v8.4s, v14.4s\n" - "uzp1 v18.16b, v18.16b, v18.16b\n" - "str d18, [x16, x1]\n" - "smin v11.4s, v11.4s, v12.4s\n" - "sqrdmulh v9.4s, v9.4s, v17.4s\n" - "add v8.4s, v8.4s, v19.4s\n" - "sqadd v10.4s, v10.4s, v30.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" - "smin v8.4s, v8.4s, v12.4s\n" - "and v6.16b, v9.16b, v14.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "smax v8.4s, v8.4s, v16.4s\n" - "srshl v10.4s, v10.4s, v21.4s\n" - "uzp1 v11.16b, v11.16b, v8.16b\n" - "add v10.4s, v10.4s, v19.4s\n" - "uzp1 v11.16b, v11.16b, v11.16b\n" - "str d11, [x6, x1]\n" - "smin v10.4s, v10.4s, v12.4s\n" - "sqadd v9.4s, v9.4s, v6.4s\n" - "smax v10.4s, v10.4s, v16.4s\n" - "srshl v9.4s, v9.4s, v14.4s\n" - "add v9.4s, v9.4s, v19.4s\n" - "smin v9.4s, v9.4s, v12.4s\n" - "smax v9.4s, v9.4s, v16.4s\n" - "uzp1 v10.16b, v10.16b, v9.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d15, [x16, x22]\n" "uzp1 v10.16b, v10.16b, v10.16b\n" - "str d10, [x8, x1]\n" - "add x1, x1, #0x8\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str d17, [x8, x22]\n" + "str d10, [x4, x22]\n" + "str d6, [x7, x22]\n" + "add x22, x22, #0x8\n" "beq 124f\n" - "add x3, x3, #0xc8\n" + "add x23, x23, #0xc8\n" "3:" // Oddments - "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" - "tbz x4, #2, 5f\n" - "ld1 { v15.4s }, [x12], #0x10\n" - "tbz x4, #1, 4f\n" - "ld1 { v20.d }[0], [x12], #0x8\n" - "tbz x4, #0, 7f\n" - "ld1 { v20.s }[2], [x12]\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x0, #2, 5f\n" + "ld1 { v15.4s }, [x19], #0x10\n" + "tbz x0, #1, 4f\n" + "ld1 { v16.d }[0], [x19], #0x8\n" + "tbz x0, #0, 7f\n" + "ld1 { v16.s }[2], [x19]\n" "b 7f\n" "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset - "tbz x4, #0, 7f\n" - "ld1 { v20.s }[0], [x12]\n" + "tbz x0, #0, 7f\n" + "ld1 { v16.s }[0], [x19]\n" "b 7f\n" "5:" // Oddments: Load bias: Bit 2: Unset - "tbz x4, #1, 6f\n" - "ld1 { v15.d }[0], [x12], #0x8\n" - "tbz x4, #0, 7f\n" - "ld1 { v15.s }[2], [x12]\n" + "tbz x0, #1, 6f\n" + "ld1 { v15.d }[0], [x19], #0x8\n" + "tbz x0, #0, 7f\n" + "ld1 { v15.s }[2], [x19]\n" "b 7f\n" "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 7f\n" - "ld1 { v15.s }[0], [x12]\n" + "tbz x0, #0, 7f\n" + "ld1 { v15.s }[0], [x19]\n" "7:" // Oddments: Load bias: Bit 2: End - "mov v18.16b, v15.16b\n" - "ldr d0, [x3, #0x0]\n" - "mov v5.16b, v20.16b\n" - "ldr d1, [x3, #0x8]\n" - "mov v11.16b, v15.16b\n" - "ldr d2, [x3, #0x10]\n" - "mov v8.16b, v20.16b\n" - "ldr d3, [x3, #0x18]\n" + "ldr d0, [x23, #0x0]\n" + "ldr d1, [x23, #0x8]\n" + "mov v17.16b, v15.16b\n" + "mov v8.16b, v16.16b\n" + "ldr d2, [x23, #0x10]\n" + "ldr d3, [x23, #0x18]\n" "mov v10.16b, v15.16b\n" - "ldr d4, [x3, #0x20]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" - "mov v9.16b, v20.16b\n" - "ldp x28, x27, 
[x25, #0x0]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" - "ldp x26, x13, [x25, #0x10]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" - "ssubl v3.8h, v3.8b, v13.8b\n" - "ldp x24, x23, [x25, #0x20]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" - "ldp x22, x21, [x25, #0x30]\n" - "ldp x20, x0, [x25, #0x40]\n" - "add x28, x28, x10\n" - "add x27, x27, x10\n" - "add x26, x26, x10\n" - "add x13, x13, x10\n" - "add x24, x24, x10\n" - "add x23, x23, x10\n" - "add x22, x22, x10\n" - "add x21, x21, x10\n" - "add x20, x20, x10\n" - "add x0, x0, x10\n" - "tbz x4, #2, 9f\n" + "mov v7.16b, v16.16b\n" + "ldr d4, [x23, #0x20]\n" + "ldp x28, x6, [x20, #0x0]\n" + "mov v6.16b, v15.16b\n" + "mov v5.16b, v16.16b\n" + "ldp x26, x25, [x20, #0x10]\n" + "ldp x5, x2, [x20, #0x20]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "ldp x27, x21, [x20, #0x30]\n" + "ldp x12, x19, [x20, #0x40]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "add x28, x28, x24\n" + "add x6, x6, x24\n" + "add x26, x26, x24\n" + "add x25, x25, x24\n" + "add x5, x5, x24\n" + "add x2, x2, x24\n" + "add x27, x27, x24\n" + "add x21, x21, x24\n" + "add x12, x12, x24\n" + "add x19, x19, x24\n" + "tbz x0, #2, 9f\n" "ld1 { v31.s }[0], [x28], #0x4\n" - "ld1 { v30.s }[0], [x27], #0x4\n" + "ld1 { v30.s }[0], [x6], #0x4\n" "ld1 { v29.s }[0], [x26], #0x4\n" - "ld1 { v28.s }[0], [x13], #0x4\n" - "ld1 { v27.s }[0], [x24], #0x4\n" - "ld1 { v23.s }[0], [x23], #0x4\n" - "ld1 { v25.s }[0], [x22], #0x4\n" + "ld1 { v28.s }[0], [x25], #0x4\n" + "ld1 { v27.s }[0], [x5], #0x4\n" + "ld1 { v23.s }[0], [x2], #0x4\n" + "ld1 { v25.s }[0], [x27], #0x4\n" "ld1 { v24.s }[0], [x21], #0x4\n" - "ld1 { v26.s }[0], [x20], #0x4\n" - "ld1 { v22.s }[0], [x0], #0x4\n" - "tbz x4, #1, 8f\n" + "ld1 { v26.s }[0], [x12], #0x4\n" + "ld1 { v22.s }[0], [x19], #0x4\n" + "tbz x0, #1, 8f\n" "ld1 { v31.h }[2], [x28], #0x2\n" - "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v30.h }[2], [x6], #0x2\n" "ld1 { v29.h }[2], [x26], #0x2\n" - "ld1 { v28.h }[2], [x13], #0x2\n" - "ld1 { v27.h }[2], [x24], #0x2\n" - "ld1 { v23.h }[2], [x23], #0x2\n" - "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v27.h }[2], [x5], #0x2\n" + "ld1 { v23.h }[2], [x2], #0x2\n" + "ld1 { v25.h }[2], [x27], #0x2\n" "ld1 { v24.h }[2], [x21], #0x2\n" - "ld1 { v26.h }[2], [x20], #0x2\n" - "ld1 { v22.h }[2], [x0], #0x2\n" - "tbz x4, #0, 11f\n" + "ld1 { v26.h }[2], [x12], #0x2\n" + "ld1 { v22.h }[2], [x19], #0x2\n" + "tbz x0, #0, 11f\n" "ld1 { v31.b }[6], [x28]\n" - "ld1 { v30.b }[6], [x27]\n" + "ld1 { v30.b }[6], [x6]\n" "ld1 { v29.b }[6], [x26]\n" - "ld1 { v28.b }[6], [x13]\n" - "ld1 { v27.b }[6], [x24]\n" - "ld1 { v23.b }[6], [x23]\n" - "ld1 { v25.b }[6], [x22]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v27.b }[6], [x5]\n" + "ld1 { v23.b }[6], [x2]\n" + "ld1 { v25.b }[6], [x27]\n" "ld1 { v24.b }[6], [x21]\n" - "ld1 { v26.b }[6], [x20]\n" - "ld1 { v22.b }[6], [x0]\n" + "ld1 { v26.b }[6], [x12]\n" + "ld1 { v22.b }[6], [x19]\n" "b 11f\n" "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset - "tbz x4, #0, 11f\n" + "tbz x0, #0, 11f\n" "ld1 { v31.b }[4], [x28]\n" - "ld1 { v30.b }[4], [x27]\n" + "ld1 { v30.b }[4], [x6]\n" "ld1 { v29.b }[4], [x26]\n" - "ld1 { v28.b }[4], [x13]\n" - "ld1 { v27.b }[4], [x24]\n" - "ld1 { v23.b }[4], [x23]\n" - "ld1 { v25.b }[4], [x22]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v27.b }[4], [x5]\n" + "ld1 { v23.b }[4], [x2]\n" + "ld1 { v25.b }[4], [x27]\n" "ld1 { v24.b }[4], [x21]\n" - "ld1 { v26.b }[4], [x20]\n" - "ld1 { v22.b }[4], 
[x0]\n" + "ld1 { v26.b }[4], [x12]\n" + "ld1 { v22.b }[4], [x19]\n" "b 11f\n" "9:" // Oddments: Initial loads: Bit 2: Unset - "tbz x4, #1, 10f\n" + "tbz x0, #1, 10f\n" "ld1 { v31.h }[0], [x28], #0x2\n" - "ld1 { v30.h }[0], [x27], #0x2\n" + "ld1 { v30.h }[0], [x6], #0x2\n" "ld1 { v29.h }[0], [x26], #0x2\n" - "ld1 { v28.h }[0], [x13], #0x2\n" - "ld1 { v27.h }[0], [x24], #0x2\n" - "ld1 { v23.h }[0], [x23], #0x2\n" - "ld1 { v25.h }[0], [x22], #0x2\n" + "ld1 { v28.h }[0], [x25], #0x2\n" + "ld1 { v27.h }[0], [x5], #0x2\n" + "ld1 { v23.h }[0], [x2], #0x2\n" + "ld1 { v25.h }[0], [x27], #0x2\n" "ld1 { v24.h }[0], [x21], #0x2\n" - "ld1 { v26.h }[0], [x20], #0x2\n" - "ld1 { v22.h }[0], [x0], #0x2\n" - "tbz x4, #0, 11f\n" + "ld1 { v26.h }[0], [x12], #0x2\n" + "ld1 { v22.h }[0], [x19], #0x2\n" + "tbz x0, #0, 11f\n" "ld1 { v31.b }[2], [x28]\n" - "ld1 { v30.b }[2], [x27]\n" + "ld1 { v30.b }[2], [x6]\n" "ld1 { v29.b }[2], [x26]\n" - "ld1 { v28.b }[2], [x13]\n" - "ld1 { v27.b }[2], [x24]\n" - "ld1 { v23.b }[2], [x23]\n" - "ld1 { v25.b }[2], [x22]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v27.b }[2], [x5]\n" + "ld1 { v23.b }[2], [x2]\n" + "ld1 { v25.b }[2], [x27]\n" "ld1 { v24.b }[2], [x21]\n" - "ld1 { v26.b }[2], [x20]\n" - "ld1 { v22.b }[2], [x0]\n" + "ld1 { v26.b }[2], [x12]\n" + "ld1 { v22.b }[2], [x19]\n" "b 11f\n" "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 11f\n" + "tbz x0, #0, 11f\n" "ld1 { v31.b }[0], [x28]\n" - "ld1 { v30.b }[0], [x27]\n" + "ld1 { v30.b }[0], [x6]\n" "ld1 { v29.b }[0], [x26]\n" - "ld1 { v28.b }[0], [x13]\n" - "ld1 { v27.b }[0], [x24]\n" - "ld1 { v23.b }[0], [x23]\n" - "ld1 { v25.b }[0], [x22]\n" + "ld1 { v28.b }[0], [x25]\n" + "ld1 { v27.b }[0], [x5]\n" + "ld1 { v23.b }[0], [x2]\n" + "ld1 { v25.b }[0], [x27]\n" "ld1 { v24.b }[0], [x21]\n" - "ld1 { v26.b }[0], [x20]\n" - "ld1 { v22.b }[0], [x0]\n" + "ld1 { v26.b }[0], [x12]\n" + "ld1 { v22.b }[0], [x19]\n" "11:" // Oddments: Initial loads: Bit 2: End - "ldr x20, [x25, #0x50]\n" - "ssubl v31.8h, v31.8b, v7.8b\n" + "ssubl v31.8h, v31.8b, v9.8b\n" + "ssubl v30.8h, v30.8b, v9.8b\n" "smlal v15.4s, v31.4h, v0.4h\n" - "ssubl v30.8h, v30.8b, v7.8b\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ssubl v29.8h, v29.8b, v7.8b\n" - "ssubl v28.8h, v28.8b, v7.8b\n" - "smlal v18.4s, v30.4h, v0.4h\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "ssubl v23.8h, v23.8b, v7.8b\n" - "smlal v11.4s, v29.4h, v0.4h\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal2 v8.4s, v29.8h, v0.8h\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v28.4h, v0.4h\n" - "ssubl v26.8h, v26.8b, v7.8b\n" - "smlal2 v9.4s, v28.8h, v0.8h\n" - "ssubl v22.8h, v22.8b, v7.8b\n" + "ldr x19, [x20, #0x50]\n" + "ssubl v29.8h, v29.8b, v9.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "smlal v10.4s, v29.4h, v0.4h\n" + "ssubl v28.8h, v28.8b, v9.8b\n" + "add x19, x19, x24\n" + "smlal2 v7.4s, v29.8h, v0.8h\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "smlal v6.4s, v28.4h, v0.4h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "smlal2 v20.4s, v30.8h, v1.8h\n" - "add x20, x20, x10\n" - "smlal v18.4s, v27.4h, v1.4h\n" - "smlal2 v5.4s, v27.8h, v1.8h\n" - "smlal v11.4s, v28.4h, v1.4h\n" - "smlal2 v8.4s, v28.8h, v1.8h\n" - "smlal v10.4s, v23.4h, v1.4h\n" - "smlal2 v9.4s, v23.8h, v1.8h\n" + "ssubl v23.8h, v23.8b, v9.8b\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "smlal v17.4s, v27.4h, v1.4h\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "smlal2 v8.4s, v27.8h, v1.8h\n" + "smlal 
v10.4s, v28.4h, v1.4h\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "smlal2 v7.4s, v28.8h, v1.8h\n" + "ssubl v26.8h, v26.8b, v9.8b\n" + "smlal v6.4s, v23.4h, v1.4h\n" + "ssubl v22.8h, v22.8b, v9.8b\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" "smlal v15.4s, v27.4h, v2.4h\n" - "smlal2 v20.4s, v27.8h, v2.8h\n" - "smlal v18.4s, v25.4h, v2.4h\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "smlal v11.4s, v23.4h, v2.4h\n" - "smlal2 v8.4s, v23.8h, v2.8h\n" - "tbz x4, #2, 13f\n" - "ld1 { v31.s }[0], [x20], #0x4\n" - "tbz x4, #1, 12f\n" - "ld1 { v31.h }[2], [x20], #0x2\n" - "tbz x4, #0, 15f\n" - "ld1 { v31.b }[6], [x20]\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "smlal v10.4s, v23.4h, v2.4h\n" + "smlal2 v7.4s, v23.8h, v2.8h\n" + "tbz x0, #2, 13f\n" + "ld1 { v31.s }[0], [x19], #0x4\n" + "tbz x0, #1, 12f\n" + "ld1 { v31.h }[2], [x19], #0x2\n" + "tbz x0, #0, 15f\n" + "ld1 { v31.b }[6], [x19]\n" "b 15f\n" "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 15f\n" - "ld1 { v31.b }[4], [x20]\n" + "tbz x0, #0, 15f\n" + "ld1 { v31.b }[4], [x19]\n" "b 15f\n" "13:" // Oddments: Load (1, 3): Bit 2: Unset - "tbz x4, #1, 14f\n" - "ld1 { v31.h }[0], [x20], #0x2\n" - "tbz x4, #0, 15f\n" - "ld1 { v31.b }[2], [x20]\n" + "tbz x0, #1, 14f\n" + "ld1 { v31.h }[0], [x19], #0x2\n" + "tbz x0, #0, 15f\n" + "ld1 { v31.b }[2], [x19]\n" "b 15f\n" "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 15f\n" - "ld1 { v31.b }[0], [x20]\n" + "tbz x0, #0, 15f\n" + "ld1 { v31.b }[0], [x19]\n" "15:" // Oddments: Load (1, 3): Bit 2: End + "ssubl v31.8h, v31.8b, v9.8b\n" + "ldr x15, [x20, #0x58]\n" + "smlal v6.4s, v31.4h, v2.4h\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" - "ldr x28, [x25, #0x58]\n" - "ssubl v31.8h, v31.8b, v7.8b\n" - "smlal2 v20.4s, v25.8h, v3.8h\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "add x28, x28, x10\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "smlal v10.4s, v31.4h, v2.4h\n" - "smlal2 v9.4s, v31.8h, v2.8h\n" - "smlal v11.4s, v31.4h, v3.4h\n" - "smlal2 v8.4s, v31.8h, v3.8h\n" - "tbz x4, #2, 17f\n" - "ld1 { v30.s }[0], [x28], #0x4\n" - "tbz x4, #1, 16f\n" - "ld1 { v30.h }[2], [x28], #0x2\n" - "tbz x4, #0, 19f\n" - "ld1 { v30.b }[6], [x28]\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "add x15, x15, x24\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "smlal v10.4s, v31.4h, v3.4h\n" + "smlal2 v7.4s, v31.8h, v3.8h\n" + "tbz x0, #2, 17f\n" + "ld1 { v30.s }[0], [x15], #0x4\n" + "tbz x0, #1, 16f\n" + "ld1 { v30.h }[2], [x15], #0x2\n" + "tbz x0, #0, 19f\n" + "ld1 { v30.b }[6], [x15]\n" "b 19f\n" "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 19f\n" - "ld1 { v30.b }[4], [x28]\n" + "tbz x0, #0, 19f\n" + "ld1 { v30.b }[4], [x15]\n" "b 19f\n" "17:" // Oddments: Load (1, 4): Bit 2: Unset - "tbz x4, #1, 18f\n" - "ld1 { v30.h }[0], [x28], #0x2\n" - "tbz x4, #0, 19f\n" - "ld1 { v30.b }[2], [x28]\n" + "tbz x0, #1, 18f\n" + "ld1 { v30.h }[0], [x15], #0x2\n" + "tbz x0, #0, 19f\n" + "ld1 { v30.b }[2], [x15]\n" "b 19f\n" "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 19f\n" - "ld1 { v30.b }[0], [x28]\n" + "tbz x0, #0, 19f\n" + "ld1 { v30.b }[0], [x15]\n" "19:" // Oddments: Load (1, 4): Bit 2: End + "ssubl v30.8h, v30.8b, v9.8b\n" + "ldr x19, [x20, #0x60]\n" + "smlal v6.4s, v30.4h, v3.4h\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "ldr x0, [x25, #0x60]\n" - "ssubl v30.8h, v30.8b, v7.8b\n" - "smlal2 v20.4s, v24.8h, v4.8h\n" - "add x0, x0, x10\n" - 
"smlal v10.4s, v30.4h, v3.4h\n" - "smlal2 v9.4s, v30.8h, v3.8h\n" - "tbz x4, #2, 21f\n" - "ld1 { v27.s }[0], [x0], #0x4\n" - "tbz x4, #1, 20f\n" - "ld1 { v27.h }[2], [x0], #0x2\n" - "tbz x4, #0, 23f\n" - "ld1 { v27.b }[6], [x0]\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "add x19, x19, x24\n" + "tbz x0, #2, 21f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x0, #1, 20f\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x0, #0, 23f\n" + "ld1 { v27.b }[6], [x19]\n" "b 23f\n" "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 23f\n" - "ld1 { v27.b }[4], [x0]\n" + "tbz x0, #0, 23f\n" + "ld1 { v27.b }[4], [x19]\n" "b 23f\n" "21:" // Oddments: Load (0, 5): Bit 2: Unset - "tbz x4, #1, 22f\n" - "ld1 { v27.h }[0], [x0], #0x2\n" - "tbz x4, #0, 23f\n" - "ld1 { v27.b }[2], [x0]\n" + "tbz x0, #1, 22f\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x0, #0, 23f\n" + "ld1 { v27.b }[2], [x19]\n" "b 23f\n" "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 23f\n" - "ld1 { v27.b }[0], [x0]\n" + "tbz x0, #0, 23f\n" + "ld1 { v27.b }[0], [x19]\n" "23:" // Oddments: Load (0, 5): Bit 2: End - "smlal v11.4s, v30.4h, v4.4h\n" - "ldr d0, [x3, #0x28]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal2 v8.4s, v30.8h, v4.8h\n" - "ldr x7, [x25, #0x68]\n" - "smlal v10.4s, v26.4h, v4.4h\n" - "ssubl v0.8h, v0.8b, v13.8b\n" - "smlal2 v9.4s, v26.8h, v4.8h\n" - "add x7, x7, x10\n" - "smlal v18.4s, v27.4h, v4.4h\n" - "smlal2 v5.4s, v27.8h, v4.8h\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "ldr d0, [x23, #0x28]\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal2 v8.4s, v27.8h, v4.8h\n" + "smlal v10.4s, v30.4h, v4.4h\n" + "smlal2 v7.4s, v30.8h, v4.8h\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "ldr x27, [x20, #0x68]\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "add x27, x27, x24\n" "smlal v15.4s, v29.4h, v0.4h\n" - "smlal2 v20.4s, v29.8h, v0.8h\n" - "smlal v18.4s, v28.4h, v0.4h\n" - "smlal2 v5.4s, v28.8h, v0.8h\n" - "smlal v11.4s, v22.4h, v0.4h\n" - "smlal2 v8.4s, v22.8h, v0.8h\n" - "tbz x4, #2, 25f\n" - "ld1 { v25.s }[0], [x7], #0x4\n" - "tbz x4, #1, 24f\n" - "ld1 { v25.h }[2], [x7], #0x2\n" - "tbz x4, #0, 27f\n" - "ld1 { v25.b }[6], [x7]\n" + "smlal2 v16.4s, v29.8h, v0.8h\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal2 v8.4s, v28.8h, v0.8h\n" + "smlal v10.4s, v22.4h, v0.4h\n" + "smlal2 v7.4s, v22.8h, v0.8h\n" + "tbz x0, #2, 25f\n" + "ld1 { v25.s }[0], [x27], #0x4\n" + "tbz x0, #1, 24f\n" + "ld1 { v25.h }[2], [x27], #0x2\n" + "tbz x0, #0, 27f\n" + "ld1 { v25.b }[6], [x27]\n" "b 27f\n" "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 27f\n" - "ld1 { v25.b }[4], [x7]\n" + "tbz x0, #0, 27f\n" + "ld1 { v25.b }[4], [x27]\n" "b 27f\n" "25:" // Oddments: Load (2, 1): Bit 2: Unset - "tbz x4, #1, 26f\n" - "ld1 { v25.h }[0], [x7], #0x2\n" - "tbz x4, #0, 27f\n" - "ld1 { v25.b }[2], [x7]\n" + "tbz x0, #1, 26f\n" + "ld1 { v25.h }[0], [x27], #0x2\n" + "tbz x0, #0, 27f\n" + "ld1 { v25.b }[2], [x27]\n" "b 27f\n" "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 27f\n" - "ld1 { v25.b }[0], [x7]\n" + "tbz x0, #0, 27f\n" + "ld1 { v25.b }[0], [x27]\n" "27:" // Oddments: Load (2, 1): Bit 2: End - "ldr d1, [x3, #0x30]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal v10.4s, v25.4h, v0.4h\n" - "ldr x26, [x25, #0x70]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" - "smlal2 v9.4s, v25.8h, v0.8h\n" - "add x26, x26, x10\n" + "ldr d1, [x23, #0x30]\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "ldr x5, [x20, #0x70]\n" + "smlal v6.4s, v25.4h, v0.4h\n" + "smlal2 v5.4s, 
v25.8h, v0.8h\n" + "add x5, x5, x24\n" "smlal v15.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "smlal v18.4s, v23.4h, v1.4h\n" - "smlal2 v5.4s, v23.8h, v1.8h\n" - "smlal v11.4s, v25.4h, v1.4h\n" - "smlal2 v8.4s, v25.8h, v1.8h\n" - "tbz x4, #2, 29f\n" - "ld1 { v24.s }[0], [x26], #0x4\n" - "tbz x4, #1, 28f\n" - "ld1 { v24.h }[2], [x26], #0x2\n" - "tbz x4, #0, 31f\n" - "ld1 { v24.b }[6], [x26]\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "smlal v17.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "smlal v10.4s, v25.4h, v1.4h\n" + "smlal2 v7.4s, v25.8h, v1.8h\n" + "tbz x0, #2, 29f\n" + "ld1 { v24.s }[0], [x5], #0x4\n" + "tbz x0, #1, 28f\n" + "ld1 { v24.h }[2], [x5], #0x2\n" + "tbz x0, #0, 31f\n" + "ld1 { v24.b }[6], [x5]\n" "b 31f\n" "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 31f\n" - "ld1 { v24.b }[4], [x26]\n" + "tbz x0, #0, 31f\n" + "ld1 { v24.b }[4], [x5]\n" "b 31f\n" "29:" // Oddments: Load (2, 2): Bit 2: Unset - "tbz x4, #1, 30f\n" - "ld1 { v24.h }[0], [x26], #0x2\n" - "tbz x4, #0, 31f\n" - "ld1 { v24.b }[2], [x26]\n" + "tbz x0, #1, 30f\n" + "ld1 { v24.h }[0], [x5], #0x2\n" + "tbz x0, #0, 31f\n" + "ld1 { v24.b }[2], [x5]\n" "b 31f\n" "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 31f\n" - "ld1 { v24.b }[0], [x26]\n" + "tbz x0, #0, 31f\n" + "ld1 { v24.b }[0], [x5]\n" "31:" // Oddments: Load (2, 2): Bit 2: End - "ldr d2, [x3, #0x38]\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v24.4h, v1.4h\n" - "ldr x23, [x25, #0x78]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" - "smlal2 v9.4s, v24.8h, v1.8h\n" - "add x23, x23, x10\n" + "ldr d2, [x23, #0x38]\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "ldr x11, [x20, #0x78]\n" + "smlal v6.4s, v24.4h, v1.4h\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "add x11, x11, x24\n" "smlal v15.4s, v23.4h, v2.4h\n" - "smlal2 v20.4s, v23.8h, v2.8h\n" - "smlal v18.4s, v31.4h, v2.4h\n" - "smlal2 v5.4s, v31.8h, v2.8h\n" - "smlal v11.4s, v24.4h, v2.4h\n" - "smlal2 v8.4s, v24.8h, v2.8h\n" - "tbz x4, #2, 33f\n" - "ld1 { v27.s }[0], [x23], #0x4\n" - "tbz x4, #1, 32f\n" - "ld1 { v27.h }[2], [x23], #0x2\n" - "tbz x4, #0, 35f\n" - "ld1 { v27.b }[6], [x23]\n" + "smlal2 v16.4s, v23.8h, v2.8h\n" + "smlal v17.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "smlal v10.4s, v24.4h, v2.4h\n" + "smlal2 v7.4s, v24.8h, v2.8h\n" + "tbz x0, #2, 33f\n" + "ld1 { v27.s }[0], [x11], #0x4\n" + "tbz x0, #1, 32f\n" + "ld1 { v27.h }[2], [x11], #0x2\n" + "tbz x0, #0, 35f\n" + "ld1 { v27.b }[6], [x11]\n" "b 35f\n" "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 35f\n" - "ld1 { v27.b }[4], [x23]\n" + "tbz x0, #0, 35f\n" + "ld1 { v27.b }[4], [x11]\n" "b 35f\n" "33:" // Oddments: Load (2, 3): Bit 2: Unset - "tbz x4, #1, 34f\n" - "ld1 { v27.h }[0], [x23], #0x2\n" - "tbz x4, #0, 35f\n" - "ld1 { v27.b }[2], [x23]\n" + "tbz x0, #1, 34f\n" + "ld1 { v27.h }[0], [x11], #0x2\n" + "tbz x0, #0, 35f\n" + "ld1 { v27.b }[2], [x11]\n" "b 35f\n" "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 35f\n" - "ld1 { v27.b }[0], [x23]\n" + "tbz x0, #0, 35f\n" + "ld1 { v27.b }[0], [x11]\n" "35:" // Oddments: Load (2, 3): Bit 2: End - "ldr d3, [x3, #0x40]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal v10.4s, v27.4h, v2.4h\n" - "ldr x20, [x25, #0x80]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" - "smlal2 v9.4s, v27.8h, v2.8h\n" - "add x20, x20, x10\n" + "ldr d3, [x23, #0x40]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "ldr x12, [x20, #0x80]\n" + "smlal v6.4s, v27.4h, 
v2.4h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "add x12, x12, x24\n" "smlal v15.4s, v31.4h, v3.4h\n" - "smlal2 v20.4s, v31.8h, v3.8h\n" - "smlal v18.4s, v30.4h, v3.4h\n" - "smlal2 v5.4s, v30.8h, v3.8h\n" - "smlal v11.4s, v27.4h, v3.4h\n" - "smlal2 v8.4s, v27.8h, v3.8h\n" - "tbz x4, #2, 37f\n" - "ld1 { v23.s }[0], [x20], #0x4\n" - "tbz x4, #1, 36f\n" - "ld1 { v23.h }[2], [x20], #0x2\n" - "tbz x4, #0, 39f\n" - "ld1 { v23.b }[6], [x20]\n" + "smlal2 v16.4s, v31.8h, v3.8h\n" + "smlal v17.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "smlal v10.4s, v27.4h, v3.4h\n" + "smlal2 v7.4s, v27.8h, v3.8h\n" + "tbz x0, #2, 37f\n" + "ld1 { v23.s }[0], [x12], #0x4\n" + "tbz x0, #1, 36f\n" + "ld1 { v23.h }[2], [x12], #0x2\n" + "tbz x0, #0, 39f\n" + "ld1 { v23.b }[6], [x12]\n" "b 39f\n" "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 39f\n" - "ld1 { v23.b }[4], [x20]\n" + "tbz x0, #0, 39f\n" + "ld1 { v23.b }[4], [x12]\n" "b 39f\n" "37:" // Oddments: Load (2, 4): Bit 2: Unset - "tbz x4, #1, 38f\n" - "ld1 { v23.h }[0], [x20], #0x2\n" - "tbz x4, #0, 39f\n" - "ld1 { v23.b }[2], [x20]\n" + "tbz x0, #1, 38f\n" + "ld1 { v23.h }[0], [x12], #0x2\n" + "tbz x0, #0, 39f\n" + "ld1 { v23.b }[2], [x12]\n" "b 39f\n" "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 39f\n" - "ld1 { v23.b }[0], [x20]\n" + "tbz x0, #0, 39f\n" + "ld1 { v23.b }[0], [x12]\n" "39:" // Oddments: Load (2, 4): Bit 2: End - "ldr d4, [x3, #0x48]\n" - "ssubl v23.8h, v23.8b, v7.8b\n" - "smlal v10.4s, v23.4h, v3.4h\n" - "ldr x22, [x25, #0x88]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" - "smlal2 v9.4s, v23.8h, v3.8h\n" - "add x22, x22, x10\n" + "ldr d4, [x23, #0x48]\n" + "ssubl v23.8h, v23.8b, v9.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "ldr x26, [x20, #0x88]\n" + "smlal v6.4s, v23.4h, v3.4h\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "add x26, x26, x24\n" "smlal v15.4s, v30.4h, v4.4h\n" - "smlal2 v20.4s, v30.8h, v4.8h\n" - "smlal v18.4s, v26.4h, v4.4h\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "smlal v11.4s, v23.4h, v4.4h\n" - "smlal2 v8.4s, v23.8h, v4.8h\n" - "tbz x4, #2, 41f\n" - "ld1 { v28.s }[0], [x22], #0x4\n" - "tbz x4, #1, 40f\n" - "ld1 { v28.h }[2], [x22], #0x2\n" - "tbz x4, #0, 43f\n" - "ld1 { v28.b }[6], [x22]\n" + "smlal2 v16.4s, v30.8h, v4.8h\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "smlal v10.4s, v23.4h, v4.4h\n" + "smlal2 v7.4s, v23.8h, v4.8h\n" + "tbz x0, #2, 41f\n" + "ld1 { v28.s }[0], [x26], #0x4\n" + "tbz x0, #1, 40f\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "tbz x0, #0, 43f\n" + "ld1 { v28.b }[6], [x26]\n" "b 43f\n" "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 43f\n" - "ld1 { v28.b }[4], [x22]\n" + "tbz x0, #0, 43f\n" + "ld1 { v28.b }[4], [x26]\n" "b 43f\n" "41:" // Oddments: Load (2, 5): Bit 2: Unset - "tbz x4, #1, 42f\n" - "ld1 { v28.h }[0], [x22], #0x2\n" - "tbz x4, #0, 43f\n" - "ld1 { v28.b }[2], [x22]\n" + "tbz x0, #1, 42f\n" + "ld1 { v28.h }[0], [x26], #0x2\n" + "tbz x0, #0, 43f\n" + "ld1 { v28.b }[2], [x26]\n" "b 43f\n" "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 43f\n" - "ld1 { v28.b }[0], [x22]\n" + "tbz x0, #0, 43f\n" + "ld1 { v28.b }[0], [x26]\n" "43:" // Oddments: Load (2, 5): Bit 2: End - "ldr d0, [x3, #0x50]\n" - "ssubl v28.8h, v28.8b, v7.8b\n" - "smlal v10.4s, v28.4h, v4.4h\n" - "ldr x13, [x25, #0x90]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" - "smlal2 v9.4s, v28.8h, v4.8h\n" - "add x13, x13, x10\n" + "ldr d0, [x23, #0x50]\n" + "ssubl v28.8h, v28.8b, v9.8b\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "ldr x14, [x20, 
#0x90]\n" + "smlal v6.4s, v28.4h, v4.4h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "add x14, x14, x24\n" "smlal v15.4s, v22.4h, v0.4h\n" - "smlal2 v20.4s, v22.8h, v0.8h\n" - "smlal v18.4s, v25.4h, v0.4h\n" - "smlal2 v5.4s, v25.8h, v0.8h\n" - "tbz x4, #2, 45f\n" - "ld1 { v31.s }[0], [x13], #0x4\n" - "tbz x4, #1, 44f\n" - "ld1 { v31.h }[2], [x13], #0x2\n" - "tbz x4, #0, 47f\n" - "ld1 { v31.b }[6], [x13]\n" + "smlal2 v16.4s, v22.8h, v0.8h\n" + "smlal v17.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "tbz x0, #2, 45f\n" + "ld1 { v31.s }[0], [x14], #0x4\n" + "tbz x0, #1, 44f\n" + "ld1 { v31.h }[2], [x14], #0x2\n" + "tbz x0, #0, 47f\n" + "ld1 { v31.b }[6], [x14]\n" "b 47f\n" "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 47f\n" - "ld1 { v31.b }[4], [x13]\n" + "tbz x0, #0, 47f\n" + "ld1 { v31.b }[4], [x14]\n" "b 47f\n" "45:" // Oddments: Load (3, 0): Bit 2: Unset - "tbz x4, #1, 46f\n" - "ld1 { v31.h }[0], [x13], #0x2\n" - "tbz x4, #0, 47f\n" - "ld1 { v31.b }[2], [x13]\n" + "tbz x0, #1, 46f\n" + "ld1 { v31.h }[0], [x14], #0x2\n" + "tbz x0, #0, 47f\n" + "ld1 { v31.b }[2], [x14]\n" "b 47f\n" "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 47f\n" - "ld1 { v31.b }[0], [x13]\n" + "tbz x0, #0, 47f\n" + "ld1 { v31.b }[0], [x14]\n" "47:" // Oddments: Load (3, 0): Bit 2: End - "ldr x21, [x25, #0x98]\n" - "ssubl v31.8h, v31.8b, v7.8b\n" - "smlal v11.4s, v31.4h, v0.4h\n" - "smlal2 v8.4s, v31.8h, v0.8h\n" - "add x21, x21, x10\n" - "tbz x4, #2, 49f\n" - "ld1 { v30.s }[0], [x21], #0x4\n" - "tbz x4, #1, 48f\n" - "ld1 { v30.h }[2], [x21], #0x2\n" - "tbz x4, #0, 51f\n" - "ld1 { v30.b }[6], [x21]\n" + "ssubl v31.8h, v31.8b, v9.8b\n" + "ldr x15, [x20, #0x98]\n" + "smlal v10.4s, v31.4h, v0.4h\n" + "smlal2 v7.4s, v31.8h, v0.8h\n" + "add x15, x15, x24\n" + "tbz x0, #2, 49f\n" + "ld1 { v30.s }[0], [x15], #0x4\n" + "tbz x0, #1, 48f\n" + "ld1 { v30.h }[2], [x15], #0x2\n" + "tbz x0, #0, 51f\n" + "ld1 { v30.b }[6], [x15]\n" "b 51f\n" "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 51f\n" - "ld1 { v30.b }[4], [x21]\n" + "tbz x0, #0, 51f\n" + "ld1 { v30.b }[4], [x15]\n" "b 51f\n" "49:" // Oddments: Load (3, 1): Bit 2: Unset - "tbz x4, #1, 50f\n" - "ld1 { v30.h }[0], [x21], #0x2\n" - "tbz x4, #0, 51f\n" - "ld1 { v30.b }[2], [x21]\n" + "tbz x0, #1, 50f\n" + "ld1 { v30.h }[0], [x15], #0x2\n" + "tbz x0, #0, 51f\n" + "ld1 { v30.b }[2], [x15]\n" "b 51f\n" "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 51f\n" - "ld1 { v30.b }[0], [x21]\n" + "tbz x0, #0, 51f\n" + "ld1 { v30.b }[0], [x15]\n" "51:" // Oddments: Load (3, 1): Bit 2: End - "ldr d1, [x3, #0x58]\n" - "ssubl v30.8h, v30.8b, v7.8b\n" - "smlal v10.4s, v30.4h, v0.4h\n" - "ldr x14, [x25, #0xa0]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" - "smlal2 v9.4s, v30.8h, v0.8h\n" - "add x14, x14, x10\n" + "ldr d1, [x23, #0x58]\n" + "ssubl v30.8h, v30.8b, v9.8b\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "ldr x21, [x20, #0xa0]\n" + "smlal v6.4s, v30.4h, v0.4h\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "add x21, x21, x24\n" "smlal v15.4s, v25.4h, v1.4h\n" - "smlal2 v20.4s, v25.8h, v1.8h\n" - "smlal v18.4s, v24.4h, v1.4h\n" - "smlal2 v5.4s, v24.8h, v1.8h\n" - "smlal v11.4s, v30.4h, v1.4h\n" - "smlal2 v8.4s, v30.8h, v1.8h\n" - "tbz x4, #2, 53f\n" - "ld1 { v26.s }[0], [x14], #0x4\n" - "tbz x4, #1, 52f\n" - "ld1 { v26.h }[2], [x14], #0x2\n" - "tbz x4, #0, 55f\n" - "ld1 { v26.b }[6], [x14]\n" + "smlal2 v16.4s, v25.8h, v1.8h\n" + "smlal v17.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "smlal v10.4s, 
v30.4h, v1.4h\n" + "smlal2 v7.4s, v30.8h, v1.8h\n" + "tbz x0, #2, 53f\n" + "ld1 { v26.s }[0], [x21], #0x4\n" + "tbz x0, #1, 52f\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "tbz x0, #0, 55f\n" + "ld1 { v26.b }[6], [x21]\n" "b 55f\n" "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 55f\n" - "ld1 { v26.b }[4], [x14]\n" + "tbz x0, #0, 55f\n" + "ld1 { v26.b }[4], [x21]\n" "b 55f\n" "53:" // Oddments: Load (3, 2): Bit 2: Unset - "tbz x4, #1, 54f\n" - "ld1 { v26.h }[0], [x14], #0x2\n" - "tbz x4, #0, 55f\n" - "ld1 { v26.b }[2], [x14]\n" + "tbz x0, #1, 54f\n" + "ld1 { v26.h }[0], [x21], #0x2\n" + "tbz x0, #0, 55f\n" + "ld1 { v26.b }[2], [x21]\n" "b 55f\n" "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 55f\n" - "ld1 { v26.b }[0], [x14]\n" + "tbz x0, #0, 55f\n" + "ld1 { v26.b }[0], [x21]\n" "55:" // Oddments: Load (3, 2): Bit 2: End - "ldr d2, [x3, #0x60]\n" - "ssubl v26.8h, v26.8b, v7.8b\n" - "smlal v10.4s, v26.4h, v1.4h\n" - "ldr x11, [x25, #0xa8]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" - "smlal2 v9.4s, v26.8h, v1.8h\n" - "add x11, x11, x10\n" + "ldr d2, [x23, #0x60]\n" + "ssubl v26.8h, v26.8b, v9.8b\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "ldr x2, [x20, #0xa8]\n" + "smlal v6.4s, v26.4h, v1.4h\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "add x2, x2, x24\n" "smlal v15.4s, v24.4h, v2.4h\n" - "smlal2 v20.4s, v24.8h, v2.8h\n" - "smlal v18.4s, v27.4h, v2.4h\n" - "smlal2 v5.4s, v27.8h, v2.8h\n" - "smlal v11.4s, v26.4h, v2.4h\n" - "smlal2 v8.4s, v26.8h, v2.8h\n" - "tbz x4, #2, 57f\n" - "ld1 { v25.s }[0], [x11], #0x4\n" - "tbz x4, #1, 56f\n" - "ld1 { v25.h }[2], [x11], #0x2\n" - "tbz x4, #0, 59f\n" - "ld1 { v25.b }[6], [x11]\n" + "smlal2 v16.4s, v24.8h, v2.8h\n" + "smlal v17.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "smlal v10.4s, v26.4h, v2.4h\n" + "smlal2 v7.4s, v26.8h, v2.8h\n" + "tbz x0, #2, 57f\n" + "ld1 { v25.s }[0], [x2], #0x4\n" + "tbz x0, #1, 56f\n" + "ld1 { v25.h }[2], [x2], #0x2\n" + "tbz x0, #0, 59f\n" + "ld1 { v25.b }[6], [x2]\n" "b 59f\n" "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 59f\n" - "ld1 { v25.b }[4], [x11]\n" + "tbz x0, #0, 59f\n" + "ld1 { v25.b }[4], [x2]\n" "b 59f\n" "57:" // Oddments: Load (3, 3): Bit 2: Unset - "tbz x4, #1, 58f\n" - "ld1 { v25.h }[0], [x11], #0x2\n" - "tbz x4, #0, 59f\n" - "ld1 { v25.b }[2], [x11]\n" + "tbz x0, #1, 58f\n" + "ld1 { v25.h }[0], [x2], #0x2\n" + "tbz x0, #0, 59f\n" + "ld1 { v25.b }[2], [x2]\n" "b 59f\n" "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 59f\n" - "ld1 { v25.b }[0], [x11]\n" + "tbz x0, #0, 59f\n" + "ld1 { v25.b }[0], [x2]\n" "59:" // Oddments: Load (3, 3): Bit 2: End - "ldr d3, [x3, #0x68]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal v10.4s, v25.4h, v2.4h\n" - "ldr x24, [x25, #0xb0]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" - "smlal2 v9.4s, v25.8h, v2.8h\n" - "add x24, x24, x10\n" + "ldr d3, [x23, #0x68]\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "ldr x13, [x20, #0xb0]\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "add x13, x13, x24\n" "smlal v15.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" - "smlal v18.4s, v23.4h, v3.4h\n" - "smlal2 v5.4s, v23.8h, v3.8h\n" - "smlal v11.4s, v25.4h, v3.4h\n" - "smlal2 v8.4s, v25.8h, v3.8h\n" - "tbz x4, #2, 61f\n" - "ld1 { v24.s }[0], [x24], #0x4\n" - "tbz x4, #1, 60f\n" - "ld1 { v24.h }[2], [x24], #0x2\n" - "tbz x4, #0, 63f\n" - "ld1 { v24.b }[6], [x24]\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "smlal v17.4s, v23.4h, v3.4h\n" + "smlal2 v8.4s, v23.8h, 
v3.8h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" + "tbz x0, #2, 61f\n" + "ld1 { v24.s }[0], [x13], #0x4\n" + "tbz x0, #1, 60f\n" + "ld1 { v24.h }[2], [x13], #0x2\n" + "tbz x0, #0, 63f\n" + "ld1 { v24.b }[6], [x13]\n" "b 63f\n" "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 63f\n" - "ld1 { v24.b }[4], [x24]\n" + "tbz x0, #0, 63f\n" + "ld1 { v24.b }[4], [x13]\n" "b 63f\n" "61:" // Oddments: Load (3, 4): Bit 2: Unset - "tbz x4, #1, 62f\n" - "ld1 { v24.h }[0], [x24], #0x2\n" - "tbz x4, #0, 63f\n" - "ld1 { v24.b }[2], [x24]\n" + "tbz x0, #1, 62f\n" + "ld1 { v24.h }[0], [x13], #0x2\n" + "tbz x0, #0, 63f\n" + "ld1 { v24.b }[2], [x13]\n" "b 63f\n" "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 63f\n" - "ld1 { v24.b }[0], [x24]\n" + "tbz x0, #0, 63f\n" + "ld1 { v24.b }[0], [x13]\n" "63:" // Oddments: Load (3, 4): Bit 2: End - "ldr d4, [x3, #0x70]\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v24.4h, v3.4h\n" - "ldr x0, [x25, #0xb8]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" - "smlal2 v9.4s, v24.8h, v3.8h\n" - "add x0, x0, x10\n" + "ldr d4, [x23, #0x70]\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "ldr x9, [x20, #0xb8]\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "add x9, x9, x24\n" "smlal v15.4s, v23.4h, v4.4h\n" - "smlal2 v20.4s, v23.8h, v4.8h\n" - "smlal v18.4s, v28.4h, v4.4h\n" - "smlal2 v5.4s, v28.8h, v4.8h\n" - "smlal v11.4s, v24.4h, v4.4h\n" - "smlal2 v8.4s, v24.8h, v4.8h\n" - "tbz x4, #2, 65f\n" - "ld1 { v22.s }[0], [x0], #0x4\n" - "tbz x4, #1, 64f\n" - "ld1 { v22.h }[2], [x0], #0x2\n" - "tbz x4, #0, 67f\n" - "ld1 { v22.b }[6], [x0]\n" + "smlal2 v16.4s, v23.8h, v4.8h\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" + "tbz x0, #2, 65f\n" + "ld1 { v22.s }[0], [x9], #0x4\n" + "tbz x0, #1, 64f\n" + "ld1 { v22.h }[2], [x9], #0x2\n" + "tbz x0, #0, 67f\n" + "ld1 { v22.b }[6], [x9]\n" "b 67f\n" "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 67f\n" - "ld1 { v22.b }[4], [x0]\n" + "tbz x0, #0, 67f\n" + "ld1 { v22.b }[4], [x9]\n" "b 67f\n" "65:" // Oddments: Load (3, 5): Bit 2: Unset - "tbz x4, #1, 66f\n" - "ld1 { v22.h }[0], [x0], #0x2\n" - "tbz x4, #0, 67f\n" - "ld1 { v22.b }[2], [x0]\n" + "tbz x0, #1, 66f\n" + "ld1 { v22.h }[0], [x9], #0x2\n" + "tbz x0, #0, 67f\n" + "ld1 { v22.b }[2], [x9]\n" "b 67f\n" "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 67f\n" - "ld1 { v22.b }[0], [x0]\n" + "tbz x0, #0, 67f\n" + "ld1 { v22.b }[0], [x9]\n" "67:" // Oddments: Load (3, 5): Bit 2: End - "ldr d0, [x3, #0x78]\n" - "ssubl v22.8h, v22.8b, v7.8b\n" - "smlal v10.4s, v22.4h, v4.4h\n" - "ldr x15, [x25, #0xc0]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" - "smlal2 v9.4s, v22.8h, v4.8h\n" - "add x15, x15, x10\n" + "ldr d0, [x23, #0x78]\n" + "ssubl v22.8h, v22.8b, v9.8b\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "ldr x19, [x20, #0xc0]\n" + "smlal v6.4s, v22.4h, v4.4h\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "add x19, x19, x24\n" "smlal v15.4s, v31.4h, v0.4h\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "smlal v18.4s, v30.4h, v0.4h\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "tbz x4, #2, 69f\n" - "ld1 { v27.s }[0], [x15], #0x4\n" - "tbz x4, #1, 68f\n" - "ld1 { v27.h }[2], [x15], #0x2\n" - "tbz x4, #0, 71f\n" - "ld1 { v27.b }[6], [x15]\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "tbz x0, #2, 69f\n" + "ld1 { v27.s }[0], 
[x19], #0x4\n" + "tbz x0, #1, 68f\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x0, #0, 71f\n" + "ld1 { v27.b }[6], [x19]\n" "b 71f\n" "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 71f\n" - "ld1 { v27.b }[4], [x15]\n" + "tbz x0, #0, 71f\n" + "ld1 { v27.b }[4], [x19]\n" "b 71f\n" "69:" // Oddments: Load (4, 0): Bit 2: Unset - "tbz x4, #1, 70f\n" - "ld1 { v27.h }[0], [x15], #0x2\n" - "tbz x4, #0, 71f\n" - "ld1 { v27.b }[2], [x15]\n" + "tbz x0, #1, 70f\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x0, #0, 71f\n" + "ld1 { v27.b }[2], [x19]\n" "b 71f\n" "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 71f\n" - "ld1 { v27.b }[0], [x15]\n" + "tbz x0, #0, 71f\n" + "ld1 { v27.b }[0], [x19]\n" "71:" // Oddments: Load (4, 0): Bit 2: End - "ldr x9, [x25, #0xc8]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal v11.4s, v27.4h, v0.4h\n" - "smlal2 v8.4s, v27.8h, v0.8h\n" - "add x9, x9, x10\n" - "tbz x4, #2, 73f\n" - "ld1 { v23.s }[0], [x9], #0x4\n" - "tbz x4, #1, 72f\n" - "ld1 { v23.h }[2], [x9], #0x2\n" - "tbz x4, #0, 75f\n" - "ld1 { v23.b }[6], [x9]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "ldr x28, [x20, #0xc8]\n" + "smlal v10.4s, v27.4h, v0.4h\n" + "smlal2 v7.4s, v27.8h, v0.8h\n" + "add x28, x28, x24\n" + "tbz x0, #2, 73f\n" + "ld1 { v23.s }[0], [x28], #0x4\n" + "tbz x0, #1, 72f\n" + "ld1 { v23.h }[2], [x28], #0x2\n" + "tbz x0, #0, 75f\n" + "ld1 { v23.b }[6], [x28]\n" "b 75f\n" "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 75f\n" - "ld1 { v23.b }[4], [x9]\n" + "tbz x0, #0, 75f\n" + "ld1 { v23.b }[4], [x28]\n" "b 75f\n" "73:" // Oddments: Load (4, 1): Bit 2: Unset - "tbz x4, #1, 74f\n" - "ld1 { v23.h }[0], [x9], #0x2\n" - "tbz x4, #0, 75f\n" - "ld1 { v23.b }[2], [x9]\n" + "tbz x0, #1, 74f\n" + "ld1 { v23.h }[0], [x28], #0x2\n" + "tbz x0, #0, 75f\n" + "ld1 { v23.b }[2], [x28]\n" "b 75f\n" "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 75f\n" - "ld1 { v23.b }[0], [x9]\n" + "tbz x0, #0, 75f\n" + "ld1 { v23.b }[0], [x28]\n" "75:" // Oddments: Load (4, 1): Bit 2: End - "ldr d1, [x3, #0x80]\n" - "ssubl v23.8h, v23.8b, v7.8b\n" - "smlal v10.4s, v23.4h, v0.4h\n" - "ldr x27, [x25, #0xd0]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" - "smlal2 v9.4s, v23.8h, v0.8h\n" - "add x27, x27, x10\n" + "ldr d1, [x23, #0x80]\n" + "ssubl v23.8h, v23.8b, v9.8b\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "ldr x6, [x20, #0xd0]\n" + "smlal v6.4s, v23.4h, v0.4h\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "add x6, x6, x24\n" "smlal v15.4s, v30.4h, v1.4h\n" - "smlal2 v20.4s, v30.8h, v1.8h\n" - "smlal v18.4s, v26.4h, v1.4h\n" - "smlal2 v5.4s, v26.8h, v1.8h\n" - "smlal v11.4s, v23.4h, v1.4h\n" - "smlal2 v8.4s, v23.8h, v1.8h\n" - "tbz x4, #2, 77f\n" - "ld1 { v31.s }[0], [x27], #0x4\n" - "tbz x4, #1, 76f\n" - "ld1 { v31.h }[2], [x27], #0x2\n" - "tbz x4, #0, 79f\n" - "ld1 { v31.b }[6], [x27]\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "smlal v17.4s, v26.4h, v1.4h\n" + "smlal2 v8.4s, v26.8h, v1.8h\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "smlal2 v7.4s, v23.8h, v1.8h\n" + "tbz x0, #2, 77f\n" + "ld1 { v31.s }[0], [x6], #0x4\n" + "tbz x0, #1, 76f\n" + "ld1 { v31.h }[2], [x6], #0x2\n" + "tbz x0, #0, 79f\n" + "ld1 { v31.b }[6], [x6]\n" "b 79f\n" "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 79f\n" - "ld1 { v31.b }[4], [x27]\n" + "tbz x0, #0, 79f\n" + "ld1 { v31.b }[4], [x6]\n" "b 79f\n" "77:" // Oddments: Load (4, 2): Bit 2: Unset - "tbz x4, #1, 78f\n" - "ld1 { v31.h }[0], [x27], #0x2\n" - "tbz x4, #0, 79f\n" - "ld1 { v31.b }[2], [x27]\n" + "tbz x0, #1, 78f\n" 
+ "ld1 { v31.h }[0], [x6], #0x2\n" + "tbz x0, #0, 79f\n" + "ld1 { v31.b }[2], [x6]\n" "b 79f\n" "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 79f\n" - "ld1 { v31.b }[0], [x27]\n" + "tbz x0, #0, 79f\n" + "ld1 { v31.b }[0], [x6]\n" "79:" // Oddments: Load (4, 2): Bit 2: End - "ldr d2, [x3, #0x88]\n" - "ssubl v31.8h, v31.8b, v7.8b\n" - "smlal v10.4s, v31.4h, v1.4h\n" - "ldr x28, [x25, #0xd8]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" - "smlal2 v9.4s, v31.8h, v1.8h\n" - "add x28, x28, x10\n" + "ldr d2, [x23, #0x88]\n" + "ssubl v31.8h, v31.8b, v9.8b\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "ldr x27, [x20, #0xd8]\n" + "smlal v6.4s, v31.4h, v1.4h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "add x27, x27, x24\n" "smlal v15.4s, v26.4h, v2.4h\n" - "smlal2 v20.4s, v26.8h, v2.8h\n" - "smlal v18.4s, v25.4h, v2.4h\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "smlal v11.4s, v31.4h, v2.4h\n" - "smlal2 v8.4s, v31.8h, v2.8h\n" - "tbz x4, #2, 81f\n" - "ld1 { v30.s }[0], [x28], #0x4\n" - "tbz x4, #1, 80f\n" - "ld1 { v30.h }[2], [x28], #0x2\n" - "tbz x4, #0, 83f\n" - "ld1 { v30.b }[6], [x28]\n" + "smlal2 v16.4s, v26.8h, v2.8h\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "smlal2 v7.4s, v31.8h, v2.8h\n" + "tbz x0, #2, 81f\n" + "ld1 { v30.s }[0], [x27], #0x4\n" + "tbz x0, #1, 80f\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "tbz x0, #0, 83f\n" + "ld1 { v30.b }[6], [x27]\n" "b 83f\n" "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 83f\n" - "ld1 { v30.b }[4], [x28]\n" + "tbz x0, #0, 83f\n" + "ld1 { v30.b }[4], [x27]\n" "b 83f\n" "81:" // Oddments: Load (4, 3): Bit 2: Unset - "tbz x4, #1, 82f\n" - "ld1 { v30.h }[0], [x28], #0x2\n" - "tbz x4, #0, 83f\n" - "ld1 { v30.b }[2], [x28]\n" + "tbz x0, #1, 82f\n" + "ld1 { v30.h }[0], [x27], #0x2\n" + "tbz x0, #0, 83f\n" + "ld1 { v30.b }[2], [x27]\n" "b 83f\n" "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 83f\n" - "ld1 { v30.b }[0], [x28]\n" + "tbz x0, #0, 83f\n" + "ld1 { v30.b }[0], [x27]\n" "83:" // Oddments: Load (4, 3): Bit 2: End - "ldr d3, [x3, #0x90]\n" - "ssubl v30.8h, v30.8b, v7.8b\n" - "smlal v10.4s, v30.4h, v2.4h\n" - "ldr x12, [x25, #0xe0]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" - "smlal2 v9.4s, v30.8h, v2.8h\n" - "add x12, x12, x10\n" + "ldr d3, [x23, #0x90]\n" + "ssubl v30.8h, v30.8b, v9.8b\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "ldr x11, [x20, #0xe0]\n" + "smlal v6.4s, v30.4h, v2.4h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "add x11, x11, x24\n" "smlal v15.4s, v25.4h, v3.4h\n" - "smlal2 v20.4s, v25.8h, v3.8h\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "smlal v11.4s, v30.4h, v3.4h\n" - "smlal2 v8.4s, v30.8h, v3.8h\n" - "tbz x4, #2, 85f\n" - "ld1 { v28.s }[0], [x12], #0x4\n" - "tbz x4, #1, 84f\n" - "ld1 { v28.h }[2], [x12], #0x2\n" - "tbz x4, #0, 87f\n" - "ld1 { v28.b }[6], [x12]\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "smlal2 v7.4s, v30.8h, v3.8h\n" + "tbz x0, #2, 85f\n" + "ld1 { v28.s }[0], [x11], #0x4\n" + "tbz x0, #1, 84f\n" + "ld1 { v28.h }[2], [x11], #0x2\n" + "tbz x0, #0, 87f\n" + "ld1 { v28.b }[6], [x11]\n" "b 87f\n" "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 87f\n" - "ld1 { v28.b }[4], [x12]\n" + "tbz x0, #0, 87f\n" + "ld1 { v28.b }[4], [x11]\n" "b 87f\n" "85:" // Oddments: Load (4, 4): Bit 2: Unset - "tbz x4, #1, 86f\n" - "ld1 { v28.h }[0], [x12], #0x2\n" - "tbz x4, #0, 87f\n" - "ld1 { v28.b }[2], 
[x12]\n" + "tbz x0, #1, 86f\n" + "ld1 { v28.h }[0], [x11], #0x2\n" + "tbz x0, #0, 87f\n" + "ld1 { v28.b }[2], [x11]\n" "b 87f\n" "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 87f\n" - "ld1 { v28.b }[0], [x12]\n" + "tbz x0, #0, 87f\n" + "ld1 { v28.b }[0], [x11]\n" "87:" // Oddments: Load (4, 4): Bit 2: End - "ldr d4, [x3, #0x98]\n" - "ssubl v28.8h, v28.8b, v7.8b\n" - "smlal v10.4s, v28.4h, v3.4h\n" - "ldr x7, [x25, #0xe8]\n" - "ssubl v4.8h, v4.8b, v13.8b\n" - "smlal2 v9.4s, v28.8h, v3.8h\n" - "add x7, x7, x10\n" + "ldr d4, [x23, #0x98]\n" + "ssubl v28.8h, v28.8b, v9.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "ldr x17, [x20, #0xe8]\n" + "smlal v6.4s, v28.4h, v3.4h\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "add x17, x17, x24\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v20.4s, v24.8h, v4.8h\n" - "smlal v18.4s, v22.4h, v4.4h\n" - "smlal2 v5.4s, v22.8h, v4.8h\n" - "smlal v11.4s, v28.4h, v4.4h\n" - "smlal2 v8.4s, v28.8h, v4.8h\n" - "tbz x4, #2, 89f\n" - "ld1 { v26.s }[0], [x7], #0x4\n" - "tbz x4, #1, 88f\n" - "ld1 { v26.h }[2], [x7], #0x2\n" - "tbz x4, #0, 91f\n" - "ld1 { v26.b }[6], [x7]\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "smlal v17.4s, v22.4h, v4.4h\n" + "smlal2 v8.4s, v22.8h, v4.8h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "smlal2 v7.4s, v28.8h, v4.8h\n" + "tbz x0, #2, 89f\n" + "ld1 { v26.s }[0], [x17], #0x4\n" + "tbz x0, #1, 88f\n" + "ld1 { v26.h }[2], [x17], #0x2\n" + "tbz x0, #0, 91f\n" + "ld1 { v26.b }[6], [x17]\n" "b 91f\n" "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 91f\n" - "ld1 { v26.b }[4], [x7]\n" + "tbz x0, #0, 91f\n" + "ld1 { v26.b }[4], [x17]\n" "b 91f\n" "89:" // Oddments: Load (4, 5): Bit 2: Unset - "tbz x4, #1, 90f\n" - "ld1 { v26.h }[0], [x7], #0x2\n" - "tbz x4, #0, 91f\n" - "ld1 { v26.b }[2], [x7]\n" + "tbz x0, #1, 90f\n" + "ld1 { v26.h }[0], [x17], #0x2\n" + "tbz x0, #0, 91f\n" + "ld1 { v26.b }[2], [x17]\n" "b 91f\n" "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 91f\n" - "ld1 { v26.b }[0], [x7]\n" + "tbz x0, #0, 91f\n" + "ld1 { v26.b }[0], [x17]\n" "91:" // Oddments: Load (4, 5): Bit 2: End - "ldr d0, [x3, #0xa0]\n" - "ssubl v26.8h, v26.8b, v7.8b\n" - "smlal v10.4s, v26.4h, v4.4h\n" - "ldr x26, [x25, #0xf0]\n" - "ssubl v0.8h, v0.8b, v13.8b\n" - "smlal2 v9.4s, v26.8h, v4.8h\n" - "add x26, x26, x10\n" + "ldr d0, [x23, #0xa0]\n" + "ssubl v26.8h, v26.8b, v9.8b\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "ldr x5, [x20, #0xf0]\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "add x5, x5, x24\n" "smlal v15.4s, v27.4h, v0.4h\n" - "smlal2 v20.4s, v27.8h, v0.8h\n" - "smlal v18.4s, v23.4h, v0.4h\n" - "smlal2 v5.4s, v23.8h, v0.8h\n" - "tbz x4, #2, 93f\n" - "ld1 { v25.s }[0], [x26], #0x4\n" - "tbz x4, #1, 92f\n" - "ld1 { v25.h }[2], [x26], #0x2\n" - "tbz x4, #0, 95f\n" - "ld1 { v25.b }[6], [x26]\n" + "smlal2 v16.4s, v27.8h, v0.8h\n" + "smlal v17.4s, v23.4h, v0.4h\n" + "smlal2 v8.4s, v23.8h, v0.8h\n" + "tbz x0, #2, 93f\n" + "ld1 { v25.s }[0], [x5], #0x4\n" + "tbz x0, #1, 92f\n" + "ld1 { v25.h }[2], [x5], #0x2\n" + "tbz x0, #0, 95f\n" + "ld1 { v25.b }[6], [x5]\n" "b 95f\n" "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 95f\n" - "ld1 { v25.b }[4], [x26]\n" + "tbz x0, #0, 95f\n" + "ld1 { v25.b }[4], [x5]\n" "b 95f\n" "93:" // Oddments: Load (5, 0): Bit 2: Unset - "tbz x4, #1, 94f\n" - "ld1 { v25.h }[0], [x26], #0x2\n" - "tbz x4, #0, 95f\n" - "ld1 { v25.b }[2], [x26]\n" + "tbz x0, #1, 94f\n" + "ld1 { v25.h }[0], [x5], #0x2\n" + "tbz x0, #0, 95f\n" + "ld1 { v25.b }[2], [x5]\n" "b 
95f\n" "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 95f\n" - "ld1 { v25.b }[0], [x26]\n" + "tbz x0, #0, 95f\n" + "ld1 { v25.b }[0], [x5]\n" "95:" // Oddments: Load (5, 0): Bit 2: End - "ldr x23, [x25, #0xf8]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal v11.4s, v25.4h, v0.4h\n" - "smlal2 v8.4s, v25.8h, v0.8h\n" - "add x23, x23, x10\n" - "tbz x4, #2, 97f\n" - "ld1 { v24.s }[0], [x23], #0x4\n" - "tbz x4, #1, 96f\n" - "ld1 { v24.h }[2], [x23], #0x2\n" - "tbz x4, #0, 99f\n" - "ld1 { v24.b }[6], [x23]\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "ldr x25, [x20, #0xf8]\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "smlal2 v7.4s, v25.8h, v0.8h\n" + "add x25, x25, x24\n" + "tbz x0, #2, 97f\n" + "ld1 { v24.s }[0], [x25], #0x4\n" + "tbz x0, #1, 96f\n" + "ld1 { v24.h }[2], [x25], #0x2\n" + "tbz x0, #0, 99f\n" + "ld1 { v24.b }[6], [x25]\n" "b 99f\n" "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 99f\n" - "ld1 { v24.b }[4], [x23]\n" + "tbz x0, #0, 99f\n" + "ld1 { v24.b }[4], [x25]\n" "b 99f\n" "97:" // Oddments: Load (5, 1): Bit 2: Unset - "tbz x4, #1, 98f\n" - "ld1 { v24.h }[0], [x23], #0x2\n" - "tbz x4, #0, 99f\n" - "ld1 { v24.b }[2], [x23]\n" + "tbz x0, #1, 98f\n" + "ld1 { v24.h }[0], [x25], #0x2\n" + "tbz x0, #0, 99f\n" + "ld1 { v24.b }[2], [x25]\n" "b 99f\n" "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 99f\n" - "ld1 { v24.b }[0], [x23]\n" + "tbz x0, #0, 99f\n" + "ld1 { v24.b }[0], [x25]\n" "99:" // Oddments: Load (5, 1): Bit 2: End - "ldr d1, [x3, #0xa8]\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v24.4h, v0.4h\n" - "ldr x22, [x25, #0x100]\n" - "ssubl v1.8h, v1.8b, v13.8b\n" - "smlal2 v9.4s, v24.8h, v0.8h\n" - "add x22, x22, x10\n" + "ldr d1, [x23, #0xa8]\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "ldr x26, [x20, #0x100]\n" + "smlal v6.4s, v24.4h, v0.4h\n" + "smlal2 v5.4s, v24.8h, v0.8h\n" + "add x26, x26, x24\n" "smlal v15.4s, v23.4h, v1.4h\n" - "smlal2 v20.4s, v23.8h, v1.8h\n" - "smlal v18.4s, v31.4h, v1.4h\n" - "smlal2 v5.4s, v31.8h, v1.8h\n" - "smlal v11.4s, v24.4h, v1.4h\n" - "smlal2 v8.4s, v24.8h, v1.8h\n" - "tbz x4, #2, 101f\n" - "ld1 { v27.s }[0], [x22], #0x4\n" - "tbz x4, #1, 100f\n" - "ld1 { v27.h }[2], [x22], #0x2\n" - "tbz x4, #0, 103f\n" - "ld1 { v27.b }[6], [x22]\n" + "smlal2 v16.4s, v23.8h, v1.8h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal2 v8.4s, v31.8h, v1.8h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal2 v7.4s, v24.8h, v1.8h\n" + "tbz x0, #2, 101f\n" + "ld1 { v27.s }[0], [x26], #0x4\n" + "tbz x0, #1, 100f\n" + "ld1 { v27.h }[2], [x26], #0x2\n" + "tbz x0, #0, 103f\n" + "ld1 { v27.b }[6], [x26]\n" "b 103f\n" "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 103f\n" - "ld1 { v27.b }[4], [x22]\n" + "tbz x0, #0, 103f\n" + "ld1 { v27.b }[4], [x26]\n" "b 103f\n" "101:" // Oddments: Load (5, 2): Bit 2: Unset - "tbz x4, #1, 102f\n" - "ld1 { v27.h }[0], [x22], #0x2\n" - "tbz x4, #0, 103f\n" - "ld1 { v27.b }[2], [x22]\n" + "tbz x0, #1, 102f\n" + "ld1 { v27.h }[0], [x26], #0x2\n" + "tbz x0, #0, 103f\n" + "ld1 { v27.b }[2], [x26]\n" "b 103f\n" "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 103f\n" - "ld1 { v27.b }[0], [x22]\n" + "tbz x0, #0, 103f\n" + "ld1 { v27.b }[0], [x26]\n" "103:" // Oddments: Load (5, 2): Bit 2: End - "ldr d2, [x3, #0xb0]\n" - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal v10.4s, v27.4h, v1.4h\n" - "ldr x20, [x25, #0x108]\n" - "ssubl v2.8h, v2.8b, v13.8b\n" - "smlal2 v9.4s, v27.8h, v1.8h\n" - "add x20, x20, x10\n" + "ldr d2, 
[x23, #0xb0]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "ldr x12, [x20, #0x108]\n" + "smlal v6.4s, v27.4h, v1.4h\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" + "add x12, x12, x24\n" "smlal v15.4s, v31.4h, v2.4h\n" - "smlal2 v20.4s, v31.8h, v2.8h\n" - "smlal v18.4s, v30.4h, v2.4h\n" - "smlal2 v5.4s, v30.8h, v2.8h\n" - "smlal v11.4s, v27.4h, v2.4h\n" - "smlal2 v8.4s, v27.8h, v2.8h\n" - "tbz x4, #2, 105f\n" - "ld1 { v25.s }[0], [x20], #0x4\n" - "tbz x4, #1, 104f\n" - "ld1 { v25.h }[2], [x20], #0x2\n" - "tbz x4, #0, 107f\n" - "ld1 { v25.b }[6], [x20]\n" + "smlal2 v16.4s, v31.8h, v2.8h\n" + "smlal v17.4s, v30.4h, v2.4h\n" + "smlal2 v8.4s, v30.8h, v2.8h\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "smlal2 v7.4s, v27.8h, v2.8h\n" + "tbz x0, #2, 105f\n" + "ld1 { v25.s }[0], [x12], #0x4\n" + "tbz x0, #1, 104f\n" + "ld1 { v25.h }[2], [x12], #0x2\n" + "tbz x0, #0, 107f\n" + "ld1 { v25.b }[6], [x12]\n" "b 107f\n" "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 107f\n" - "ld1 { v25.b }[4], [x20]\n" + "tbz x0, #0, 107f\n" + "ld1 { v25.b }[4], [x12]\n" "b 107f\n" "105:" // Oddments: Load (5, 3): Bit 2: Unset - "tbz x4, #1, 106f\n" - "ld1 { v25.h }[0], [x20], #0x2\n" - "tbz x4, #0, 107f\n" - "ld1 { v25.b }[2], [x20]\n" + "tbz x0, #1, 106f\n" + "ld1 { v25.h }[0], [x12], #0x2\n" + "tbz x0, #0, 107f\n" + "ld1 { v25.b }[2], [x12]\n" "b 107f\n" "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 107f\n" - "ld1 { v25.b }[0], [x20]\n" + "tbz x0, #0, 107f\n" + "ld1 { v25.b }[0], [x12]\n" "107:" // Oddments: Load (5, 3): Bit 2: End - "ldr d3, [x3, #0xb8]\n" - "ssubl v25.8h, v25.8b, v7.8b\n" - "smlal v10.4s, v25.4h, v2.4h\n" - "ldr x13, [x25, #0x110]\n" - "ssubl v3.8h, v3.8b, v13.8b\n" - "smlal2 v9.4s, v25.8h, v2.8h\n" - "add x13, x13, x10\n" + "ldr d3, [x23, #0xb8]\n" + "ssubl v25.8h, v25.8b, v9.8b\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "ldr x14, [x20, #0x110]\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "add x14, x14, x24\n" "smlal v15.4s, v30.4h, v3.4h\n" - "smlal2 v20.4s, v30.8h, v3.8h\n" - "smlal v18.4s, v28.4h, v3.4h\n" - "smlal2 v5.4s, v28.8h, v3.8h\n" - "smlal v11.4s, v25.4h, v3.4h\n" - "smlal2 v8.4s, v25.8h, v3.8h\n" - "tbz x4, #2, 109f\n" - "ld1 { v24.s }[0], [x13], #0x4\n" - "tbz x4, #1, 108f\n" - "ld1 { v24.h }[2], [x13], #0x2\n" - "tbz x4, #0, 111f\n" - "ld1 { v24.b }[6], [x13]\n" + "smlal2 v16.4s, v30.8h, v3.8h\n" + "smlal v17.4s, v28.4h, v3.4h\n" + "smlal2 v8.4s, v28.8h, v3.8h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" + "tbz x0, #2, 109f\n" + "ld1 { v24.s }[0], [x14], #0x4\n" + "tbz x0, #1, 108f\n" + "ld1 { v24.h }[2], [x14], #0x2\n" + "tbz x0, #0, 111f\n" + "ld1 { v24.b }[6], [x14]\n" "b 111f\n" "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 111f\n" - "ld1 { v24.b }[4], [x13]\n" + "tbz x0, #0, 111f\n" + "ld1 { v24.b }[4], [x14]\n" "b 111f\n" "109:" // Oddments: Load (5, 4): Bit 2: Unset - "tbz x4, #1, 110f\n" - "ld1 { v24.h }[0], [x13], #0x2\n" - "tbz x4, #0, 111f\n" - "ld1 { v24.b }[2], [x13]\n" + "tbz x0, #1, 110f\n" + "ld1 { v24.h }[0], [x14], #0x2\n" + "tbz x0, #0, 111f\n" + "ld1 { v24.b }[2], [x14]\n" "b 111f\n" "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 111f\n" - "ld1 { v24.b }[0], [x13]\n" + "tbz x0, #0, 111f\n" + "ld1 { v24.b }[0], [x14]\n" "111:" // Oddments: Load (5, 4): Bit 2: End - "ldr d4, [x3, #0xc0]\n" - "ssubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v24.4h, v3.4h\n" - "ldr x21, [x25, #0x118]\n" - "ssubl v4.8h, v4.8b, 
v13.8b\n" - "smlal2 v9.4s, v24.8h, v3.8h\n" - "add x21, x21, x10\n" + "ldr d4, [x23, #0xc0]\n" + "ssubl v24.8h, v24.8b, v9.8b\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "ldr x21, [x20, #0x118]\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "add x21, x21, x24\n" "smlal v15.4s, v28.4h, v4.4h\n" - "smlal2 v20.4s, v28.8h, v4.8h\n" - "smlal v18.4s, v26.4h, v4.4h\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "smlal v11.4s, v24.4h, v4.4h\n" - "smlal2 v8.4s, v24.8h, v4.8h\n" - "tbz x4, #2, 113f\n" + "smlal2 v16.4s, v28.8h, v4.8h\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" + "tbz x0, #2, 113f\n" "ld1 { v27.s }[0], [x21], #0x4\n" - "tbz x4, #1, 112f\n" + "tbz x0, #1, 112f\n" "ld1 { v27.h }[2], [x21], #0x2\n" - "tbz x4, #0, 115f\n" + "tbz x0, #0, 115f\n" "ld1 { v27.b }[6], [x21]\n" "b 115f\n" "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 115f\n" + "tbz x0, #0, 115f\n" "ld1 { v27.b }[4], [x21]\n" "b 115f\n" "113:" // Oddments: Load (5, 5): Bit 2: Unset - "tbz x4, #1, 114f\n" + "tbz x0, #1, 114f\n" "ld1 { v27.h }[0], [x21], #0x2\n" - "tbz x4, #0, 115f\n" + "tbz x0, #0, 115f\n" "ld1 { v27.b }[2], [x21]\n" "b 115f\n" "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 115f\n" + "tbz x0, #0, 115f\n" "ld1 { v27.b }[0], [x21]\n" "115:" // Oddments: Load (5, 5): Bit 2: End - "ssubl v27.8h, v27.8b, v7.8b\n" - "smlal v10.4s, v27.4h, v4.4h\n" - "smlal2 v9.4s, v27.8h, v4.8h\n" - "tbz x4, #2, 117f\n" - "ld1 { v6.4s }, [x2], #0x10\n" - "ld1 { v21.4s }, [x5], #0x10\n" - "tbz x4, #1, 116f\n" - "ld1 { v17.d }[0], [x2], #0x8\n" - "ld1 { v14.d }[0], [x5], #0x8\n" - "tbz x4, #0, 119f\n" - "ld1 { v17.s }[2], [x2]\n" - "ld1 { v14.s }[2], [x5]\n" + "ssubl v27.8h, v27.8b, v9.8b\n" + "smlal v6.4s, v27.4h, v4.4h\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "tbz x0, #2, 117f\n" + "ld1 { v12.4s }, [x10], #0x10\n" + "ld1 { v19.4s }, [x1], #0x10\n" + "tbz x0, #1, 116f\n" + "ld1 { v20.d }[0], [x10], #0x8\n" + "ld1 { v29.d }[0], [x1], #0x8\n" + "tbz x0, #0, 119f\n" + "ld1 { v20.s }[2], [x10]\n" + "ld1 { v29.s }[2], [x1]\n" "b 119f\n" "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset - "tbz x4, #0, 119f\n" - "ld1 { v17.s }[0], [x2]\n" - "ld1 { v14.s }[0], [x5]\n" + "tbz x0, #0, 119f\n" + "ld1 { v20.s }[0], [x10]\n" + "ld1 { v29.s }[0], [x1]\n" "b 119f\n" "117:" // Oddments: Load requant params: Bit 2: Unset - "tbz x4, #1, 118f\n" - "ld1 { v6.d }[0], [x2], #0x8\n" - "ld1 { v21.d }[0], [x5], #0x8\n" - "tbz x4, #0, 119f\n" - "ld1 { v6.s }[2], [x2]\n" - "ld1 { v21.s }[2], [x5]\n" + "tbz x0, #1, 118f\n" + "ld1 { v12.d }[0], [x10], #0x8\n" + "ld1 { v19.d }[0], [x1], #0x8\n" + "tbz x0, #0, 119f\n" + "ld1 { v12.s }[2], [x10]\n" + "ld1 { v19.s }[2], [x1]\n" "b 119f\n" "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 119f\n" - "ld1 { v6.s }[0], [x2]\n" - "ld1 { v21.s }[0], [x5]\n" + "tbz x0, #0, 119f\n" + "ld1 { v12.s }[0], [x10]\n" + "ld1 { v19.s }[0], [x1]\n" "119:" // Oddments: Load requant params: Bit 2: End - "sqrdmulh v15.4s, v15.4s, v6.4s\n" - "add x17, x17, x1\n" - "sqrdmulh v20.4s, v20.4s, v17.4s\n" - "add x16, x16, x1\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "add x6, x6, x1\n" - "sqrdmulh v5.4s, v5.4s, v17.4s\n" - "add x8, x8, x1\n" - "sqrdmulh v11.4s, v11.4s, v6.4s\n" - "and v1.16b, v15.16b, v21.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v29.16b, v20.16b, v14.16b\n" - "and v3.16b, v18.16b, v21.16b\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "and 
v2.16b, v5.16b, v14.16b\n" - "and v0.16b, v11.16b, v21.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqrdmulh v8.4s, v8.4s, v17.4s\n" + "sqrdmulh v15.4s, v15.4s, v12.4s\n" + "sqrdmulh v17.4s, v17.4s, v12.4s\n" + "add x16, x16, x22\n" + "add x8, x8, x22\n" + "sqrdmulh v10.4s, v10.4s, v12.4s\n" + "sqrdmulh v6.4s, v6.4s, v12.4s\n" + "add x4, x4, x22\n" + "add x7, x7, x22\n" + "and v23.16b, v15.16b, v19.16b\n" + "sqrdmulh v16.4s, v16.4s, v20.4s\n" + "and v22.16b, v17.16b, v19.16b\n" + "sqrdmulh v8.4s, v8.4s, v20.4s\n" + "and v21.16b, v10.16b, v19.16b\n" + "sqrdmulh v7.4s, v7.4s, v20.4s\n" + "and v26.16b, v6.16b, v19.16b\n" + "sqrdmulh v5.4s, v5.4s, v20.4s\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "and v4.16b, v16.16b, v29.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v2.16b, v8.16b, v29.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v3.16b, v7.16b, v29.16b\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "and v25.16b, v5.16b, v29.16b\n" + "sqadd v15.4s, v15.4s, v23.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v22.4s\n" "sshr v2.4s, v2.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v1.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "sqrdmulh v9.4s, v9.4s, v17.4s\n" - "sqadd v20.4s, v20.4s, v29.4s\n" - "sqadd v18.4s, v18.4s, v3.4s\n" - "srshl v15.4s, v15.4s, v21.4s\n" - "sqadd v5.4s, v5.4s, v2.4s\n" - "srshl v20.4s, v20.4s, v14.4s\n" - "srshl v18.4s, v18.4s, v21.4s\n" - "add v15.4s, v15.4s, v19.4s\n" - "srshl v5.4s, v5.4s, v14.4s\n" - "add v20.4s, v20.4s, v19.4s\n" - "smin v15.4s, v15.4s, v12.4s\n" - "add v18.4s, v18.4s, v19.4s\n" - "smin v20.4s, v20.4s, v12.4s\n" - "smax v15.4s, v15.4s, v16.4s\n" - "smin v18.4s, v18.4s, v12.4s\n" - "smax v20.4s, v20.4s, v16.4s\n" - "add v5.4s, v5.4s, v19.4s\n" - "smax v18.4s, v18.4s, v16.4s\n" - "uzp1 v15.16b, v15.16b, v20.16b\n" - "smin v5.4s, v5.4s, v12.4s\n" + "sqadd v10.4s, v10.4s, v21.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqadd v6.4s, v6.4s, v26.4s\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v19.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v17.4s, v17.4s, v19.4s\n" + "sqadd v8.4s, v8.4s, v2.4s\n" + "srshl v10.4s, v10.4s, v19.4s\n" + "sqadd v7.4s, v7.4s, v3.4s\n" + "srshl v6.4s, v6.4s, v19.4s\n" + "sqadd v5.4s, v5.4s, v25.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v8.4s, v8.4s, v29.4s\n" + "sqxtn v17.4h, v17.4s\n" + "srshl v7.4s, v7.4s, v29.4s\n" + "sqxtn v10.4h, v10.4s\n" + "srshl v5.4s, v5.4s, v29.4s\n" + "sqxtn v6.4h, v6.4s\n" + "sqxtn2 v15.8h, v16.4s\n" + "sqxtn2 v17.8h, v8.4s\n" + "sqxtn2 v10.8h, v7.4s\n" + "sqxtn2 v6.8h, v5.4s\n" + "sqadd v15.8h, v15.8h, v18.8h\n" + "sqadd v17.8h, v17.8h, v18.8h\n" + "sqadd v10.8h, v10.8h, v18.8h\n" + "sqadd v6.8h, v6.8h, v18.8h\n" + "smax v15.8h, v15.8h, v11.8h\n" + "smax v17.8h, v17.8h, v11.8h\n" + "smax v10.8h, v10.8h, v11.8h\n" + "smax v6.8h, v6.8h, v11.8h\n" + "smin v15.8h, v15.8h, v13.8h\n" + "smin v17.8h, v17.8h, v13.8h\n" + "smin v10.8h, v10.8h, v13.8h\n" + "smin v6.8h, v6.8h, v13.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" - "sqadd v11.4s, v11.4s, v0.4s\n" - "smax v5.4s, v5.4s, v16.4s\n" - "and v27.16b, v8.16b, v14.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "uzp1 v18.16b, v18.16b, v5.16b\n" - "srshl v11.4s, v11.4s, v21.4s\n" - "and v30.16b, v10.16b, v21.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "uzp1 v18.16b, v18.16b, v18.16b\n" - "add v11.4s, v11.4s, v19.4s\n" - "sqadd v8.4s, v8.4s, v27.4s\n" - "and v6.16b, v9.16b, v14.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "smin v11.4s, v11.4s, v12.4s\n" - "srshl v8.4s, v8.4s, v14.4s\n" - "sqadd v10.4s, v10.4s, 
v30.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" - "add v8.4s, v8.4s, v19.4s\n" - "srshl v10.4s, v10.4s, v21.4s\n" - "sqadd v9.4s, v9.4s, v6.4s\n" - "smin v8.4s, v8.4s, v12.4s\n" - "add v10.4s, v10.4s, v19.4s\n" - "srshl v9.4s, v9.4s, v14.4s\n" - "smax v8.4s, v8.4s, v16.4s\n" - "smin v10.4s, v10.4s, v12.4s\n" - "uzp1 v11.16b, v11.16b, v8.16b\n" - "add v9.4s, v9.4s, v19.4s\n" - "uzp1 v11.16b, v11.16b, v11.16b\n" - "smax v10.4s, v10.4s, v16.4s\n" - "smin v9.4s, v9.4s, v12.4s\n" - "smax v9.4s, v9.4s, v16.4s\n" - "uzp1 v10.16b, v10.16b, v9.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" "uzp1 v10.16b, v10.16b, v10.16b\n" - "tbz x4, #2, 121f\n" - "st1 { v15.s }[0], [x17], #0x4\n" - "st1 { v18.s }[0], [x16], #0x4\n" - "st1 { v11.s }[0], [x6], #0x4\n" - "st1 { v10.s }[0], [x8], #0x4\n" - "tbz x4, #1, 120f\n" - "st1 { v15.h }[2], [x17], #0x2\n" - "st1 { v18.h }[2], [x16], #0x2\n" - "st1 { v11.h }[2], [x6], #0x2\n" - "st1 { v10.h }[2], [x8], #0x2\n" - "tbz x4, #0, 123f\n" - "st1 { v15.b }[6], [x17], #0x1\n" - "st1 { v18.b }[6], [x16], #0x1\n" - "st1 { v11.b }[6], [x6], #0x1\n" - "st1 { v10.b }[6], [x8], #0x1\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "tbz x0, #2, 121f\n" + "st1 { v15.s }[0], [x16], #0x4\n" + "st1 { v17.s }[0], [x8], #0x4\n" + "st1 { v10.s }[0], [x4], #0x4\n" + "st1 { v6.s }[0], [x7], #0x4\n" + "tbz x0, #1, 120f\n" + "st1 { v15.h }[2], [x16], #0x2\n" + "st1 { v17.h }[2], [x8], #0x2\n" + "st1 { v10.h }[2], [x4], #0x2\n" + "st1 { v6.h }[2], [x7], #0x2\n" + "tbz x0, #0, 123f\n" + "st1 { v15.b }[6], [x16], #0x1\n" + "st1 { v17.b }[6], [x8], #0x1\n" + "st1 { v10.b }[6], [x4], #0x1\n" + "st1 { v6.b }[6], [x7], #0x1\n" "b 123f\n" "120:" // Oddments: Bit 2: Bit 1: Unset - "tbz x4, #0, 123f\n" - "st1 { v15.b }[4], [x17], #0x1\n" - "st1 { v18.b }[4], [x16], #0x1\n" - "st1 { v11.b }[4], [x6], #0x1\n" - "st1 { v10.b }[4], [x8], #0x1\n" + "tbz x0, #0, 123f\n" + "st1 { v15.b }[4], [x16], #0x1\n" + "st1 { v17.b }[4], [x8], #0x1\n" + "st1 { v10.b }[4], [x4], #0x1\n" + "st1 { v6.b }[4], [x7], #0x1\n" "b 123f\n" "121:" // Oddments: Bit 2: Unset - "tbz x4, #1, 122f\n" - "st1 { v15.h }[0], [x17], #0x2\n" - "st1 { v18.h }[0], [x16], #0x2\n" - "st1 { v11.h }[0], [x6], #0x2\n" - "st1 { v10.h }[0], [x8], #0x2\n" - "tbz x4, #0, 123f\n" - "st1 { v15.b }[2], [x17], #0x1\n" - "st1 { v18.b }[2], [x16], #0x1\n" - "st1 { v11.b }[2], [x6], #0x1\n" - "st1 { v10.b }[2], [x8], #0x1\n" + "tbz x0, #1, 122f\n" + "st1 { v15.h }[0], [x16], #0x2\n" + "st1 { v17.h }[0], [x8], #0x2\n" + "st1 { v10.h }[0], [x4], #0x2\n" + "st1 { v6.h }[0], [x7], #0x2\n" + "tbz x0, #0, 123f\n" + "st1 { v15.b }[2], [x16], #0x1\n" + "st1 { v17.b }[2], [x8], #0x1\n" + "st1 { v10.b }[2], [x4], #0x1\n" + "st1 { v6.b }[2], [x7], #0x1\n" "b 123f\n" "122:" // Oddments: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 123f\n" - "st1 { v15.b }[0], [x17], #0x1\n" - "st1 { v18.b }[0], [x16], #0x1\n" - "st1 { v11.b }[0], [x6], #0x1\n" - "st1 { v10.b }[0], [x8], #0x1\n" + "tbz x0, #0, 123f\n" + "st1 { v15.b }[0], [x16], #0x1\n" + "st1 { v17.b }[0], [x8], #0x1\n" + "st1 { v10.b }[0], [x4], #0x1\n" + "st1 { v6.b }[0], [x7], #0x1\n" "123:" // Oddments: Bit 2: End - "124:" // End - : : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" 
(offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp index 4e845cceaf..9b1f7c239f 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_nhwc_generic_output9_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,28 +28,23 @@ #pragma once +#if defined(__aarch64__) + namespace arm_conv { namespace depthwise { void a64_s8q_nhwc_generic_output9_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int); -struct a64_s8q_nhwc_generic_output9_mla_depthfirst +class a64_s8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<int8_t, int8_t, int8_t, int32_t> { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - constexpr static unsigned int n_output_points = 9; + KernelType kernel = a64_s8q_nhwc_generic_output9_mla_depthfirst_impl; - kern_type kernel = a64_s8q_nhwc_generic_output9_mla_depthfirst_impl; + public: + a64_s8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<int8_t, int8_t, int8_t, int32_t>(9, arm_gemm::VLType::None) {} - a64_s8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {} + KernelType get_kernel() const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp index b9fef4f9ab..5ca3ccd4bf 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -28,39 +28,34 @@ #pragma once +#if defined(__aarch64__) + namespace arm_conv { namespace depthwise { void a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); -struct a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst +struct a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t> { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - + using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 2; constexpr static unsigned int stride_cols = 2; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 4; - - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 9; - constexpr static unsigned int input_col_quads = 1; + a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) + : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols) + { + } - kern_type kernel = a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl; + arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; } - a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp index 9a3eed47fb..0641229aa7 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,6 +26,8 @@ #include <cstdint> +#if defined(__aarch64__) + #pragma once namespace arm_conv { @@ -33,34 +35,26 @@ namespace depthwise { void a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); -struct a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst +struct a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t> { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - + using Parent = DepthfirstMultiplierStrategy<int8_t, int8_t, int8_t, int32_t>; constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 4; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 8; - constexpr static unsigned int input_cols = 6; - constexpr static unsigned int input_col_quads = 1; + a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) + : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols) + { + } - kern_type kernel = a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl; + arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; } - a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp index d0ae00d260..3dad8d5604 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -28,31 +28,25 @@ #pragma once +#if defined(__aarch64__) + namespace arm_conv { namespace depthwise { void a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&); -struct a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst +struct a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<int8_t, int8_t, int8_t, int32_t> { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - constexpr static unsigned int output_rows(void) { return 2; }; - constexpr static unsigned int output_cols(void) { return 8; }; - - constexpr static unsigned int output_col_regs(void) { return 2; }; - - kern_type kernel = a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; - - a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {} + using Parent = GenericDepthfirstMultiplierKernelStrategy<int8_t, int8_t, int8_t, int32_t>; + a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) + : Parent(2, 8, arm_gemm::VLType::None) + { + } + Parent::KernelType kernel = a64_s8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp index 0fde00ba37..22b6b657a5 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
 * * SPDX-License-Identifier: MIT * @@ -34,39 +34,40 @@ namespace arm_conv { namespace depthwise { -void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); +void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *); -struct a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst +class a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t> { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; - - constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_dot::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_dot::get_packed_size; - - kern_type kernel = a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; - - a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {} + a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} + + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } + + Parent::KernelType kernel = a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + size_t get_storage_size(const DepthwiseArgs &args) const override + { + return interleave_a64_s8q_3x3_dot::get_packed_size(args); + } + + void pack_parameters( + const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, + const void *weights, size_t ld_weight_col, size_t ld_weight_row + ) const override + { + interleave_a64_s8q_3x3_dot::pack_parameters( + args.input_channels, buffer, reinterpret_cast<const int32_t *>(biases), + reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row + ); + } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp index bdbda178b3..761c7ec86e 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
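The pack_parameters() override above shows how the new virtual interface stays type-neutral: biases and weights arrive as const void * and each concrete strategy restores the real types before delegating to its interleave routine. A self-contained sketch of that bridge, with typed_pack standing in for interleave_a64_s8q_3x3_dot::pack_parameters (names and the exact parameter list are assumptions for illustration):

    #include <cstddef>
    #include <cstdint>

    // Assumed to exist elsewhere; mirrors a typed interleave entry point.
    void typed_pack(unsigned int n_channels, void *buffer, const int32_t *biases,
                    const int8_t *weights, size_t ld_col, size_t ld_row);

    struct PackerBase
    {
      virtual ~PackerBase() = default;
      // Type-erased: callers need not know the kernel's weight/bias types.
      virtual void pack_parameters(unsigned int n_channels, void *buffer,
                                   const void *biases, const void *weights,
                                   size_t ld_col, size_t ld_row) const = 0;
    };

    struct S8Packer : PackerBase
    {
      void pack_parameters(unsigned int n_channels, void *buffer,
                           const void *biases, const void *weights,
                           size_t ld_col, size_t ld_row) const override
      {
        // Restore the concrete types, exactly as the override above does.
        typed_pack(n_channels, buffer,
                   reinterpret_cast<const int32_t *>(biases),
                   reinterpret_cast<const int8_t *>(weights),
                   ld_col, ld_row);
      }
    };

This replaces the old scheme of exposing the typed packer as a constexpr static function pointer, which forced the caller to know the kernel's exact types at compile time.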
* * SPDX-License-Identifier: MIT * @@ -30,7 +30,15 @@ namespace arm_conv { namespace depthwise { -void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp) +void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl( + const unsigned int n_channels, + const int8_t *const *const inptrs, + const int8_t *params, + const int32_t *, // Bias, should be wrapped into the parameters + const arm_gemm::Requantize32& qp, + const int32_t *, const int32_t *, // Requant parameters, also wrapped + int8_t *const *const outptrs +) { __asm__ __volatile__( "ldp x15, x14, [%x[inptrs], #0x0]\n" @@ -1173,7 +1181,7 @@ void a64_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *con "34:" // End "add SP, SP, #0x80\n" : [params] "+&r" (params) - : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp index 05eddd1853..00c8a3cde2 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
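Worth noting in the operand list above: n_channels shrinks from uint64_t to unsigned int in the C++ signature, but the assembly still addresses it through a full 64-bit x register, so the operand is widened back with (long unsigned int) at the binding. A standalone AArch64 sketch of the hazard this avoids (assuming GCC/Clang extended asm; the loop body is hypothetical):

    #include <cstdint>

    uint64_t count_up(unsigned int n_channels)
    {
      uint64_t iters = 0;
      __asm__ __volatile__(
        "cbz %x[n], 2f\n"                    // %x[n] reads the 64-bit view
        "mov x20, %x[n]\n"
        "1:\n"
        "add %x[iters], %x[iters], #1\n"
        "subs x20, x20, #1\n"
        "bne 1b\n"
        "2:\n"
        : [iters] "+r" (iters)
        : [n] "r" ((long unsigned int) n_channels)  // same cast as the diff above
        : "cc", "x20");
      return iters;
    }

Without the cast, the operand is a 32-bit value and the upper half of the register it lands in is not guaranteed to be zero, which matters as soon as the assembly reads the x rather than the w view of that register.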
 * * SPDX-License-Identifier: MIT * @@ -34,39 +34,40 @@ namespace arm_conv { namespace depthwise { -void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); +void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *); -struct a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst +class a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t> { - typedef uint32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - typedef void (*kern_type)(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; - - constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_dot::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_dot::get_packed_size; - - kern_type kernel = a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; - - a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {} + a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} + + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } + + Parent::KernelType kernel = a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + size_t get_storage_size(const DepthwiseArgs &args) const override + { + return interleave_a64_u8q_3x3_dot::get_packed_size(args); + } + + void pack_parameters( + const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, + const void *weights, size_t ld_weight_col, size_t ld_weight_row + ) const override + { + interleave_a64_u8q_3x3_dot::pack_parameters( + args.input_channels, buffer, reinterpret_cast<const int32_t *>(biases), + reinterpret_cast<const uint8_t *>(weights), qp, ld_weight_col, ld_weight_row + ); + } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp index 22c584f8e7..64b305c21d 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -30,7 +30,15 @@ namespace arm_conv { namespace depthwise { -void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *const inptrs, uint8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp) +void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl( + const unsigned int n_channels, + const uint8_t *const *const inptrs, + const uint8_t *params, + const int32_t *, // Bias, should be wrapped into the parameters + const arm_gemm::Requantize32& qp, + const int32_t *, const int32_t *, // Requant parameters, also wrapped + uint8_t *const *const outptrs +) { __asm__ __volatile__( "ldp x13, x12, [%x[inptrs], #0x0]\n" @@ -1307,7 +1315,7 @@ void a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *con "34:" // End "add SP, SP, #0x80\n" : [params] "+&r" (params) - : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned) n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp index 09ba75f685..b55055fc93 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
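As with the s8qs kernel earlier, this dot-product implementation accepts the bias and per-channel requant pointers only to ignore them, since they are already interleaved into params. Keeping them in the signature means every quantized depthfirst kernel, dot or mla, shares one function-pointer type. A hedged sketch of that idea (the typedef and run_packed() wrapper are illustrative, not ACL code):

    #include <cstdint>

    namespace arm_gemm { struct Requantize32; }

    // One signature for all u8q depthfirst kernels, wrapped or not.
    using KernelType = void (*)(
      unsigned int,                    // n_channels
      const uint8_t *const *,          // input pointer array
      const uint8_t *,                 // packed parameters
      const int32_t *,                 // bias (ignored when already packed)
      const arm_gemm::Requantize32 &,  // quantisation parameters
      const int32_t *,                 // requant multipliers (ignored when packed)
      const int32_t *,                 // requant shifts (ignored when packed)
      uint8_t *const *);               // output pointer array

    void run_packed(KernelType k, unsigned int n, const uint8_t *const *inptrs,
                    const uint8_t *params, const arm_gemm::Requantize32 &qp,
                    uint8_t *const *outptrs)
    {
      // For kernels like the one above, bias and requant tables already live
      // in `params`, so null is passed for the wrapped arguments.
      k(n, inptrs, params, nullptr, qp, nullptr, nullptr, outptrs);
    }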
 * * SPDX-License-Identifier: MIT * @@ -36,37 +36,24 @@ namespace depthwise { void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); -struct a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst +class a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t> { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; - - constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_mla::get_packed_size; + a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} - kern_type kernel = a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } - a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp index b62ebb1687..453f9cfa92 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
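The rewritten assembly loops in the generic.cpp below keep the requantisation arithmetic itself but restructure its tail: accumulators are narrowed to 16 bits with sqxtn/sqxtn2 first, and the c_offset add (sqadd against v14.8h) plus the min/max clamp then operate on eight halfword lanes per register instead of four word lanes. A scalar sketch of the per-lane math, assuming the usual rounded doubling-high multiply; the exact half-way rounding that the vector code implements with its and/sshr/sqadd fixup is only approximated here, and saturation steps are elided:

    #include <algorithm>
    #include <cstdint>

    uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t right_shift,
                            int32_t c_offset, int32_t minval, int32_t maxval)
    {
      // sqrdmulh: high 32 bits of 2*acc*mul, with rounding.
      const int32_t high = (int32_t) ((2 * (int64_t) acc * mul + (1ll << 30)) >> 31);
      // srshl with a negative count acts as a rounding arithmetic shift right.
      const int32_t shifted = right_shift > 0
          ? (int32_t) (((int64_t) high + (1ll << (right_shift - 1))) >> right_shift)
          : high;
      const int32_t out = shifted + c_offset;  // now a 16-bit sqadd after sqxtn
      return (uint8_t) std::min(std::max(out, minval), maxval);  // smax/smin clamp
    }

Doing the offset and clamp after narrowing halves the number of clamp instructions per output register, which is consistent with the patch's stated aim of tightening the per-tile inner loop.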
* * SPDX-License-Identifier: MIT * @@ -46,7 +46,7 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( struct Params { long unsigned int n_channels; - const uint8_t *weights; + const void *weights; const int32_t *bias; const arm_gemm::Requantize32 *requant; const int32_t *const requant_muls; @@ -57,7 +57,7 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( Params( long unsigned int n_channels, const uint8_t *const *inptrs_raw, - const uint8_t *const weights, + const void *const weights, const int32_t *const bias, const arm_gemm::Requantize32 &qp, const int32_t *const requant_muls, @@ -91,513 +91,497 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( requant_muls, requant_shifts, outptrs); __asm__ __volatile__( + "ldr x19, [%x[params], %[offsetof_Params_requant]]\n" "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n" - "mov x17, #0x0\n" - "ldr x16, [%x[params], %[offsetof_Params_weights]]\n" + "add x24, x19, %[offsetof_Requantize32_a_offset]\n" + "add x23, x19, %[offsetof_Requantize32_b_offset]\n" + "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x21, x19, %[offsetof_Requantize32_c_offset]\n" + "add x20, x19, %[offsetof_Requantize32_minval]\n" + "ldr x17, [%x[params], %[offsetof_Params_weights]]\n" + "add x19, x19, %[offsetof_Requantize32_maxval]\n" + "ld1r { v22.16b }, [x24]\n" + "ld1r { v12.16b }, [x23]\n" + "lsr x16, x8, #0x3\n" + "ld1r { v14.8h }, [x21]\n" + "ld1r { v17.8h }, [x20]\n" "mov x15, #0x0\n" - "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" - "add x14, %x[params], %[offsetof_Params_inptrs]\n" + "mov x14, #0x0\n" + "ld1r { v15.8h }, [x19]\n" "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n" - "lsr x12, x8, #0x3\n" + "add x12, %x[params], %[offsetof_Params_inptrs]\n" "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n" - "add x19, x22, %[offsetof_Requantize32_a_offset]\n" - "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" - "add x20, x22, %[offsetof_Requantize32_b_offset]\n" - "ld1r { v14.16b }, [x19]\n" - "add x19, x22, %[offsetof_Requantize32_c_offset]\n" - "ld1r { v9.16b }, [x20]\n" - "add x20, x22, %[offsetof_Requantize32_minval]\n" - "ld1r { v15.4s }, [x19]\n" - "add x19, x22, %[offsetof_Requantize32_maxval]\n" - "ld1r { v24.4s }, [x20]\n" - "ld1r { v12.4s }, [x19]\n" - "ldp x10, x9, [x21, #0x0]\n" - "ldp x28, x27, [x21, #0x10]\n" - "cbz x12, 3f\n" - "subs x12, x12, #0x1\n" + "ldp x10, x9, [x22, #0x0]\n" + "ldp x28, x27, [x22, #0x10]\n" + "cbz x16, 3f\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "ldr q13, [x19, #0x0]\n" - "mov v17.16b, v13.16b\n" - "ldr q19, [x19, #0x10]\n" + "subs x16, x16, #0x1\n" + "mov v19.16b, v13.16b\n" + "ldr q26, [x19, #0x10]\n" "add x19, x19, #0x20\n" - "mov v16.16b, v13.16b\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" - "mov v23.16b, v13.16b\n" - "ldr d0, [x16, #0x0]\n" - "usubl v0.8h, v0.8b, v9.8b\n" - "mov v25.16b, v19.16b\n" - "ldr d1, [x16, #0x8]\n" - "mov v21.16b, v19.16b\n" - "ldr d2, [x16, #0x10]\n" - "usubl v1.8h, v1.8b, v9.8b\n" - "mov v20.16b, v19.16b\n" - "ldr d3, [x16, #0x18]\n" - "ldr d4, [x16, #0x20]\n" - "usubl v2.8h, v2.8b, v9.8b\n" - "ldr d5, [x16, #0x28]\n" - "usubl v3.8h, v3.8b, v9.8b\n" - "ldr d6, [x16, #0x30]\n" - "ldr d7, [x16, #0x38]\n" - "usubl v4.8h, v4.8b, v9.8b\n" - "ldr d8, [x16, #0x40]\n" - "usubl v5.8h, v5.8b, v9.8b\n" - "ldp x23, x22, [x14, #0x0]\n" - "usubl v6.8h, v6.8b, v9.8b\n" - "ldp x21, x20, [x14, #0x10]\n" - "usubl v7.8h, v7.8b, v9.8b\n" - "usubl v8.8h, v8.8b, v9.8b\n" - "ldr x19, [x14, #0x20]\n" - "ldr d31, [x23, x17]\n" - 
"usubl v31.8h, v31.8b, v14.8b\n" - "ldr d30, [x22, x17]\n" - "ldr d29, [x21, x17]\n" - "usubl v30.8h, v30.8b, v14.8b\n" - "ldr d28, [x20, x17]\n" - "ldr d27, [x19, x17]\n" - "usubl v29.8h, v29.8b, v14.8b\n" - "usubl v28.8h, v28.8b, v14.8b\n" - "usubl v27.8h, v27.8b, v14.8b\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v11.16b, v26.16b\n" + "mov v18.16b, v13.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v24.16b, v26.16b\n" + "mov v9.16b, v13.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v23.16b, v26.16b\n" + "usubl v0.8h, v0.8b, v12.8b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" + "usubl v1.8h, v1.8b, v12.8b\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "ldp x23, x22, [x12, #0x0]\n" + "ldp x21, x20, [x12, #0x10]\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "usubl v4.8h, v4.8b, v12.8b\n" + "ldr x19, [x12, #0x20]\n" + "ldr d31, [x23, x15]\n" + "usubl v5.8h, v5.8b, v12.8b\n" + "usubl v6.8h, v6.8b, v12.8b\n" + "ldr d30, [x22, x15]\n" + "ldr d29, [x21, x15]\n" + "usubl v7.8h, v7.8b, v12.8b\n" + "usubl v8.8h, v8.8b, v12.8b\n" + "ldr d28, [x20, x15]\n" + "ldr d27, [x19, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "usubl v27.8h, v27.8b, v22.8b\n" "beq 2f\n" "1:" // Loop "smlal v13.4s, v31.4h, v4.4h\n" - "ldr x21, [x14, #0x28]\n" - "add x16, x16, #0x48\n" - "smlal2 v19.4s, v31.8h, v4.8h\n" - "ldr x20, [x14, #0x30]\n" - "subs x12, x12, #0x1\n" - "smlal v17.4s, v31.4h, v3.4h\n" - "ldr x26, [x14, #0x38]\n" - "smlal2 v25.4s, v31.8h, v3.8h\n" - "ldr x25, [x14, #0x40]\n" - "smlal v16.4s, v31.4h, v1.4h\n" - "ldr x19, [x14, #0x48]\n" - "smlal2 v21.4s, v31.8h, v1.8h\n" - "ldr x24, [x14, #0x50]\n" - "smlal v23.4s, v31.4h, v0.4h\n" - "ldr x23, [x14, #0x58]\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ldr d31, [x21, x17]\n" - "usubl v31.8h, v31.8b, v14.8b\n" + "smlal2 v26.4s, v31.8h, v4.8h\n" + "ldr x21, [x12, #0x28]\n" + "ldr x26, [x12, #0x38]\n" + "smlal v19.4s, v31.4h, v3.4h\n" + "smlal2 v11.4s, v31.8h, v3.8h\n" + "ldr x20, [x12, #0x30]\n" + "ldr x25, [x12, #0x40]\n" "smlal v13.4s, v30.4h, v0.4h\n" - "ldr x22, [x14, #0x60]\n" - "smlal2 v19.4s, v30.8h, v0.8h\n" - "ldr d30, [x19, x17]\n" - "usubl v30.8h, v30.8b, v14.8b\n" - "smlal v17.4s, v29.4h, v2.4h\n" - "ldr x21, [x14, #0x68]\n" - "smlal2 v25.4s, v29.8h, v2.8h\n" - "ldr d29, [x20, x17]\n" - "usubl v29.8h, v29.8b, v14.8b\n" + "smlal2 v26.4s, v30.8h, v0.8h\n" + "ldr x19, [x12, #0x48]\n" + "ldr d30, [x19, x15]\n" + "smlal v19.4s, v29.4h, v2.4h\n" + "smlal2 v11.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x15]\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v24.4s, v31.8h, v1.8h\n" + "ldr x24, [x12, #0x50]\n" + "ldr x23, [x12, #0x58]\n" + "smlal v9.4s, v31.4h, v0.4h\n" + "smlal2 v23.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" "smlal v13.4s, v28.4h, v5.4h\n" - "ldr x20, [x14, #0x70]\n" - "smlal2 v19.4s, v28.8h, v5.8h\n" - "ldr x19, [x14, #0x78]\n" - "smlal v17.4s, v28.4h, v4.4h\n" - "ldr q26, [x13, #0x0]\n" - "smlal2 v25.4s, v28.8h, v4.8h\n" - "ldr q10, [x11, #0x0]\n" - "smlal v16.4s, v28.4h, v2.4h\n" - "ldr q11, [x13, #0x10]\n" - "add x13, x13, #0x20\n" - "smlal2 v21.4s, v28.8h, v2.8h\n" - "ldr q18, [x11, #0x10]\n" - "add x11, x11, #0x20\n" - "smlal v23.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "ldr d28, [x26, x17]\n" - "usubl v28.8h, v28.8b, v14.8b\n" - "smlal v16.4s, v31.4h, v6.4h\n" - "smlal2 
v21.4s, v31.8h, v6.8h\n" - "ldr d31, [x25, x17]\n" - "usubl v31.8h, v31.8b, v14.8b\n" + "smlal2 v26.4s, v28.8h, v5.8h\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "ldr x22, [x12, #0x60]\n" + "smlal v19.4s, v28.4h, v4.4h\n" + "smlal2 v11.4s, v28.8h, v4.8h\n" + "ldr x21, [x12, #0x68]\n" + "ldr x20, [x12, #0x70]\n" + "smlal v18.4s, v28.4h, v2.4h\n" + "smlal2 v24.4s, v28.8h, v2.8h\n" + "ldr x19, [x12, #0x78]\n" + "ldr q21, [x13, #0x0]\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x15]\n" + "usubl v28.8h, v28.8b, v22.8b\n" "smlal v13.4s, v27.4h, v7.4h\n" - "smlal2 v19.4s, v27.8h, v7.8h\n" - "smlal v17.4s, v27.4h, v6.4h\n" - "smlal2 v25.4s, v27.8h, v6.8h\n" - "smlal v16.4s, v27.4h, v4.4h\n" - "smlal2 v21.4s, v27.8h, v4.8h\n" - "smlal v23.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" + "smlal2 v26.4s, v27.8h, v7.8h\n" + "ldr q25, [x11, #0x0]\n" + "ldr q10, [x13, #0x10]\n" + "smlal v19.4s, v27.4h, v6.4h\n" + "smlal2 v11.4s, v27.8h, v6.8h\n" + "ldr q16, [x11, #0x10]\n" + "add x17, x17, #0x48\n" + "smlal v18.4s, v31.4h, v6.4h\n" + "smlal2 v24.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" + "subs x16, x16, #0x1\n" + "add x13, x13, #0x20\n" "smlal v13.4s, v28.4h, v1.4h\n" - "smlal2 v19.4s, v28.8h, v1.8h\n" - "smlal v23.4s, v29.4h, v8.4h\n" - "smlal2 v20.4s, v29.8h, v8.8h\n" - "ldr d29, [x24, x17]\n" - "usubl v29.8h, v29.8b, v14.8b\n" - "smlal v17.4s, v28.4h, v0.4h\n" - "smlal2 v25.4s, v28.8h, v0.8h\n" - "ldr d28, [x23, x17]\n" - "usubl v28.8h, v28.8b, v14.8b\n" + "smlal2 v26.4s, v28.8h, v1.8h\n" + "add x11, x11, #0x20\n" + "smlal v19.4s, v28.4h, v0.4h\n" + "smlal2 v11.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x15]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v24.4s, v27.8h, v4.8h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x15]\n" + "usubl v29.8h, v29.8b, v22.8b\n" "smlal v13.4s, v31.4h, v2.4h\n" - "smlal2 v19.4s, v31.8h, v2.8h\n" - "smlal v17.4s, v31.4h, v1.4h\n" - "smlal2 v25.4s, v31.8h, v1.8h\n" - "ldr d31, [x22, x17]\n" - "usubl v31.8h, v31.8b, v14.8b\n" + "smlal2 v26.4s, v31.8h, v2.8h\n" + "smlal v19.4s, v31.4h, v1.4h\n" + "smlal2 v11.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v18.4s, v30.4h, v5.4h\n" + "smlal v9.4s, v30.4h, v4.4h\n" "smlal v13.4s, v30.4h, v8.4h\n" - "smlal2 v19.4s, v30.8h, v8.8h\n" - "smlal v17.4s, v30.4h, v7.4h\n" - "smlal2 v25.4s, v30.8h, v7.8h\n" - "smlal v16.4s, v30.4h, v5.4h\n" - "smlal2 v21.4s, v30.8h, v5.8h\n" - "smlal v23.4s, v30.4h, v4.4h\n" - "smlal2 v20.4s, v30.8h, v4.8h\n" - "ldr d30, [x21, x17]\n" - "usubl v30.8h, v30.8b, v14.8b\n" + "smlal2 v26.4s, v30.8h, v8.8h\n" + "smlal v19.4s, v30.4h, v7.4h\n" + "smlal2 v11.4s, v30.8h, v7.8h\n" + "smlal2 v24.4s, v30.8h, v5.8h\n" + "smlal2 v23.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x15]\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "smlal v18.4s, v29.4h, v0.4h\n" + "smlal v9.4s, v28.4h, v2.4h\n" "smlal v13.4s, v29.4h, v3.4h\n" - "smlal2 v19.4s, v29.8h, v3.8h\n" - "smlal v16.4s, v29.4h, v0.4h\n" - "smlal2 v21.4s, v29.8h, v0.8h\n" - "ldr d29, [x20, x17]\n" - "usubl v29.8h, v29.8b, v14.8b\n" - "smlal v17.4s, v28.4h, v5.4h\n" - "smlal2 v25.4s, v28.8h, v5.8h\n" - "smlal v23.4s, v28.4h, v2.4h\n" - "smlal2 v20.4s, v28.8h, v2.8h\n" - "ldr d28, [x19, x17]\n" - "add x17, x17, #0x8\n" + "smlal2 v26.4s, v29.8h, v3.8h\n" + "smlal2 v24.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, 
x15]\n" + "smlal2 v23.4s, v28.8h, v2.8h\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v3.4h\n" + "smlal v9.4s, v30.4h, v5.4h\n" + "smlal v19.4s, v28.4h, v5.4h\n" + "smlal2 v11.4s, v28.8h, v5.8h\n" + "ldr d28, [x19, x15]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal2 v24.4s, v31.8h, v3.8h\n" + "smlal2 v23.4s, v30.8h, v5.8h\n" + "add x15, x15, #0x8\n" + "smlal v18.4s, v29.4h, v7.4h\n" + "smlal v9.4s, v29.4h, v6.4h\n" + "smlal2 v24.4s, v29.8h, v7.8h\n" + "smlal2 v23.4s, v29.8h, v6.8h\n" "smlal v13.4s, v31.4h, v6.4h\n" - "usubl v28.8h, v28.8b, v14.8b\n" - "smlal2 v19.4s, v31.8h, v6.8h\n" - "smlal v16.4s, v31.4h, v3.4h\n" - "smlal2 v21.4s, v31.8h, v3.8h\n" - "smlal v17.4s, v30.4h, v8.4h\n" - "smlal2 v25.4s, v30.8h, v8.8h\n" - "smlal v23.4s, v30.4h, v5.4h\n" - "smlal2 v20.4s, v30.8h, v5.8h\n" - "smlal v16.4s, v29.4h, v7.4h\n" - "smlal2 v21.4s, v29.8h, v7.8h\n" - "smlal v23.4s, v29.4h, v6.4h\n" - "smlal2 v20.4s, v29.8h, v6.8h\n" - "smlal v16.4s, v28.4h, v8.4h\n" - "smlal2 v21.4s, v28.8h, v8.8h\n" - "smlal v23.4s, v28.4h, v7.4h\n" - "smlal2 v20.4s, v28.8h, v7.8h\n" - "sqrdmulh v13.4s, v13.4s, v26.4s\n" - "sqrdmulh v19.4s, v19.4s, v11.4s\n" - "sqrdmulh v17.4s, v17.4s, v26.4s\n" - "sqrdmulh v25.4s, v25.4s, v11.4s\n" - "and v22.16b, v13.16b, v10.16b\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "and v28.16b, v19.16b, v18.16b\n" - "and v3.16b, v17.16b, v10.16b\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "and v6.16b, v25.16b, v18.16b\n" - "sqrdmulh v16.4s, v16.4s, v26.4s\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqrdmulh v21.4s, v21.4s, v11.4s\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sqadd v13.4s, v13.4s, v22.4s\n" - "sqrdmulh v23.4s, v23.4s, v26.4s\n" - "and v0.16b, v16.16b, v10.16b\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "srshl v13.4s, v13.4s, v10.4s\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "sqadd v17.4s, v17.4s, v3.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "and v29.16b, v21.16b, v18.16b\n" + "smlal v19.4s, v30.4h, v8.4h\n" + "sqrdmulh v13.4s, v13.4s, v21.4s\n" + "smlal v18.4s, v28.4h, v8.4h\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "sqrdmulh v19.4s, v19.4s, v21.4s\n" + "smlal2 v26.4s, v31.8h, v6.8h\n" + "smlal2 v11.4s, v30.8h, v8.8h\n" + "sqrdmulh v18.4s, v18.4s, v21.4s\n" + "smlal2 v24.4s, v28.8h, v8.8h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" + "sqrdmulh v9.4s, v9.4s, v21.4s\n" + "and v7.16b, v13.16b, v25.16b\n" + "sqrdmulh v26.4s, v26.4s, v10.4s\n" + "and v4.16b, v19.16b, v25.16b\n" + "sqrdmulh v11.4s, v11.4s, v10.4s\n" + "and v21.16b, v18.16b, v25.16b\n" + "sqrdmulh v24.4s, v24.4s, v10.4s\n" + "and v20.16b, v9.16b, v25.16b\n" + "sqrdmulh v23.4s, v23.4s, v10.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v29.16b, v26.16b, v16.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v10.16b, v11.16b, v16.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v31.16b, v24.16b, v16.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v30.16b, v23.16b, v16.16b\n" + "sqadd v13.4s, v13.4s, v7.4s\n" "sshr v29.4s, v29.4s, #0x1f\n" - "add v13.4s, v13.4s, v15.4s\n" - "srshl v19.4s, v19.4s, v18.4s\n" - "srshl v17.4s, v17.4s, v10.4s\n" - "srshl v25.4s, v25.4s, v18.4s\n" - "smin v13.4s, v13.4s, v12.4s\n" - "add v19.4s, v19.4s, v15.4s\n" - "add v17.4s, v17.4s, v15.4s\n" - "smax v13.4s, v13.4s, v24.4s\n" - "smin v19.4s, v19.4s, v12.4s\n" - "smin v17.4s, v17.4s, v12.4s\n" - "add v25.4s, v25.4s, v15.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "smin v25.4s, v25.4s, v12.4s\n" - "uzp1 v13.16b, v13.16b, v19.16b\n" - "sqadd v16.4s, v16.4s, v0.4s\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, 
v21.4s\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v20.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v25.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "srshl v19.4s, v19.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v10.4s\n" + "srshl v18.4s, v18.4s, v25.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "srshl v9.4s, v9.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "srshl v26.4s, v26.4s, v16.4s\n" + "sqxtn v13.4h, v13.4s\n" + "srshl v11.4s, v11.4s, v16.4s\n" + "sqxtn v19.4h, v19.4s\n" + "srshl v24.4s, v24.4s, v16.4s\n" + "sqxtn v18.4h, v18.4s\n" + "srshl v23.4s, v23.4s, v16.4s\n" + "sqxtn v9.4h, v9.4s\n" + "sqxtn2 v13.8h, v26.4s\n" + "sqxtn2 v19.8h, v11.4s\n" + "sqxtn2 v18.8h, v24.4s\n" + "sqxtn2 v9.8h, v23.4s\n" + "sqadd v13.8h, v13.8h, v14.8h\n" + "sqadd v19.8h, v19.8h, v14.8h\n" + "sqadd v18.8h, v18.8h, v14.8h\n" + "sqadd v9.8h, v9.8h, v14.8h\n" + "smax v13.8h, v13.8h, v17.8h\n" + "smax v19.8h, v19.8h, v17.8h\n" + "smax v18.8h, v18.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smin v13.8h, v13.8h, v15.8h\n" + "smin v19.8h, v19.8h, v15.8h\n" + "smin v18.8h, v18.8h, v15.8h\n" + "smin v9.8h, v9.8h, v15.8h\n" "uzp1 v13.16b, v13.16b, v13.16b\n" - "str d13, [x10, x15]\n" - "smax v25.4s, v25.4s, v24.4s\n" - "sqadd v21.4s, v21.4s, v29.4s\n" - "srshl v16.4s, v16.4s, v10.4s\n" - "and v3.16b, v23.16b, v10.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "uzp1 v17.16b, v17.16b, v25.16b\n" - "add v16.4s, v16.4s, v15.4s\n" - "srshl v21.4s, v21.4s, v18.4s\n" - "uzp1 v17.16b, v17.16b, v17.16b\n" - "str d17, [x9, x15]\n" - "smin v16.4s, v16.4s, v12.4s\n" - "sqrdmulh v20.4s, v20.4s, v11.4s\n" - "add v21.4s, v21.4s, v15.4s\n" - "sqadd v23.4s, v23.4s, v3.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smin v21.4s, v21.4s, v12.4s\n" - "and v25.16b, v20.16b, v18.16b\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "smax v21.4s, v21.4s, v24.4s\n" - "srshl v23.4s, v23.4s, v10.4s\n" - "uzp1 v16.16b, v16.16b, v21.16b\n" - "add v23.4s, v23.4s, v15.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "str d16, [x28, x15]\n" - "smin v23.4s, v23.4s, v12.4s\n" - "sqadd v20.4s, v20.4s, v25.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" - "srshl v20.4s, v20.4s, v18.4s\n" - "add v20.4s, v20.4s, v15.4s\n" - "smin v20.4s, v20.4s, v12.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "uzp1 v23.16b, v23.16b, v20.16b\n" - "uzp1 v23.16b, v23.16b, v23.16b\n" - "str d23, [x27, x15]\n" - "add x15, x15, #0x8\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str d13, [x10, x14]\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "str d19, [x9, x14]\n" + "str d18, [x28, x14]\n" + "str d9, [x27, x14]\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "ldr q13, [x19, #0x0]\n" - "mov v17.16b, v13.16b\n" - "ldr q19, [x19, #0x10]\n" + "add x14, x14, #0x8\n" + "ldr q26, [x19, #0x10]\n" "add x19, x19, #0x20\n" - "mov v16.16b, v13.16b\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" - "mov v23.16b, v13.16b\n" - "ldr d0, [x16, #0x0]\n" - "usubl v0.8h, v0.8b, v9.8b\n" - "mov v25.16b, v19.16b\n" - "ldr d1, [x16, #0x8]\n" - "mov v21.16b, v19.16b\n" - "ldr d2, [x16, #0x10]\n" - "usubl v1.8h, v1.8b, v9.8b\n" - "mov v20.16b, v19.16b\n" - "ldr d3, [x16, #0x18]\n" - "ldr d4, [x16, #0x20]\n" - "usubl v2.8h, v2.8b, v9.8b\n" - "ldr d5, [x16, #0x28]\n" - "usubl v3.8h, v3.8b, v9.8b\n" - "ldr d6, [x16, #0x30]\n" - "ldr d7, [x16, #0x38]\n" - "usubl v4.8h, v4.8b, v9.8b\n" - "ldr d8, [x16, #0x40]\n" - "usubl v5.8h, v5.8b, v9.8b\n" - "ldp x23, x22, [x14, #0x0]\n" - "usubl v6.8h, v6.8b, v9.8b\n" - "ldp x21, x20, [x14, #0x10]\n" - "usubl v7.8h, v7.8b, 
v9.8b\n" - "usubl v8.8h, v8.8b, v9.8b\n" - "ldr x19, [x14, #0x20]\n" - "ldr d31, [x23, x17]\n" - "usubl v31.8h, v31.8b, v14.8b\n" - "ldr d30, [x22, x17]\n" - "ldr d29, [x21, x17]\n" - "usubl v30.8h, v30.8b, v14.8b\n" - "ldr d28, [x20, x17]\n" - "ldr d27, [x19, x17]\n" - "usubl v29.8h, v29.8b, v14.8b\n" - "usubl v28.8h, v28.8b, v14.8b\n" - "usubl v27.8h, v27.8b, v14.8b\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v19.16b, v13.16b\n" + "mov v11.16b, v26.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v18.16b, v13.16b\n" + "mov v24.16b, v26.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v9.16b, v13.16b\n" + "mov v23.16b, v26.16b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" + "usubl v0.8h, v0.8b, v12.8b\n" + "usubl v1.8h, v1.8b, v12.8b\n" + "ldp x23, x22, [x12, #0x0]\n" + "ldp x21, x20, [x12, #0x10]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "ldr x19, [x12, #0x20]\n" + "ldr d31, [x23, x15]\n" + "usubl v4.8h, v4.8b, v12.8b\n" + "usubl v5.8h, v5.8b, v12.8b\n" + "ldr d30, [x22, x15]\n" + "ldr d29, [x21, x15]\n" + "usubl v6.8h, v6.8b, v12.8b\n" + "usubl v7.8h, v7.8b, v12.8b\n" + "ldr d28, [x20, x15]\n" + "ldr d27, [x19, x15]\n" + "usubl v8.8h, v8.8b, v12.8b\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "usubl v27.8h, v27.8b, v22.8b\n" "bgt 1b\n" "2:" // Tail "smlal v13.4s, v31.4h, v4.4h\n" - "ldr x21, [x14, #0x28]\n" - "tst x8, #0x7\n" - "smlal2 v19.4s, v31.8h, v4.8h\n" - "ldr x20, [x14, #0x30]\n" - "smlal v17.4s, v31.4h, v3.4h\n" - "ldr x26, [x14, #0x38]\n" - "smlal2 v25.4s, v31.8h, v3.8h\n" - "ldr x25, [x14, #0x40]\n" - "smlal v16.4s, v31.4h, v1.4h\n" - "ldr x19, [x14, #0x48]\n" - "smlal2 v21.4s, v31.8h, v1.8h\n" - "ldr x24, [x14, #0x50]\n" - "smlal v23.4s, v31.4h, v0.4h\n" - "ldr x23, [x14, #0x58]\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ldr d31, [x21, x17]\n" - "usubl v31.8h, v31.8b, v14.8b\n" + "smlal2 v26.4s, v31.8h, v4.8h\n" + "ldr x21, [x12, #0x28]\n" + "ldr x26, [x12, #0x38]\n" + "smlal v19.4s, v31.4h, v3.4h\n" + "smlal2 v11.4s, v31.8h, v3.8h\n" + "ldr x20, [x12, #0x30]\n" + "ldr x25, [x12, #0x40]\n" "smlal v13.4s, v30.4h, v0.4h\n" - "ldr x22, [x14, #0x60]\n" - "smlal2 v19.4s, v30.8h, v0.8h\n" - "ldr d30, [x19, x17]\n" - "usubl v30.8h, v30.8b, v14.8b\n" - "smlal v17.4s, v29.4h, v2.4h\n" - "ldr x21, [x14, #0x68]\n" - "smlal2 v25.4s, v29.8h, v2.8h\n" - "ldr d29, [x20, x17]\n" - "usubl v29.8h, v29.8b, v14.8b\n" + "smlal2 v26.4s, v30.8h, v0.8h\n" + "ldr x19, [x12, #0x48]\n" + "ldr d30, [x19, x15]\n" + "smlal v19.4s, v29.4h, v2.4h\n" + "smlal2 v11.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x15]\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v24.4s, v31.8h, v1.8h\n" + "ldr x24, [x12, #0x50]\n" + "ldr x23, [x12, #0x58]\n" + "smlal v9.4s, v31.4h, v0.4h\n" + "smlal2 v23.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" "smlal v13.4s, v28.4h, v5.4h\n" - "ldr x20, [x14, #0x70]\n" - "smlal2 v19.4s, v28.8h, v5.8h\n" - "ldr x19, [x14, #0x78]\n" - "smlal v17.4s, v28.4h, v4.4h\n" - "ldr q26, [x13, #0x0]\n" - "smlal2 v25.4s, v28.8h, v4.8h\n" - "ldr q10, [x11, #0x0]\n" - "smlal v16.4s, v28.4h, v2.4h\n" - "ldr q11, [x13, #0x10]\n" + "smlal2 v26.4s, v28.8h, v5.8h\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "ldr x22, [x12, #0x60]\n" + "smlal v19.4s, v28.4h, v4.4h\n" + "smlal2 v11.4s, v28.8h, v4.8h\n" + "ldr x21, [x12, #0x68]\n" + 
"ldr x20, [x12, #0x70]\n" + "smlal v18.4s, v28.4h, v2.4h\n" + "smlal2 v24.4s, v28.8h, v2.8h\n" + "ldr x19, [x12, #0x78]\n" + "ldr q21, [x13, #0x0]\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x15]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal v13.4s, v27.4h, v7.4h\n" + "smlal2 v26.4s, v27.8h, v7.8h\n" + "ldr q25, [x11, #0x0]\n" + "ldr q10, [x13, #0x10]\n" + "smlal v19.4s, v27.4h, v6.4h\n" + "smlal2 v11.4s, v27.8h, v6.8h\n" + "ldr q16, [x11, #0x10]\n" + "tst x8, #0x7\n" + "smlal v18.4s, v31.4h, v6.4h\n" + "smlal2 v24.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" "add x13, x13, #0x20\n" - "smlal2 v21.4s, v28.8h, v2.8h\n" - "ldr q18, [x11, #0x10]\n" "add x11, x11, #0x20\n" - "smlal v23.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "ldr d28, [x26, x17]\n" - "usubl v28.8h, v28.8b, v14.8b\n" - "smlal v16.4s, v31.4h, v6.4h\n" - "smlal2 v21.4s, v31.8h, v6.8h\n" - "ldr d31, [x25, x17]\n" - "usubl v31.8h, v31.8b, v14.8b\n" - "smlal v13.4s, v27.4h, v7.4h\n" - "smlal2 v19.4s, v27.8h, v7.8h\n" - "smlal v17.4s, v27.4h, v6.4h\n" - "smlal2 v25.4s, v27.8h, v6.8h\n" - "smlal v16.4s, v27.4h, v4.4h\n" - "smlal2 v21.4s, v27.8h, v4.8h\n" - "smlal v23.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" "smlal v13.4s, v28.4h, v1.4h\n" - "smlal2 v19.4s, v28.8h, v1.8h\n" - "smlal v23.4s, v29.4h, v8.4h\n" - "smlal2 v20.4s, v29.8h, v8.8h\n" - "ldr d29, [x24, x17]\n" - "usubl v29.8h, v29.8b, v14.8b\n" - "smlal v17.4s, v28.4h, v0.4h\n" - "smlal2 v25.4s, v28.8h, v0.8h\n" - "ldr d28, [x23, x17]\n" - "usubl v28.8h, v28.8b, v14.8b\n" + "smlal2 v26.4s, v28.8h, v1.8h\n" + "smlal v19.4s, v28.4h, v0.4h\n" + "smlal2 v11.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x15]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v24.4s, v27.8h, v4.8h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x15]\n" + "usubl v29.8h, v29.8b, v22.8b\n" "smlal v13.4s, v31.4h, v2.4h\n" - "smlal2 v19.4s, v31.8h, v2.8h\n" - "smlal v17.4s, v31.4h, v1.4h\n" - "smlal2 v25.4s, v31.8h, v1.8h\n" - "ldr d31, [x22, x17]\n" - "usubl v31.8h, v31.8b, v14.8b\n" + "smlal2 v26.4s, v31.8h, v2.8h\n" + "smlal v19.4s, v31.4h, v1.4h\n" + "smlal2 v11.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v18.4s, v30.4h, v5.4h\n" + "smlal v9.4s, v30.4h, v4.4h\n" "smlal v13.4s, v30.4h, v8.4h\n" - "smlal2 v19.4s, v30.8h, v8.8h\n" - "smlal v17.4s, v30.4h, v7.4h\n" - "smlal2 v25.4s, v30.8h, v7.8h\n" - "smlal v16.4s, v30.4h, v5.4h\n" - "smlal2 v21.4s, v30.8h, v5.8h\n" - "smlal v23.4s, v30.4h, v4.4h\n" - "smlal2 v20.4s, v30.8h, v4.8h\n" - "ldr d30, [x21, x17]\n" - "usubl v30.8h, v30.8b, v14.8b\n" + "smlal2 v26.4s, v30.8h, v8.8h\n" + "smlal v19.4s, v30.4h, v7.4h\n" + "smlal2 v11.4s, v30.8h, v7.8h\n" + "smlal2 v24.4s, v30.8h, v5.8h\n" + "smlal2 v23.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x15]\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "smlal v18.4s, v29.4h, v0.4h\n" + "smlal v9.4s, v28.4h, v2.4h\n" "smlal v13.4s, v29.4h, v3.4h\n" - "smlal2 v19.4s, v29.8h, v3.8h\n" - "smlal v16.4s, v29.4h, v0.4h\n" - "smlal2 v21.4s, v29.8h, v0.8h\n" - "ldr d29, [x20, x17]\n" - "usubl v29.8h, v29.8b, v14.8b\n" - "smlal v17.4s, v28.4h, v5.4h\n" - "smlal2 v25.4s, v28.8h, v5.8h\n" - "smlal v23.4s, v28.4h, v2.4h\n" - "smlal2 v20.4s, v28.8h, v2.8h\n" - "ldr d28, [x19, x17]\n" - "add x17, x17, #0x8\n" + "smlal2 v26.4s, v29.8h, v3.8h\n" + "smlal2 
v24.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, x15]\n" + "smlal2 v23.4s, v28.8h, v2.8h\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v3.4h\n" + "smlal v9.4s, v30.4h, v5.4h\n" + "smlal v19.4s, v28.4h, v5.4h\n" + "smlal2 v11.4s, v28.8h, v5.8h\n" + "ldr d28, [x19, x15]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal2 v24.4s, v31.8h, v3.8h\n" + "smlal2 v23.4s, v30.8h, v5.8h\n" + "add x15, x15, #0x8\n" + "smlal v18.4s, v29.4h, v7.4h\n" + "smlal v9.4s, v29.4h, v6.4h\n" + "smlal2 v24.4s, v29.8h, v7.8h\n" + "smlal2 v23.4s, v29.8h, v6.8h\n" "smlal v13.4s, v31.4h, v6.4h\n" - "usubl v28.8h, v28.8b, v14.8b\n" - "smlal2 v19.4s, v31.8h, v6.8h\n" - "smlal v16.4s, v31.4h, v3.4h\n" - "smlal2 v21.4s, v31.8h, v3.8h\n" - "smlal v17.4s, v30.4h, v8.4h\n" - "smlal2 v25.4s, v30.8h, v8.8h\n" - "smlal v23.4s, v30.4h, v5.4h\n" - "smlal2 v20.4s, v30.8h, v5.8h\n" - "smlal v16.4s, v29.4h, v7.4h\n" - "smlal2 v21.4s, v29.8h, v7.8h\n" - "smlal v23.4s, v29.4h, v6.4h\n" - "smlal2 v20.4s, v29.8h, v6.8h\n" - "smlal v16.4s, v28.4h, v8.4h\n" - "smlal2 v21.4s, v28.8h, v8.8h\n" - "smlal v23.4s, v28.4h, v7.4h\n" - "smlal2 v20.4s, v28.8h, v7.8h\n" - "sqrdmulh v13.4s, v13.4s, v26.4s\n" - "sqrdmulh v19.4s, v19.4s, v11.4s\n" - "sqrdmulh v17.4s, v17.4s, v26.4s\n" - "sqrdmulh v25.4s, v25.4s, v11.4s\n" - "and v22.16b, v13.16b, v10.16b\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "and v28.16b, v19.16b, v18.16b\n" - "and v3.16b, v17.16b, v10.16b\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "and v6.16b, v25.16b, v18.16b\n" - "sqrdmulh v16.4s, v16.4s, v26.4s\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqrdmulh v21.4s, v21.4s, v11.4s\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sqadd v13.4s, v13.4s, v22.4s\n" - "sqrdmulh v23.4s, v23.4s, v26.4s\n" - "and v0.16b, v16.16b, v10.16b\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "srshl v13.4s, v13.4s, v10.4s\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "sqadd v17.4s, v17.4s, v3.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "and v29.16b, v21.16b, v18.16b\n" + "smlal v19.4s, v30.4h, v8.4h\n" + "sqrdmulh v13.4s, v13.4s, v21.4s\n" + "smlal v18.4s, v28.4h, v8.4h\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "sqrdmulh v19.4s, v19.4s, v21.4s\n" + "smlal2 v26.4s, v31.8h, v6.8h\n" + "smlal2 v11.4s, v30.8h, v8.8h\n" + "sqrdmulh v18.4s, v18.4s, v21.4s\n" + "smlal2 v24.4s, v28.8h, v8.8h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" + "sqrdmulh v9.4s, v9.4s, v21.4s\n" + "and v7.16b, v13.16b, v25.16b\n" + "sqrdmulh v26.4s, v26.4s, v10.4s\n" + "and v4.16b, v19.16b, v25.16b\n" + "sqrdmulh v11.4s, v11.4s, v10.4s\n" + "and v21.16b, v18.16b, v25.16b\n" + "sqrdmulh v24.4s, v24.4s, v10.4s\n" + "and v20.16b, v9.16b, v25.16b\n" + "sqrdmulh v23.4s, v23.4s, v10.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v29.16b, v26.16b, v16.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v10.16b, v11.16b, v16.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v31.16b, v24.16b, v16.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v30.16b, v23.16b, v16.16b\n" + "sqadd v13.4s, v13.4s, v7.4s\n" "sshr v29.4s, v29.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sshr v10.4s, 
v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v20.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v25.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "srshl v19.4s, v19.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v10.4s\n" + "srshl v18.4s, v18.4s, v25.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "srshl v9.4s, v9.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "srshl v26.4s, v26.4s, v16.4s\n" + "sqxtn v13.4h, v13.4s\n" + "srshl v11.4s, v11.4s, v16.4s\n" + "sqxtn v19.4h, v19.4s\n" + "srshl v24.4s, v24.4s, v16.4s\n" + "sqxtn v18.4h, v18.4s\n" + "srshl v23.4s, v23.4s, v16.4s\n" + "sqxtn v9.4h, v9.4s\n" + "sqxtn2 v13.8h, v26.4s\n" + "sqxtn2 v19.8h, v11.4s\n" + "sqxtn2 v18.8h, v24.4s\n" + "sqxtn2 v9.8h, v23.4s\n" + "sqadd v13.8h, v13.8h, v14.8h\n" + "sqadd v19.8h, v19.8h, v14.8h\n" + "sqadd v18.8h, v18.8h, v14.8h\n" + "sqadd v9.8h, v9.8h, v14.8h\n" + "smax v13.8h, v13.8h, v17.8h\n" + "smax v19.8h, v19.8h, v17.8h\n" + "smax v18.8h, v18.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smin v13.8h, v13.8h, v15.8h\n" + "smin v19.8h, v19.8h, v15.8h\n" + "smin v18.8h, v18.8h, v15.8h\n" + "smin v9.8h, v9.8h, v15.8h\n" "uzp1 v13.16b, v13.16b, v13.16b\n" - "str d13, [x10, x15]\n" - "smax v25.4s, v25.4s, v24.4s\n" - "sqadd v21.4s, v21.4s, v29.4s\n" - "srshl v16.4s, v16.4s, v10.4s\n" - "and v3.16b, v23.16b, v10.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "uzp1 v17.16b, v17.16b, v25.16b\n" - "add v16.4s, v16.4s, v15.4s\n" - "srshl v21.4s, v21.4s, v18.4s\n" - "uzp1 v17.16b, v17.16b, v17.16b\n" - "str d17, [x9, x15]\n" - "smin v16.4s, v16.4s, v12.4s\n" - "sqrdmulh v20.4s, v20.4s, v11.4s\n" - "add v21.4s, v21.4s, v15.4s\n" - "sqadd v23.4s, v23.4s, v3.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "smin v21.4s, v21.4s, v12.4s\n" - "and v25.16b, v20.16b, v18.16b\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "smax v21.4s, v21.4s, v24.4s\n" - "srshl v23.4s, v23.4s, v10.4s\n" - "uzp1 v16.16b, v16.16b, v21.16b\n" - "add v23.4s, v23.4s, v15.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "str d16, [x28, x15]\n" - "smin v23.4s, v23.4s, v12.4s\n" - "sqadd v20.4s, v20.4s, v25.4s\n" - "smax v23.4s, v23.4s, v24.4s\n" - "srshl v20.4s, v20.4s, v18.4s\n" - "add v20.4s, v20.4s, v15.4s\n" - "smin v20.4s, v20.4s, v12.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "uzp1 v23.16b, v23.16b, v20.16b\n" - "uzp1 v23.16b, v23.16b, v23.16b\n" - "str d23, [x27, x15]\n" - "add x15, x15, #0x8\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str d13, [x10, x14]\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "str d19, [x9, x14]\n" + "str d18, [x28, x14]\n" + "str d9, [x27, x14]\n" + "add x14, x14, #0x8\n" "beq 64f\n" - "add x16, x16, #0x48\n" + "add x17, x17, #0x48\n" "3:" // Oddments "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "tbz x8, #2, 5f\n" "ld1 { v13.4s }, [x19], #0x10\n" "tbz x8, #1, 4f\n" - "ld1 { v19.d }[0], [x19], #0x8\n" + "ld1 { v26.d }[0], [x19], #0x8\n" "tbz x8, #0, 7f\n" - "ld1 { v19.s }[2], [x19]\n" + "ld1 { v26.s }[2], [x19]\n" "b 7f\n" "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset "tbz x8, #0, 7f\n" - "ld1 { v19.s }[0], [x19]\n" + "ld1 { v26.s }[0], [x19]\n" "b 7f\n" "5:" // Oddments: Load bias: Bit 2: Unset "tbz x8, #1, 6f\n" @@ -609,38 +593,38 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 7f\n" "ld1 { v13.s }[0], [x19]\n" "7:" // Oddments: Load bias: Bit 2: End - "mov v17.16b, v13.16b\n" - "ldr d0, [x16, #0x0]\n" - "mov v25.16b, v19.16b\n" - "ldr d1, [x16, #0x8]\n" - "mov v16.16b, v13.16b\n" - "ldr d2, [x16, #0x10]\n" - 
"mov v21.16b, v19.16b\n" - "ldr d3, [x16, #0x18]\n" - "mov v23.16b, v13.16b\n" - "ldr d4, [x16, #0x20]\n" - "usubl v0.8h, v0.8b, v9.8b\n" - "mov v20.16b, v19.16b\n" - "ldr d5, [x16, #0x28]\n" - "usubl v1.8h, v1.8b, v9.8b\n" - "ldr d6, [x16, #0x30]\n" - "usubl v2.8h, v2.8b, v9.8b\n" - "ldr d7, [x16, #0x38]\n" - "usubl v3.8h, v3.8b, v9.8b\n" - "ldr d8, [x16, #0x40]\n" - "usubl v4.8h, v4.8b, v9.8b\n" - "ldp x23, x22, [x14, #0x0]\n" - "usubl v5.8h, v5.8b, v9.8b\n" - "ldp x21, x20, [x14, #0x10]\n" - "usubl v6.8h, v6.8b, v9.8b\n" - "usubl v7.8h, v7.8b, v9.8b\n" - "ldr x19, [x14, #0x20]\n" - "usubl v8.8h, v8.8b, v9.8b\n" - "add x23, x23, x17\n" - "add x22, x22, x17\n" - "add x21, x21, x17\n" - "add x20, x20, x17\n" - "add x19, x19, x17\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "mov v19.16b, v13.16b\n" + "mov v11.16b, v26.16b\n" + "ldr d2, [x17, #0x10]\n" + "ldr d3, [x17, #0x18]\n" + "mov v18.16b, v13.16b\n" + "mov v24.16b, v26.16b\n" + "ldr d4, [x17, #0x20]\n" + "ldr d5, [x17, #0x28]\n" + "mov v9.16b, v13.16b\n" + "mov v23.16b, v26.16b\n" + "ldr d6, [x17, #0x30]\n" + "ldr d7, [x17, #0x38]\n" + "usubl v0.8h, v0.8b, v12.8b\n" + "usubl v1.8h, v1.8b, v12.8b\n" + "ldr d8, [x17, #0x40]\n" + "ldp x23, x22, [x12, #0x0]\n" + "usubl v2.8h, v2.8b, v12.8b\n" + "usubl v3.8h, v3.8b, v12.8b\n" + "ldp x21, x20, [x12, #0x10]\n" + "ldr x19, [x12, #0x20]\n" + "usubl v4.8h, v4.8b, v12.8b\n" + "usubl v5.8h, v5.8b, v12.8b\n" + "usubl v6.8h, v6.8b, v12.8b\n" + "usubl v7.8h, v7.8b, v12.8b\n" + "usubl v8.8h, v8.8b, v12.8b\n" + "add x23, x23, x15\n" + "add x22, x22, x15\n" + "add x21, x21, x15\n" + "add x20, x20, x15\n" + "add x19, x19, x15\n" "tbz x8, #2, 9f\n" "ld1 { v31.s }[0], [x23], #0x4\n" "ld1 { v30.s }[0], [x22], #0x4\n" @@ -690,33 +674,33 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "ld1 { v28.b }[0], [x20]\n" "ld1 { v27.b }[0], [x19]\n" "11:" // Oddments: Initial loads: Bit 2: End - "ldr x21, [x14, #0x28]\n" - "usubl v31.8h, v31.8b, v14.8b\n" + "usubl v31.8h, v31.8b, v22.8b\n" "smlal v13.4s, v31.4h, v4.4h\n" - "usubl v30.8h, v30.8b, v14.8b\n" - "smlal2 v19.4s, v31.8h, v4.8h\n" - "usubl v29.8h, v29.8b, v14.8b\n" - "smlal v17.4s, v31.4h, v3.4h\n" - "usubl v28.8h, v28.8b, v14.8b\n" - "smlal2 v25.4s, v31.8h, v3.8h\n" - "usubl v27.8h, v27.8b, v14.8b\n" - "smlal v16.4s, v31.4h, v1.4h\n" - "add x21, x21, x17\n" - "smlal2 v21.4s, v31.8h, v1.8h\n" - "smlal v23.4s, v31.4h, v0.4h\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" + "smlal2 v26.4s, v31.8h, v4.8h\n" + "ldr x21, [x12, #0x28]\n" + "smlal v19.4s, v31.4h, v3.4h\n" + "smlal2 v11.4s, v31.8h, v3.8h\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "add x21, x21, x15\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v24.4s, v31.8h, v1.8h\n" + "smlal v9.4s, v31.4h, v0.4h\n" + "smlal2 v23.4s, v31.8h, v0.8h\n" + "usubl v28.8h, v28.8b, v22.8b\n" "smlal v13.4s, v30.4h, v0.4h\n" - "smlal2 v19.4s, v30.8h, v0.8h\n" - "smlal v17.4s, v29.4h, v2.4h\n" - "smlal2 v25.4s, v29.8h, v2.8h\n" + "smlal2 v26.4s, v30.8h, v0.8h\n" + "usubl v27.8h, v27.8b, v22.8b\n" + "smlal v19.4s, v29.4h, v2.4h\n" + "smlal2 v11.4s, v29.8h, v2.8h\n" "smlal v13.4s, v28.4h, v5.4h\n" - "smlal2 v19.4s, v28.8h, v5.8h\n" - "smlal v17.4s, v28.4h, v4.4h\n" - "smlal2 v25.4s, v28.8h, v4.8h\n" - "smlal v16.4s, v28.4h, v2.4h\n" - "smlal2 v21.4s, v28.8h, v2.8h\n" - "smlal v23.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" + "smlal2 v26.4s, v28.8h, v5.8h\n" + "smlal v19.4s, v28.4h, v4.4h\n" + "smlal2 v11.4s, v28.8h, v4.8h\n" + "smlal v18.4s, v28.4h, v2.4h\n" + "smlal2 
v24.4s, v28.8h, v2.8h\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" "tbz x8, #2, 13f\n" "ld1 { v31.s }[0], [x21], #0x4\n" "tbz x8, #1, 12f\n" @@ -738,19 +722,19 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 15f\n" "ld1 { v31.b }[0], [x21]\n" "15:" // Oddments: Load (3, 0): Bit 2: End + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v6.4h\n" + "smlal2 v24.4s, v31.8h, v6.8h\n" + "ldr x20, [x12, #0x30]\n" "smlal v13.4s, v27.4h, v7.4h\n" - "ldr x20, [x14, #0x30]\n" - "usubl v31.8h, v31.8b, v14.8b\n" - "smlal2 v19.4s, v27.8h, v7.8h\n" - "smlal v17.4s, v27.4h, v6.4h\n" - "add x20, x20, x17\n" - "smlal2 v25.4s, v27.8h, v6.8h\n" - "smlal v23.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" - "smlal v16.4s, v31.4h, v6.4h\n" - "smlal2 v21.4s, v31.8h, v6.8h\n" - "smlal v16.4s, v27.4h, v4.4h\n" - "smlal2 v21.4s, v27.8h, v4.8h\n" + "smlal2 v26.4s, v27.8h, v7.8h\n" + "add x20, x20, x15\n" + "smlal v19.4s, v27.4h, v6.4h\n" + "smlal2 v11.4s, v27.8h, v6.8h\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal2 v24.4s, v27.8h, v4.8h\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" "tbz x8, #2, 17f\n" "ld1 { v29.s }[0], [x20], #0x4\n" "tbz x8, #1, 16f\n" @@ -772,11 +756,11 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 19f\n" "ld1 { v29.b }[0], [x20]\n" "19:" // Oddments: Load (3, 3): Bit 2: End - "ldr x26, [x14, #0x38]\n" - "usubl v29.8h, v29.8b, v14.8b\n" - "smlal v23.4s, v29.4h, v8.4h\n" - "smlal2 v20.4s, v29.8h, v8.8h\n" - "add x26, x26, x17\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr x26, [x12, #0x38]\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "add x26, x26, x15\n" "tbz x8, #2, 21f\n" "ld1 { v28.s }[0], [x26], #0x4\n" "tbz x8, #1, 20f\n" @@ -798,13 +782,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 23f\n" "ld1 { v28.b }[0], [x26]\n" "23:" // Oddments: Load (0, 1): Bit 2: End - "ldr x25, [x14, #0x40]\n" - "usubl v28.8h, v28.8b, v14.8b\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "ldr x25, [x12, #0x40]\n" "smlal v13.4s, v28.4h, v1.4h\n" - "smlal2 v19.4s, v28.8h, v1.8h\n" - "add x25, x25, x17\n" - "smlal v17.4s, v28.4h, v0.4h\n" - "smlal2 v25.4s, v28.8h, v0.8h\n" + "smlal2 v26.4s, v28.8h, v1.8h\n" + "smlal v19.4s, v28.4h, v0.4h\n" + "smlal2 v11.4s, v28.8h, v0.8h\n" + "add x25, x25, x15\n" "tbz x8, #2, 25f\n" "ld1 { v31.s }[0], [x25], #0x4\n" "tbz x8, #1, 24f\n" @@ -826,13 +810,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 27f\n" "ld1 { v31.b }[0], [x25]\n" "27:" // Oddments: Load (0, 2): Bit 2: End - "ldr x19, [x14, #0x48]\n" - "usubl v31.8h, v31.8b, v14.8b\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "ldr x19, [x12, #0x48]\n" "smlal v13.4s, v31.4h, v2.4h\n" - "smlal2 v19.4s, v31.8h, v2.8h\n" - "add x19, x19, x17\n" - "smlal v17.4s, v31.4h, v1.4h\n" - "smlal2 v25.4s, v31.8h, v1.8h\n" + "smlal2 v26.4s, v31.8h, v2.8h\n" + "smlal v19.4s, v31.4h, v1.4h\n" + "smlal2 v11.4s, v31.8h, v1.8h\n" + "add x19, x19, x15\n" "tbz x8, #2, 29f\n" "ld1 { v30.s }[0], [x19], #0x4\n" "tbz x8, #1, 28f\n" @@ -854,17 +838,17 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 31f\n" "ld1 { v30.b }[0], [x19]\n" "31:" // Oddments: Load (2, 2): Bit 2: End - "ldr x24, [x14, #0x50]\n" - "usubl v30.8h, v30.8b, v14.8b\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "ldr x24, [x12, #0x50]\n" "smlal v13.4s, v30.4h, v8.4h\n" - "smlal2 v19.4s, v30.8h, v8.8h\n" - "add x24, x24, x17\n" - "smlal v17.4s, v30.4h, v7.4h\n" - "smlal2 v25.4s, v30.8h, 
v7.8h\n" - "smlal v16.4s, v30.4h, v5.4h\n" - "smlal2 v21.4s, v30.8h, v5.8h\n" - "smlal v23.4s, v30.4h, v4.4h\n" - "smlal2 v20.4s, v30.8h, v4.8h\n" + "smlal2 v26.4s, v30.8h, v8.8h\n" + "smlal v19.4s, v30.4h, v7.4h\n" + "smlal2 v11.4s, v30.8h, v7.8h\n" + "add x24, x24, x15\n" + "smlal v18.4s, v30.4h, v5.4h\n" + "smlal2 v24.4s, v30.8h, v5.8h\n" + "smlal v9.4s, v30.4h, v4.4h\n" + "smlal2 v23.4s, v30.8h, v4.8h\n" "tbz x8, #2, 33f\n" "ld1 { v29.s }[0], [x24], #0x4\n" "tbz x8, #1, 32f\n" @@ -886,13 +870,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 35f\n" "ld1 { v29.b }[0], [x24]\n" "35:" // Oddments: Load (1, 0): Bit 2: End - "ldr x23, [x14, #0x58]\n" - "usubl v29.8h, v29.8b, v14.8b\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr x23, [x12, #0x58]\n" "smlal v13.4s, v29.4h, v3.4h\n" - "smlal2 v19.4s, v29.8h, v3.8h\n" - "add x23, x23, x17\n" - "smlal v16.4s, v29.4h, v0.4h\n" - "smlal2 v21.4s, v29.8h, v0.8h\n" + "smlal2 v26.4s, v29.8h, v3.8h\n" + "smlal v18.4s, v29.4h, v0.4h\n" + "smlal2 v24.4s, v29.8h, v0.8h\n" + "add x23, x23, x15\n" "tbz x8, #2, 37f\n" "ld1 { v28.s }[0], [x23], #0x4\n" "tbz x8, #1, 36f\n" @@ -914,13 +898,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 39f\n" "ld1 { v28.b }[0], [x23]\n" "39:" // Oddments: Load (1, 3): Bit 2: End - "ldr x22, [x14, #0x60]\n" - "usubl v28.8h, v28.8b, v14.8b\n" - "smlal v17.4s, v28.4h, v5.4h\n" - "smlal2 v25.4s, v28.8h, v5.8h\n" - "add x22, x22, x17\n" - "smlal v23.4s, v28.4h, v2.4h\n" - "smlal2 v20.4s, v28.8h, v2.8h\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "ldr x22, [x12, #0x60]\n" + "smlal v19.4s, v28.4h, v5.4h\n" + "smlal2 v11.4s, v28.8h, v5.8h\n" + "smlal v9.4s, v28.4h, v2.4h\n" + "smlal2 v23.4s, v28.8h, v2.8h\n" + "add x22, x22, x15\n" "tbz x8, #2, 41f\n" "ld1 { v31.s }[0], [x22], #0x4\n" "tbz x8, #1, 40f\n" @@ -942,13 +926,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 43f\n" "ld1 { v31.b }[0], [x22]\n" "43:" // Oddments: Load (2, 0): Bit 2: End - "ldr x21, [x14, #0x68]\n" - "usubl v31.8h, v31.8b, v14.8b\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "ldr x21, [x12, #0x68]\n" "smlal v13.4s, v31.4h, v6.4h\n" - "smlal2 v19.4s, v31.8h, v6.8h\n" - "add x21, x21, x17\n" - "smlal v16.4s, v31.4h, v3.4h\n" - "smlal2 v21.4s, v31.8h, v3.8h\n" + "smlal2 v26.4s, v31.8h, v6.8h\n" + "smlal v18.4s, v31.4h, v3.4h\n" + "smlal2 v24.4s, v31.8h, v3.8h\n" + "add x21, x21, x15\n" "tbz x8, #2, 45f\n" "ld1 { v30.s }[0], [x21], #0x4\n" "tbz x8, #1, 44f\n" @@ -970,13 +954,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 47f\n" "ld1 { v30.b }[0], [x21]\n" "47:" // Oddments: Load (2, 3): Bit 2: End - "ldr x20, [x14, #0x70]\n" - "usubl v30.8h, v30.8b, v14.8b\n" - "smlal v17.4s, v30.4h, v8.4h\n" - "smlal2 v25.4s, v30.8h, v8.8h\n" - "add x20, x20, x17\n" - "smlal v23.4s, v30.4h, v5.4h\n" - "smlal2 v20.4s, v30.8h, v5.8h\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "ldr x20, [x12, #0x70]\n" + "smlal v19.4s, v30.4h, v8.4h\n" + "smlal2 v11.4s, v30.8h, v8.8h\n" + "smlal v9.4s, v30.4h, v5.4h\n" + "smlal2 v23.4s, v30.8h, v5.8h\n" + "add x20, x20, x15\n" "tbz x8, #2, 49f\n" "ld1 { v29.s }[0], [x20], #0x4\n" "tbz x8, #1, 48f\n" @@ -998,13 +982,13 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 51f\n" "ld1 { v29.b }[0], [x20]\n" "51:" // Oddments: Load (3, 1): Bit 2: End - "ldr x19, [x14, #0x78]\n" - "usubl v29.8h, v29.8b, v14.8b\n" - "smlal v16.4s, v29.4h, v7.4h\n" - "smlal2 v21.4s, v29.8h, v7.8h\n" - "add x19, x19, x17\n" - "smlal v23.4s, v29.4h, v6.4h\n" - "smlal2 
v20.4s, v29.8h, v6.8h\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr x19, [x12, #0x78]\n" + "smlal v18.4s, v29.4h, v7.4h\n" + "smlal2 v24.4s, v29.8h, v7.8h\n" + "smlal v9.4s, v29.4h, v6.4h\n" + "smlal2 v23.4s, v29.8h, v6.8h\n" + "add x19, x19, x15\n" "tbz x8, #2, 53f\n" "ld1 { v28.s }[0], [x19], #0x4\n" "tbz x8, #1, 52f\n" @@ -1026,160 +1010,150 @@ void a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 55f\n" "ld1 { v28.b }[0], [x19]\n" "55:" // Oddments: Load (3, 2): Bit 2: End - "usubl v28.8h, v28.8b, v14.8b\n" - "smlal v16.4s, v28.4h, v8.4h\n" - "smlal2 v21.4s, v28.8h, v8.8h\n" - "smlal v23.4s, v28.4h, v7.4h\n" - "smlal2 v20.4s, v28.8h, v7.8h\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal v18.4s, v28.4h, v8.4h\n" + "smlal2 v24.4s, v28.8h, v8.8h\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" "tbz x8, #2, 57f\n" - "ld1 { v26.4s }, [x13], #0x10\n" - "ld1 { v10.4s }, [x11], #0x10\n" + "ld1 { v21.4s }, [x13], #0x10\n" + "ld1 { v25.4s }, [x11], #0x10\n" "tbz x8, #1, 56f\n" - "ld1 { v11.d }[0], [x13], #0x8\n" - "ld1 { v18.d }[0], [x11], #0x8\n" + "ld1 { v10.d }[0], [x13], #0x8\n" + "ld1 { v16.d }[0], [x11], #0x8\n" "tbz x8, #0, 59f\n" - "ld1 { v11.s }[2], [x13]\n" - "ld1 { v18.s }[2], [x11]\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v16.s }[2], [x11]\n" "b 59f\n" "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset "tbz x8, #0, 59f\n" - "ld1 { v11.s }[0], [x13]\n" - "ld1 { v18.s }[0], [x11]\n" + "ld1 { v10.s }[0], [x13]\n" + "ld1 { v16.s }[0], [x11]\n" "b 59f\n" "57:" // Oddments: Load requant params: Bit 2: Unset "tbz x8, #1, 58f\n" - "ld1 { v26.d }[0], [x13], #0x8\n" - "ld1 { v10.d }[0], [x11], #0x8\n" + "ld1 { v21.d }[0], [x13], #0x8\n" + "ld1 { v25.d }[0], [x11], #0x8\n" "tbz x8, #0, 59f\n" - "ld1 { v26.s }[2], [x13]\n" - "ld1 { v10.s }[2], [x11]\n" + "ld1 { v21.s }[2], [x13]\n" + "ld1 { v25.s }[2], [x11]\n" "b 59f\n" "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset "tbz x8, #0, 59f\n" - "ld1 { v26.s }[0], [x13]\n" - "ld1 { v10.s }[0], [x11]\n" + "ld1 { v21.s }[0], [x13]\n" + "ld1 { v25.s }[0], [x11]\n" "59:" // Oddments: Load requant params: Bit 2: End - "sqrdmulh v13.4s, v13.4s, v26.4s\n" - "add x10, x10, x15\n" - "sqrdmulh v19.4s, v19.4s, v11.4s\n" - "add x9, x9, x15\n" - "sqrdmulh v17.4s, v17.4s, v26.4s\n" - "add x28, x28, x15\n" - "sqrdmulh v25.4s, v25.4s, v11.4s\n" - "add x27, x27, x15\n" - "sqrdmulh v16.4s, v16.4s, v26.4s\n" - "and v22.16b, v13.16b, v10.16b\n" - "sshr v22.4s, v22.4s, #0x1f\n" - "and v28.16b, v19.16b, v18.16b\n" - "and v3.16b, v17.16b, v10.16b\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "and v6.16b, v25.16b, v18.16b\n" - "and v0.16b, v16.16b, v10.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqrdmulh v21.4s, v21.4s, v11.4s\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sqadd v13.4s, v13.4s, v22.4s\n" - "sqrdmulh v23.4s, v23.4s, v26.4s\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "sqrdmulh v20.4s, v20.4s, v11.4s\n" - "sqadd v19.4s, v19.4s, v28.4s\n" - "sqadd v17.4s, v17.4s, v3.4s\n" - "srshl v13.4s, v13.4s, v10.4s\n" - "sqadd v25.4s, v25.4s, v6.4s\n" - "srshl v19.4s, v19.4s, v18.4s\n" - "srshl v17.4s, v17.4s, v10.4s\n" - "add v13.4s, v13.4s, v15.4s\n" - "srshl v25.4s, v25.4s, v18.4s\n" - "add v19.4s, v19.4s, v15.4s\n" - "smin v13.4s, v13.4s, v12.4s\n" - "add v17.4s, v17.4s, v15.4s\n" - "smin v19.4s, v19.4s, v12.4s\n" - "smax v13.4s, v13.4s, v24.4s\n" - "smin v17.4s, v17.4s, v12.4s\n" - "smax v19.4s, v19.4s, v24.4s\n" - "add v25.4s, v25.4s, v15.4s\n" - "smax v17.4s, v17.4s, v24.4s\n" - "uzp1 v13.16b, v13.16b, v19.16b\n" - "smin v25.4s, 
v25.4s, v12.4s\n" - "uzp1 v13.16b, v13.16b, v13.16b\n" - "sqadd v16.4s, v16.4s, v0.4s\n" - "smax v25.4s, v25.4s, v24.4s\n" - "and v29.16b, v21.16b, v18.16b\n" + "sqrdmulh v13.4s, v13.4s, v21.4s\n" + "sqrdmulh v19.4s, v19.4s, v21.4s\n" + "add x10, x10, x14\n" + "add x9, x9, x14\n" + "sqrdmulh v18.4s, v18.4s, v21.4s\n" + "sqrdmulh v9.4s, v9.4s, v21.4s\n" + "add x28, x28, x14\n" + "add x27, x27, x14\n" + "and v7.16b, v13.16b, v25.16b\n" + "sqrdmulh v26.4s, v26.4s, v10.4s\n" + "and v4.16b, v19.16b, v25.16b\n" + "sqrdmulh v11.4s, v11.4s, v10.4s\n" + "and v21.16b, v18.16b, v25.16b\n" + "sqrdmulh v24.4s, v24.4s, v10.4s\n" + "and v20.16b, v9.16b, v25.16b\n" + "sqrdmulh v23.4s, v23.4s, v10.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v29.16b, v26.16b, v16.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v10.16b, v11.16b, v16.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v31.16b, v24.16b, v16.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v30.16b, v23.16b, v16.16b\n" + "sqadd v13.4s, v13.4s, v7.4s\n" "sshr v29.4s, v29.4s, #0x1f\n" - "uzp1 v17.16b, v17.16b, v25.16b\n" - "srshl v16.4s, v16.4s, v10.4s\n" - "and v3.16b, v23.16b, v10.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "uzp1 v17.16b, v17.16b, v17.16b\n" - "add v16.4s, v16.4s, v15.4s\n" - "sqadd v21.4s, v21.4s, v29.4s\n" - "and v25.16b, v20.16b, v18.16b\n" - "sshr v25.4s, v25.4s, #0x1f\n" - "smin v16.4s, v16.4s, v12.4s\n" - "srshl v21.4s, v21.4s, v18.4s\n" - "sqadd v23.4s, v23.4s, v3.4s\n" - "smax v16.4s, v16.4s, v24.4s\n" - "add v21.4s, v21.4s, v15.4s\n" - "srshl v23.4s, v23.4s, v10.4s\n" - "sqadd v20.4s, v20.4s, v25.4s\n" - "smin v21.4s, v21.4s, v12.4s\n" - "add v23.4s, v23.4s, v15.4s\n" - "srshl v20.4s, v20.4s, v18.4s\n" - "smax v21.4s, v21.4s, v24.4s\n" - "smin v23.4s, v23.4s, v12.4s\n" - "uzp1 v16.16b, v16.16b, v21.16b\n" - "add v20.4s, v20.4s, v15.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "smax v23.4s, v23.4s, v24.4s\n" - "smin v20.4s, v20.4s, v12.4s\n" - "smax v20.4s, v20.4s, v24.4s\n" - "uzp1 v23.16b, v23.16b, v20.16b\n" - "uzp1 v23.16b, v23.16b, v23.16b\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v20.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v25.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "srshl v19.4s, v19.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v10.4s\n" + "srshl v18.4s, v18.4s, v25.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "srshl v9.4s, v9.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "srshl v26.4s, v26.4s, v16.4s\n" + "sqxtn v13.4h, v13.4s\n" + "srshl v11.4s, v11.4s, v16.4s\n" + "sqxtn v19.4h, v19.4s\n" + "srshl v24.4s, v24.4s, v16.4s\n" + "sqxtn v18.4h, v18.4s\n" + "srshl v23.4s, v23.4s, v16.4s\n" + "sqxtn v9.4h, v9.4s\n" + "sqxtn2 v13.8h, v26.4s\n" + "sqxtn2 v19.8h, v11.4s\n" + "sqxtn2 v18.8h, v24.4s\n" + "sqxtn2 v9.8h, v23.4s\n" + "sqadd v13.8h, v13.8h, v14.8h\n" + "sqadd v19.8h, v19.8h, v14.8h\n" + "sqadd v18.8h, v18.8h, v14.8h\n" + "sqadd v9.8h, v9.8h, v14.8h\n" + "smax v13.8h, v13.8h, v17.8h\n" + "smax v19.8h, v19.8h, v17.8h\n" + "smax v18.8h, v18.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smin v13.8h, v13.8h, v15.8h\n" + "smin v19.8h, v19.8h, v15.8h\n" + "smin v18.8h, v18.8h, v15.8h\n" + "smin v9.8h, v9.8h, v15.8h\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" "tbz x8, #2, 61f\n" "st1 { v13.s }[0], [x10], #0x4\n" - "st1 { v17.s }[0], [x9], #0x4\n" - "st1 { v16.s }[0], [x28], #0x4\n" - "st1 { 
v23.s }[0], [x27], #0x4\n" + "st1 { v19.s }[0], [x9], #0x4\n" + "st1 { v18.s }[0], [x28], #0x4\n" + "st1 { v9.s }[0], [x27], #0x4\n" "tbz x8, #1, 60f\n" "st1 { v13.h }[2], [x10], #0x2\n" - "st1 { v17.h }[2], [x9], #0x2\n" - "st1 { v16.h }[2], [x28], #0x2\n" - "st1 { v23.h }[2], [x27], #0x2\n" + "st1 { v19.h }[2], [x9], #0x2\n" + "st1 { v18.h }[2], [x28], #0x2\n" + "st1 { v9.h }[2], [x27], #0x2\n" "tbz x8, #0, 63f\n" "st1 { v13.b }[6], [x10], #0x1\n" - "st1 { v17.b }[6], [x9], #0x1\n" - "st1 { v16.b }[6], [x28], #0x1\n" - "st1 { v23.b }[6], [x27], #0x1\n" + "st1 { v19.b }[6], [x9], #0x1\n" + "st1 { v18.b }[6], [x28], #0x1\n" + "st1 { v9.b }[6], [x27], #0x1\n" "b 63f\n" "60:" // Oddments: Bit 2: Bit 1: Unset "tbz x8, #0, 63f\n" "st1 { v13.b }[4], [x10], #0x1\n" - "st1 { v17.b }[4], [x9], #0x1\n" - "st1 { v16.b }[4], [x28], #0x1\n" - "st1 { v23.b }[4], [x27], #0x1\n" + "st1 { v19.b }[4], [x9], #0x1\n" + "st1 { v18.b }[4], [x28], #0x1\n" + "st1 { v9.b }[4], [x27], #0x1\n" "b 63f\n" "61:" // Oddments: Bit 2: Unset "tbz x8, #1, 62f\n" "st1 { v13.h }[0], [x10], #0x2\n" - "st1 { v17.h }[0], [x9], #0x2\n" - "st1 { v16.h }[0], [x28], #0x2\n" - "st1 { v23.h }[0], [x27], #0x2\n" + "st1 { v19.h }[0], [x9], #0x2\n" + "st1 { v18.h }[0], [x28], #0x2\n" + "st1 { v9.h }[0], [x27], #0x2\n" "tbz x8, #0, 63f\n" "st1 { v13.b }[2], [x10], #0x1\n" - "st1 { v17.b }[2], [x9], #0x1\n" - "st1 { v16.b }[2], [x28], #0x1\n" - "st1 { v23.b }[2], [x27], #0x1\n" + "st1 { v19.b }[2], [x9], #0x1\n" + "st1 { v18.b }[2], [x28], #0x1\n" + "st1 { v9.b }[2], [x27], #0x1\n" "b 63f\n" "62:" // Oddments: Bit 2: Unset: Bit 1: Unset "tbz x8, #0, 63f\n" "st1 { v13.b }[0], [x10], #0x1\n" - "st1 { v17.b }[0], [x9], #0x1\n" - "st1 { v16.b }[0], [x28], #0x1\n" - "st1 { v23.b }[0], [x27], #0x1\n" + "st1 { v19.b }[0], [x9], #0x1\n" + "st1 { v18.b }[0], [x28], #0x1\n" + "st1 { v9.b }[0], [x27], #0x1\n" "63:" // Oddments: Bit 2: End - "64:" // End - : : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp index 44817dbccf..00d1c5e868 100644 --- 
a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,37 +36,24 @@ namespace depthwise { void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); -struct a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst +class a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t> { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 2; constexpr static unsigned int stride_cols = 2; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 5; - - constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_3x3_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_3x3_mla::get_packed_size; + a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {} - kern_type kernel = a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } - a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp index 8d22836e5b..872f665b40 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -46,7 +46,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( struct Params { long unsigned int n_channels; - const uint8_t *weights; + const void *weights; const int32_t *bias; const arm_gemm::Requantize32 *requant; const int32_t *const requant_muls; @@ -57,7 +57,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( Params( long unsigned int n_channels, const uint8_t *const *inptrs_raw, - const uint8_t *const weights, + const void *const weights, const int32_t *const bias, const arm_gemm::Requantize32 &qp, const int32_t *const requant_muls, @@ -100,75 +100,75 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( requant_muls, requant_shifts, outptrs); __asm__ __volatile__( - "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n" - "mov x5, #0x0\n" - "ldr x6, [%x[params], %[offsetof_Params_weights]]\n" - "mov x7, #0x0\n" - "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" - "add x8, %x[params], %[offsetof_Params_inptrs]\n" - "ldr x17, [%x[params], %[offsetof_Params_requant_muls]]\n" - "lsr x16, x4, #0x3\n" - "ldr x15, [%x[params], %[offsetof_Params_requant_shifts]]\n" - "add x19, x22, %[offsetof_Requantize32_a_offset]\n" - "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" - "add x20, x22, %[offsetof_Requantize32_b_offset]\n" - "ld1r { v12.16b }, [x19]\n" - "add x19, x22, %[offsetof_Requantize32_c_offset]\n" - "ld1r { v13.16b }, [x20]\n" - "add x20, x22, %[offsetof_Requantize32_minval]\n" - "ld1r { v11.4s }, [x19]\n" - "add x19, x22, %[offsetof_Requantize32_maxval]\n" - "ld1r { v19.4s }, [x20]\n" - "ld1r { v14.4s }, [x19]\n" - "ldp x14, x13, [x21, #0x0]\n" - "ldp x12, x11, [x21, #0x10]\n" + "ldr x19, [%x[params], %[offsetof_Params_requant]]\n" + "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n" + "add x24, x19, %[offsetof_Requantize32_a_offset]\n" + "add x23, x19, %[offsetof_Requantize32_b_offset]\n" + "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x21, x19, %[offsetof_Requantize32_c_offset]\n" + "add x20, x19, %[offsetof_Requantize32_minval]\n" + "ldr x17, [%x[params], %[offsetof_Params_weights]]\n" + "add x19, x19, %[offsetof_Requantize32_maxval]\n" + "ld1r { v12.16b }, [x24]\n" + "ld1r { v13.16b }, [x23]\n" + "lsr x16, x8, #0x3\n" + "ld1r { v11.8h }, [x21]\n" + "ld1r { v17.8h }, [x20]\n" + "mov x15, #0x0\n" + "mov x14, #0x0\n" + "ld1r { v14.8h }, [x19]\n" + "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n" + "add x12, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "ldp x10, x9, [x22, #0x0]\n" + "ldp x28, x27, [x22, #0x10]\n" "cbz x16, 3f\n" - "subs x16, x16, #0x1\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "ldr q15, [x19, #0x0]\n" - "mov v20.16b, v15.16b\n" + "subs x16, x16, #0x1\n" + "mov v9.16b, v15.16b\n" "ldr q10, [x19, #0x10]\n" "add x19, x19, #0x20\n" - "mov v16.16b, v15.16b\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" - "mov v17.16b, v15.16b\n" - "ldr d0, [x6, #0x0]\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v16.16b, v10.16b\n" + "mov v22.16b, v15.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v21.16b, v10.16b\n" + "mov v23.16b, v15.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v18.16b, v10.16b\n" "usubl v0.8h, v0.8b, v13.8b\n" - "mov v23.16b, v10.16b\n" - "ldr d1, [x6, #0x8]\n" - "mov v22.16b, v10.16b\n" - "ldr d2, [x6, #0x10]\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" "usubl v1.8h, v1.8b, v13.8b\n" - "mov 
v18.16b, v10.16b\n" - "ldr d3, [x6, #0x18]\n" - "ldr d4, [x6, #0x20]\n" "usubl v2.8h, v2.8b, v13.8b\n" - "ldr d5, [x6, #0x28]\n" + "ldp x26, x25, [x12, #0x0]\n" + "ldp x24, x23, [x12, #0x10]\n" "usubl v3.8h, v3.8b, v13.8b\n" - "ldr d6, [x6, #0x30]\n" - "ldr d7, [x6, #0x38]\n" "usubl v4.8h, v4.8b, v13.8b\n" - "ldr d8, [x6, #0x40]\n" + "ldp x22, x21, [x12, #0x20]\n" + "ldp x20, x19, [x12, #0x30]\n" "usubl v5.8h, v5.8b, v13.8b\n" - "ldp x26, x25, [x8, #0x0]\n" "usubl v6.8h, v6.8b, v13.8b\n" - "ldp x24, x23, [x8, #0x10]\n" + "ldr d31, [x26, x15]\n" + "ldr d30, [x25, x15]\n" "usubl v7.8h, v7.8b, v13.8b\n" "usubl v8.8h, v8.8b, v13.8b\n" - "ldp x22, x21, [x8, #0x20]\n" - "ldp x20, x19, [x8, #0x30]\n" - "ldr d31, [x26, x5]\n" + "ldr d29, [x24, x15]\n" + "ldr d28, [x23, x15]\n" "usubl v31.8h, v31.8b, v12.8b\n" - "ldr d30, [x25, x5]\n" - "ldr d29, [x24, x5]\n" "usubl v30.8h, v30.8b, v12.8b\n" - "ldr d28, [x23, x5]\n" - "ldr d27, [x22, x5]\n" + "ldr d27, [x22, x15]\n" + "ldr d26, [x21, x15]\n" "usubl v29.8h, v29.8b, v12.8b\n" - "ldr d26, [x21, x5]\n" "usubl v28.8h, v28.8b, v12.8b\n" - "ldr d25, [x20, x5]\n" - "ldr d24, [x19, x5]\n" + "ldr d25, [x20, x15]\n" + "ldr d24, [x19, x15]\n" "usubl v27.8h, v27.8b, v12.8b\n" "usubl v26.8h, v26.8b, v12.8b\n" "usubl v25.8h, v25.8b, v12.8b\n" @@ -176,259 +176,251 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "beq 2f\n" "1:" // Loop "smlal v15.4s, v31.4h, v8.4h\n" - "ldr x23, [x8, #0x40]\n" - "add x6, x6, #0x48\n" "smlal2 v10.4s, v31.8h, v8.8h\n" - "ldr x22, [x8, #0x48]\n" - "subs x16, x16, #0x1\n" - "smlal v20.4s, v31.4h, v6.4h\n" - "ldr x21, [x8, #0x50]\n" - "smlal2 v23.4s, v31.8h, v6.8h\n" - "ldr x20, [x8, #0x58]\n" - "smlal v16.4s, v31.4h, v2.4h\n" - "ldr x19, [x8, #0x60]\n" - "smlal2 v22.4s, v31.8h, v2.8h\n" - "ldr x10, [x8, #0x68]\n" - "smlal v17.4s, v31.4h, v0.4h\n" - "ldr x9, [x8, #0x70]\n" - "smlal2 v18.4s, v31.8h, v0.8h\n" - "ldr x28, [x8, #0x78]\n" + "ldr x24, [x12, #0x40]\n" + "ldr x23, [x12, #0x48]\n" + "smlal v9.4s, v31.4h, v6.4h\n" + "smlal2 v16.4s, v31.8h, v6.8h\n" + "ldr x21, [x12, #0x50]\n" + "ldr x19, [x12, #0x58]\n" "smlal v15.4s, v30.4h, v0.4h\n" - "ldr x27, [x8, #0x80]\n" "smlal2 v10.4s, v30.8h, v0.8h\n" - "ldr x26, [x8, #0x88]\n" - "smlal v20.4s, v28.4h, v1.4h\n" - "ldr x25, [x8, #0x90]\n" - "smlal2 v23.4s, v28.8h, v1.8h\n" - "ldr d28, [x22, x5]\n" + "ldr x22, [x12, #0x78]\n" + "ldr x20, [x12, #0x60]\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "ldr d28, [x23, x15]\n" "usubl v28.8h, v28.8b, v12.8b\n" "smlal v15.4s, v29.4h, v1.4h\n" - "ldr x24, [x8, #0x98]\n" "smlal2 v10.4s, v29.8h, v1.8h\n" - "ldr d29, [x23, x5]\n" + "ldr d29, [x24, x15]\n" "usubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v27.4h, v2.4h\n" - "ldr x23, [x8, #0xa0]\n" - "smlal2 v23.4s, v27.8h, v2.8h\n" - "ldr d27, [x21, x5]\n" + "smlal v9.4s, v27.4h, v2.4h\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x15]\n" "usubl v27.8h, v27.8b, v12.8b\n" "smlal v15.4s, v26.4h, v3.4h\n" - "ldr x22, [x8, #0xa8]\n" "smlal2 v10.4s, v26.8h, v3.8h\n" - "ldr d26, [x20, x5]\n" + "ldr d26, [x19, x15]\n" "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v9.4s, v24.4h, v0.4h\n" + "smlal2 v16.4s, v24.8h, v0.8h\n" + "ldr x21, [x12, #0x80]\n" + "ldr x19, [x12, #0x68]\n" "smlal v15.4s, v25.4h, v4.4h\n" - "ldr x21, [x8, #0xb0]\n" "smlal2 v10.4s, v25.8h, v4.8h\n" - "ldr d25, [x19, x5]\n" + "ldr d25, [x20, x15]\n" "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v9.4s, v29.4h, v4.4h\n" + "smlal2 v16.4s, v29.8h, v4.8h\n" + "ldr x20, [x12, #0x88]\n" + "ldr d29, [x19, 
x15]\n" "smlal v15.4s, v24.4h, v2.4h\n" - "ldr x20, [x8, #0xb8]\n" "smlal2 v10.4s, v24.8h, v2.8h\n" - "ldr x19, [x8, #0xc0]\n" - "smlal v20.4s, v24.4h, v0.4h\n" - "ldr q21, [x17, #0x0]\n" - "smlal2 v23.4s, v24.8h, v0.8h\n" - "ldr d24, [x9, x5]\n" - "usubl v24.8h, v24.8b, v12.8b\n" - "smlal v20.4s, v29.4h, v4.4h\n" - "ldr q30, [x15, #0x0]\n" - "smlal2 v23.4s, v29.8h, v4.8h\n" - "ldr d29, [x10, x5]\n" + "ldr x19, [x12, #0x70]\n" "usubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v28.4h, v5.4h\n" - "ldr q31, [x17, #0x10]\n" - "smlal2 v23.4s, v28.8h, v5.8h\n" - "ldr d28, [x27, x5]\n" - "add x17, x17, #0x20\n" + "smlal v9.4s, v28.4h, v5.4h\n" + "smlal2 v16.4s, v28.8h, v5.8h\n" + "ldr d28, [x21, x15]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal v22.4s, v31.4h, v2.4h\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "ldr x24, [x12, #0x98]\n" + "ldr d24, [x19, x15]\n" "smlal v15.4s, v27.4h, v5.4h\n" - "ldr q9, [x15, #0x10]\n" - "add x15, x15, #0x20\n" "smlal2 v10.4s, v27.8h, v5.8h\n" - "usubl v28.8h, v28.8b, v12.8b\n" - "smlal v20.4s, v27.4h, v3.4h\n" - "smlal2 v23.4s, v27.8h, v3.8h\n" - "ldr d27, [x28, x5]\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "ldr x23, [x12, #0x90]\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "ldr d27, [x22, x15]\n" "usubl v27.8h, v27.8b, v12.8b\n" - "smlal v16.4s, v26.4h, v3.4h\n" - "smlal2 v22.4s, v26.8h, v3.8h\n" - "ldr d26, [x26, x5]\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal v22.4s, v26.4h, v3.4h\n" + "ldr x22, [x12, #0xa8]\n" + "ldr x19, [x12, #0xa0]\n" + "smlal2 v21.4s, v26.8h, v3.8h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr d26, [x20, x15]\n" "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v23.4s, v27.4h, v4.4h\n" + "smlal v22.4s, v25.4h, v0.4h\n" + "ldr x21, [x12, #0xb0]\n" + "ldr x20, [x12, #0xb8]\n" + "smlal2 v21.4s, v25.8h, v0.8h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "ldr d27, [x19, x15]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v23.4s, v28.4h, v1.4h\n" "smlal v15.4s, v25.4h, v6.4h\n" + "ldr x19, [x12, #0xc0]\n" + "ldr q19, [x13, #0x0]\n" "smlal2 v10.4s, v25.8h, v6.8h\n" - "smlal v16.4s, v25.4h, v0.4h\n" - "smlal2 v22.4s, v25.8h, v0.8h\n" - "ldr d25, [x25, x5]\n" + "smlal v22.4s, v29.4h, v4.4h\n" + "ldr d25, [x23, x15]\n" "usubl v25.8h, v25.8b, v12.8b\n" - "smlal v16.4s, v29.4h, v4.4h\n" - "smlal2 v22.4s, v29.8h, v4.8h\n" - "ldr d29, [x24, x5]\n" + "smlal2 v21.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x15]\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v23.4s, v26.4h, v5.4h\n" "smlal v15.4s, v24.4h, v7.4h\n" + "ldr q0, [x11, #0x0]\n" + "ldr q4, [x13, #0x10]\n" "smlal2 v10.4s, v24.8h, v7.8h\n" - "smlal v16.4s, v24.4h, v1.4h\n" - "smlal2 v22.4s, v24.8h, v1.8h\n" - "ldr d24, [x22, x5]\n" - "usubl v24.8h, v24.8b, v12.8b\n" - "smlal v17.4s, v27.4h, v4.4h\n" - "smlal2 v18.4s, v27.8h, v4.8h\n" - "ldr d27, [x23, x5]\n" - "usubl v27.8h, v27.8b, v12.8b\n" - "smlal v20.4s, v28.4h, v7.4h\n" - "smlal2 v23.4s, v28.8h, v7.8h\n" - "smlal v17.4s, v28.4h, v1.4h\n" - "smlal2 v18.4s, v28.8h, v1.8h\n" - "smlal v16.4s, v25.4h, v6.4h\n" - "smlal2 v22.4s, v25.8h, v6.8h\n" - "ldr d25, [x20, x5]\n" - "usubl v25.8h, v25.8b, v12.8b\n" - "smlal v17.4s, v26.4h, v5.4h\n" + "smlal v22.4s, v24.4h, v1.4h\n" + "sqrdmulh v15.4s, v15.4s, v19.4s\n" + "ldr q31, [x11, #0x10]\n" + "smlal2 v21.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x15]\n" "smlal2 v18.4s, v26.8h, v5.8h\n" - "ldr d26, [x21, x5]\n" - "usubl v26.8h, v26.8b, v12.8b\n" - "smlal v20.4s, v29.4h, v8.4h\n" - "smlal2 v23.4s, v29.8h, v8.8h\n" - "smlal v17.4s, v29.4h, v2.4h\n" + "usubl v24.8h, 
v24.8b, v12.8b\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "ldr d26, [x21, x15]\n" "smlal2 v18.4s, v29.8h, v2.8h\n" - "ldr d29, [x19, x5]\n" - "add x5, x5, #0x8\n" - "smlal v16.4s, v27.4h, v7.4h\n" - "usubl v29.8h, v29.8b, v12.8b\n" - "smlal2 v22.4s, v27.8h, v7.8h\n" - "smlal v17.4s, v24.4h, v3.4h\n" - "smlal v16.4s, v24.4h, v5.4h\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v22.4s, v25.4h, v6.4h\n" + "smlal v23.4s, v24.4h, v3.4h\n" + "and v30.16b, v15.16b, v0.16b\n" + "add x17, x17, #0x48\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "smlal2 v16.4s, v28.8h, v7.8h\n" + "sqrdmulh v10.4s, v10.4s, v4.4s\n" + "subs x16, x16, #0x1\n" + "smlal2 v21.4s, v25.8h, v6.8h\n" + "ldr d25, [x20, x15]\n" "smlal2 v18.4s, v24.8h, v3.8h\n" - "sqrdmulh v15.4s, v15.4s, v21.4s\n" - "smlal2 v22.4s, v24.8h, v5.8h\n" - "smlal v17.4s, v26.4h, v7.4h\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v22.4s, v27.4h, v7.4h\n" + "smlal v23.4s, v26.4h, v7.4h\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "add x13, x13, #0x20\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v16.4s, v29.8h, v8.8h\n" + "ldr d29, [x19, x15]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal2 v21.4s, v27.8h, v7.8h\n" "smlal2 v18.4s, v26.8h, v7.8h\n" - "smlal v16.4s, v25.4h, v8.4h\n" - "smlal2 v22.4s, v25.8h, v8.8h\n" - "smlal v17.4s, v25.4h, v6.4h\n" + "sqrdmulh v9.4s, v9.4s, v19.4s\n" + "add x15, x15, #0x8\n" + "smlal v22.4s, v24.4h, v5.4h\n" + "smlal v23.4s, v25.4h, v6.4h\n" + "and v28.16b, v9.16b, v0.16b\n" + "add x11, x11, #0x20\n" + "smlal2 v21.4s, v24.8h, v5.8h\n" "smlal2 v18.4s, v25.8h, v6.8h\n" - "and v26.16b, v15.16b, v30.16b\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "smlal v17.4s, v29.4h, v8.4h\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "smlal v22.4s, v25.4h, v8.4h\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "sqrdmulh v22.4s, v22.4s, v19.4s\n" + "smlal2 v21.4s, v25.8h, v8.8h\n" "smlal2 v18.4s, v29.8h, v8.8h\n" - "sqrdmulh v10.4s, v10.4s, v31.4s\n" - "sqrdmulh v20.4s, v20.4s, v21.4s\n" - "sqrdmulh v23.4s, v23.4s, v31.4s\n" - "sqrdmulh v16.4s, v16.4s, v21.4s\n" - "sqadd v15.4s, v15.4s, v26.4s\n" - "and v8.16b, v10.16b, v9.16b\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "srshl v15.4s, v15.4s, v30.4s\n" - "and v4.16b, v20.16b, v30.16b\n" + "sqrdmulh v23.4s, v23.4s, v19.4s\n" + "and v29.16b, v22.16b, v0.16b\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "and v20.16b, v23.16b, v0.16b\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "and v19.16b, v10.16b, v31.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "and v4.16b, v16.16b, v31.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v5.16b, v21.16b, v31.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v26.16b, v18.16b, v31.16b\n" + "sqadd v15.4s, v15.4s, v30.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v28.4s\n" "sshr v4.4s, v4.4s, #0x1f\n" - "and v2.16b, v23.16b, v9.16b\n" - "and v1.16b, v16.16b, v30.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "add v15.4s, v15.4s, v11.4s\n" - "sqadd v10.4s, v10.4s, v8.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqrdmulh v22.4s, v22.4s, v31.4s\n" - "sqadd v20.4s, v20.4s, v4.4s\n" - "smin v15.4s, v15.4s, v14.4s\n" - "srshl v10.4s, v10.4s, v9.4s\n" - "sqadd v23.4s, v23.4s, v2.4s\n" - "smax v15.4s, v15.4s, v19.4s\n" - "srshl v20.4s, v20.4s, v30.4s\n" - "add v10.4s, v10.4s, v11.4s\n" - "srshl v23.4s, v23.4s, v9.4s\n" - "sqadd v16.4s, v16.4s, v1.4s\n" - "smin v10.4s, v10.4s, v14.4s\n" - "add v20.4s, v20.4s, v11.4s\n" - "add v23.4s, v23.4s, v11.4s\n" - "smax v10.4s, v10.4s, v19.4s\n" - "smin v20.4s, v20.4s, v14.4s\n" - "smin v23.4s, v23.4s, v14.4s\n" - "uzp1 v15.16b, v15.16b, v10.16b\n" - "smax v20.4s, v20.4s, v19.4s\n" + "sqadd 
v22.4s, v22.4s, v29.4s\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v20.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v19.4s\n" + "srshl v9.4s, v9.4s, v0.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "sqadd v18.4s, v18.4s, v26.4s\n" + "srshl v10.4s, v10.4s, v31.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v16.4s, v16.4s, v31.4s\n" + "sqxtn v9.4h, v9.4s\n" + "srshl v21.4s, v21.4s, v31.4s\n" + "sqxtn v22.4h, v22.4s\n" + "srshl v18.4s, v18.4s, v31.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v15.8h, v10.4s\n" + "sqxtn2 v9.8h, v16.4s\n" + "sqxtn2 v22.8h, v21.4s\n" + "sqxtn2 v23.8h, v18.4s\n" + "sqadd v15.8h, v15.8h, v11.8h\n" + "sqadd v9.8h, v9.8h, v11.8h\n" + "sqadd v22.8h, v22.8h, v11.8h\n" + "sqadd v23.8h, v23.8h, v11.8h\n" + "smax v15.8h, v15.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smax v22.8h, v22.8h, v17.8h\n" + "smax v23.8h, v23.8h, v17.8h\n" + "smin v15.8h, v15.8h, v14.8h\n" + "smin v9.8h, v9.8h, v14.8h\n" + "smin v22.8h, v22.8h, v14.8h\n" + "smin v23.8h, v23.8h, v14.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" - "str d15, [x14, x7]\n" - "smax v23.4s, v23.4s, v19.4s\n" - "srshl v16.4s, v16.4s, v30.4s\n" - "and v24.16b, v22.16b, v9.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "uzp1 v20.16b, v20.16b, v23.16b\n" - "add v16.4s, v16.4s, v11.4s\n" - "sqrdmulh v17.4s, v17.4s, v21.4s\n" - "uzp1 v20.16b, v20.16b, v20.16b\n" - "str d20, [x13, x7]\n" - "smin v16.4s, v16.4s, v14.4s\n" - "sqrdmulh v18.4s, v18.4s, v31.4s\n" - "sqadd v22.4s, v22.4s, v24.4s\n" - "and v2.16b, v17.16b, v30.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "smax v16.4s, v16.4s, v19.4s\n" - "srshl v22.4s, v22.4s, v9.4s\n" - "and v31.16b, v18.16b, v9.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "add v22.4s, v22.4s, v11.4s\n" - "sqadd v17.4s, v17.4s, v2.4s\n" - "smin v22.4s, v22.4s, v14.4s\n" - "srshl v17.4s, v17.4s, v30.4s\n" - "sqadd v18.4s, v18.4s, v31.4s\n" - "smax v22.4s, v22.4s, v19.4s\n" - "uzp1 v16.16b, v16.16b, v22.16b\n" - "add v17.4s, v17.4s, v11.4s\n" - "srshl v18.4s, v18.4s, v9.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "str d16, [x12, x7]\n" - "smin v17.4s, v17.4s, v14.4s\n" - "add v18.4s, v18.4s, v11.4s\n" - "smax v17.4s, v17.4s, v19.4s\n" - "smin v18.4s, v18.4s, v14.4s\n" - "smax v18.4s, v18.4s, v19.4s\n" - "uzp1 v17.16b, v17.16b, v18.16b\n" - "uzp1 v17.16b, v17.16b, v17.16b\n" - "str d17, [x11, x7]\n" - "add x7, x7, #0x8\n" + "str d15, [x10, x14]\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str d9, [x9, x14]\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d22, [x28, x14]\n" + "str d23, [x27, x14]\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "ldr q15, [x19, #0x0]\n" - "mov v20.16b, v15.16b\n" + "add x14, x14, #0x8\n" "ldr q10, [x19, #0x10]\n" "add x19, x19, #0x20\n" - "mov v16.16b, v15.16b\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" - "mov v17.16b, v15.16b\n" - "ldr d0, [x6, #0x0]\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v9.16b, v15.16b\n" + "mov v16.16b, v10.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v22.16b, v15.16b\n" + "mov v21.16b, v10.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v23.16b, v15.16b\n" + "mov v18.16b, v10.16b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" "usubl v0.8h, v0.8b, v13.8b\n" - "mov v23.16b, v10.16b\n" - "ldr d1, [x6, #0x8]\n" - "mov v22.16b, v10.16b\n" - "ldr d2, [x6, #0x10]\n" 
"usubl v1.8h, v1.8b, v13.8b\n" - "mov v18.16b, v10.16b\n" - "ldr d3, [x6, #0x18]\n" - "ldr d4, [x6, #0x20]\n" + "ldp x26, x25, [x12, #0x0]\n" + "ldp x24, x23, [x12, #0x10]\n" "usubl v2.8h, v2.8b, v13.8b\n" - "ldr d5, [x6, #0x28]\n" "usubl v3.8h, v3.8b, v13.8b\n" - "ldr d6, [x6, #0x30]\n" - "ldr d7, [x6, #0x38]\n" + "ldp x22, x21, [x12, #0x20]\n" + "ldp x20, x19, [x12, #0x30]\n" "usubl v4.8h, v4.8b, v13.8b\n" - "ldr d8, [x6, #0x40]\n" "usubl v5.8h, v5.8b, v13.8b\n" - "ldp x26, x25, [x8, #0x0]\n" + "ldr d31, [x26, x15]\n" + "ldr d30, [x25, x15]\n" "usubl v6.8h, v6.8b, v13.8b\n" - "ldp x24, x23, [x8, #0x10]\n" "usubl v7.8h, v7.8b, v13.8b\n" + "ldr d29, [x24, x15]\n" + "ldr d28, [x23, x15]\n" "usubl v8.8h, v8.8b, v13.8b\n" - "ldp x22, x21, [x8, #0x20]\n" - "ldp x20, x19, [x8, #0x30]\n" - "ldr d31, [x26, x5]\n" "usubl v31.8h, v31.8b, v12.8b\n" - "ldr d30, [x25, x5]\n" - "ldr d29, [x24, x5]\n" + "ldr d27, [x22, x15]\n" + "ldr d26, [x21, x15]\n" "usubl v30.8h, v30.8b, v12.8b\n" - "ldr d28, [x23, x5]\n" - "ldr d27, [x22, x5]\n" "usubl v29.8h, v29.8b, v12.8b\n" - "ldr d26, [x21, x5]\n" + "ldr d25, [x20, x15]\n" + "ldr d24, [x19, x15]\n" "usubl v28.8h, v28.8b, v12.8b\n" - "ldr d25, [x20, x5]\n" - "ldr d24, [x19, x5]\n" "usubl v27.8h, v27.8b, v12.8b\n" "usubl v26.8h, v26.8b, v12.8b\n" "usubl v25.8h, v25.8b, v12.8b\n" @@ -436,275 +428,267 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "bgt 1b\n" "2:" // Tail "smlal v15.4s, v31.4h, v8.4h\n" - "ldr x23, [x8, #0x40]\n" - "tst x4, #0x7\n" "smlal2 v10.4s, v31.8h, v8.8h\n" - "ldr x22, [x8, #0x48]\n" - "smlal v20.4s, v31.4h, v6.4h\n" - "ldr x21, [x8, #0x50]\n" - "smlal2 v23.4s, v31.8h, v6.8h\n" - "ldr x20, [x8, #0x58]\n" - "smlal v16.4s, v31.4h, v2.4h\n" - "ldr x19, [x8, #0x60]\n" - "smlal2 v22.4s, v31.8h, v2.8h\n" - "ldr x10, [x8, #0x68]\n" - "smlal v17.4s, v31.4h, v0.4h\n" - "ldr x9, [x8, #0x70]\n" - "smlal2 v18.4s, v31.8h, v0.8h\n" - "ldr x28, [x8, #0x78]\n" + "ldr x24, [x12, #0x40]\n" + "ldr x23, [x12, #0x48]\n" + "smlal v9.4s, v31.4h, v6.4h\n" + "smlal2 v16.4s, v31.8h, v6.8h\n" + "ldr x21, [x12, #0x50]\n" + "ldr x19, [x12, #0x58]\n" "smlal v15.4s, v30.4h, v0.4h\n" - "ldr x27, [x8, #0x80]\n" "smlal2 v10.4s, v30.8h, v0.8h\n" - "ldr x26, [x8, #0x88]\n" - "smlal v20.4s, v28.4h, v1.4h\n" - "ldr x25, [x8, #0x90]\n" - "smlal2 v23.4s, v28.8h, v1.8h\n" - "ldr d28, [x22, x5]\n" + "ldr x22, [x12, #0x78]\n" + "ldr x20, [x12, #0x60]\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "ldr d28, [x23, x15]\n" "usubl v28.8h, v28.8b, v12.8b\n" "smlal v15.4s, v29.4h, v1.4h\n" - "ldr x24, [x8, #0x98]\n" "smlal2 v10.4s, v29.8h, v1.8h\n" - "ldr d29, [x23, x5]\n" + "ldr d29, [x24, x15]\n" "usubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v27.4h, v2.4h\n" - "ldr x23, [x8, #0xa0]\n" - "smlal2 v23.4s, v27.8h, v2.8h\n" - "ldr d27, [x21, x5]\n" + "smlal v9.4s, v27.4h, v2.4h\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x15]\n" "usubl v27.8h, v27.8b, v12.8b\n" "smlal v15.4s, v26.4h, v3.4h\n" - "ldr x22, [x8, #0xa8]\n" "smlal2 v10.4s, v26.8h, v3.8h\n" - "ldr d26, [x20, x5]\n" + "ldr d26, [x19, x15]\n" "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v9.4s, v24.4h, v0.4h\n" + "smlal2 v16.4s, v24.8h, v0.8h\n" + "ldr x21, [x12, #0x80]\n" + "ldr x19, [x12, #0x68]\n" "smlal v15.4s, v25.4h, v4.4h\n" - "ldr x21, [x8, #0xb0]\n" "smlal2 v10.4s, v25.8h, v4.8h\n" - "ldr d25, [x19, x5]\n" + "ldr d25, [x20, x15]\n" "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v9.4s, v29.4h, v4.4h\n" + "smlal2 v16.4s, v29.8h, v4.8h\n" + "ldr x20, [x12, #0x88]\n" + "ldr d29, [x19, 
x15]\n" "smlal v15.4s, v24.4h, v2.4h\n" - "ldr x20, [x8, #0xb8]\n" "smlal2 v10.4s, v24.8h, v2.8h\n" - "ldr x19, [x8, #0xc0]\n" - "smlal v20.4s, v24.4h, v0.4h\n" - "ldr q21, [x17, #0x0]\n" - "smlal2 v23.4s, v24.8h, v0.8h\n" - "ldr d24, [x9, x5]\n" - "usubl v24.8h, v24.8b, v12.8b\n" - "smlal v20.4s, v29.4h, v4.4h\n" - "ldr q30, [x15, #0x0]\n" - "smlal2 v23.4s, v29.8h, v4.8h\n" - "ldr d29, [x10, x5]\n" + "ldr x19, [x12, #0x70]\n" "usubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v28.4h, v5.4h\n" - "ldr q31, [x17, #0x10]\n" - "smlal2 v23.4s, v28.8h, v5.8h\n" - "ldr d28, [x27, x5]\n" - "add x17, x17, #0x20\n" + "smlal v9.4s, v28.4h, v5.4h\n" + "smlal2 v16.4s, v28.8h, v5.8h\n" + "ldr d28, [x21, x15]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal v22.4s, v31.4h, v2.4h\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "ldr x24, [x12, #0x98]\n" + "ldr d24, [x19, x15]\n" "smlal v15.4s, v27.4h, v5.4h\n" - "ldr q9, [x15, #0x10]\n" - "add x15, x15, #0x20\n" "smlal2 v10.4s, v27.8h, v5.8h\n" - "usubl v28.8h, v28.8b, v12.8b\n" - "smlal v20.4s, v27.4h, v3.4h\n" - "smlal2 v23.4s, v27.8h, v3.8h\n" - "ldr d27, [x28, x5]\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "ldr x23, [x12, #0x90]\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "ldr d27, [x22, x15]\n" "usubl v27.8h, v27.8b, v12.8b\n" - "smlal v16.4s, v26.4h, v3.4h\n" - "smlal2 v22.4s, v26.8h, v3.8h\n" - "ldr d26, [x26, x5]\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal v22.4s, v26.4h, v3.4h\n" + "ldr x22, [x12, #0xa8]\n" + "ldr x19, [x12, #0xa0]\n" + "smlal2 v21.4s, v26.8h, v3.8h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr d26, [x20, x15]\n" "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v23.4s, v27.4h, v4.4h\n" + "smlal v22.4s, v25.4h, v0.4h\n" + "ldr x21, [x12, #0xb0]\n" + "ldr x20, [x12, #0xb8]\n" + "smlal2 v21.4s, v25.8h, v0.8h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "ldr d27, [x19, x15]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v23.4s, v28.4h, v1.4h\n" "smlal v15.4s, v25.4h, v6.4h\n" + "ldr x19, [x12, #0xc0]\n" + "ldr q19, [x13, #0x0]\n" "smlal2 v10.4s, v25.8h, v6.8h\n" - "smlal v16.4s, v25.4h, v0.4h\n" - "smlal2 v22.4s, v25.8h, v0.8h\n" - "ldr d25, [x25, x5]\n" + "smlal v22.4s, v29.4h, v4.4h\n" + "ldr d25, [x23, x15]\n" "usubl v25.8h, v25.8b, v12.8b\n" - "smlal v16.4s, v29.4h, v4.4h\n" - "smlal2 v22.4s, v29.8h, v4.8h\n" - "ldr d29, [x24, x5]\n" + "smlal2 v21.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x15]\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v23.4s, v26.4h, v5.4h\n" "smlal v15.4s, v24.4h, v7.4h\n" + "ldr q0, [x11, #0x0]\n" + "ldr q4, [x13, #0x10]\n" "smlal2 v10.4s, v24.8h, v7.8h\n" - "smlal v16.4s, v24.4h, v1.4h\n" - "smlal2 v22.4s, v24.8h, v1.8h\n" - "ldr d24, [x22, x5]\n" - "usubl v24.8h, v24.8b, v12.8b\n" - "smlal v17.4s, v27.4h, v4.4h\n" - "smlal2 v18.4s, v27.8h, v4.8h\n" - "ldr d27, [x23, x5]\n" - "usubl v27.8h, v27.8b, v12.8b\n" - "smlal v20.4s, v28.4h, v7.4h\n" - "smlal2 v23.4s, v28.8h, v7.8h\n" - "smlal v17.4s, v28.4h, v1.4h\n" - "smlal2 v18.4s, v28.8h, v1.8h\n" - "smlal v16.4s, v25.4h, v6.4h\n" - "smlal2 v22.4s, v25.8h, v6.8h\n" - "ldr d25, [x20, x5]\n" - "usubl v25.8h, v25.8b, v12.8b\n" - "smlal v17.4s, v26.4h, v5.4h\n" + "smlal v22.4s, v24.4h, v1.4h\n" + "sqrdmulh v15.4s, v15.4s, v19.4s\n" + "ldr q31, [x11, #0x10]\n" + "smlal2 v21.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x15]\n" "smlal2 v18.4s, v26.8h, v5.8h\n" - "ldr d26, [x21, x5]\n" - "usubl v26.8h, v26.8b, v12.8b\n" - "smlal v20.4s, v29.4h, v8.4h\n" - "smlal2 v23.4s, v29.8h, v8.8h\n" - "smlal v17.4s, v29.4h, v2.4h\n" + "usubl v24.8h, 
v24.8b, v12.8b\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "ldr d26, [x21, x15]\n" "smlal2 v18.4s, v29.8h, v2.8h\n" - "ldr d29, [x19, x5]\n" - "add x5, x5, #0x8\n" - "smlal v16.4s, v27.4h, v7.4h\n" - "usubl v29.8h, v29.8b, v12.8b\n" - "smlal2 v22.4s, v27.8h, v7.8h\n" - "smlal v17.4s, v24.4h, v3.4h\n" - "smlal v16.4s, v24.4h, v5.4h\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v22.4s, v25.4h, v6.4h\n" + "smlal v23.4s, v24.4h, v3.4h\n" + "and v30.16b, v15.16b, v0.16b\n" + "tst x8, #0x7\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "smlal2 v16.4s, v28.8h, v7.8h\n" + "sqrdmulh v10.4s, v10.4s, v4.4s\n" + "add x13, x13, #0x20\n" + "smlal2 v21.4s, v25.8h, v6.8h\n" + "ldr d25, [x20, x15]\n" "smlal2 v18.4s, v24.8h, v3.8h\n" - "sqrdmulh v15.4s, v15.4s, v21.4s\n" - "smlal2 v22.4s, v24.8h, v5.8h\n" - "smlal v17.4s, v26.4h, v7.4h\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v22.4s, v27.4h, v7.4h\n" + "smlal v23.4s, v26.4h, v7.4h\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "add x11, x11, #0x20\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v16.4s, v29.8h, v8.8h\n" + "ldr d29, [x19, x15]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal2 v21.4s, v27.8h, v7.8h\n" "smlal2 v18.4s, v26.8h, v7.8h\n" - "smlal v16.4s, v25.4h, v8.4h\n" - "smlal2 v22.4s, v25.8h, v8.8h\n" - "smlal v17.4s, v25.4h, v6.4h\n" + "sqrdmulh v9.4s, v9.4s, v19.4s\n" + "add x15, x15, #0x8\n" + "smlal v22.4s, v24.4h, v5.4h\n" + "smlal v23.4s, v25.4h, v6.4h\n" + "and v28.16b, v9.16b, v0.16b\n" + "smlal2 v21.4s, v24.8h, v5.8h\n" "smlal2 v18.4s, v25.8h, v6.8h\n" - "and v26.16b, v15.16b, v30.16b\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "smlal v17.4s, v29.4h, v8.4h\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "smlal v22.4s, v25.4h, v8.4h\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "sqrdmulh v22.4s, v22.4s, v19.4s\n" + "smlal2 v21.4s, v25.8h, v8.8h\n" "smlal2 v18.4s, v29.8h, v8.8h\n" - "sqrdmulh v10.4s, v10.4s, v31.4s\n" - "sqrdmulh v20.4s, v20.4s, v21.4s\n" - "sqrdmulh v23.4s, v23.4s, v31.4s\n" - "sqrdmulh v16.4s, v16.4s, v21.4s\n" - "sqadd v15.4s, v15.4s, v26.4s\n" - "and v8.16b, v10.16b, v9.16b\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "srshl v15.4s, v15.4s, v30.4s\n" - "and v4.16b, v20.16b, v30.16b\n" + "sqrdmulh v23.4s, v23.4s, v19.4s\n" + "and v29.16b, v22.16b, v0.16b\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "and v20.16b, v23.16b, v0.16b\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "and v19.16b, v10.16b, v31.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "and v4.16b, v16.16b, v31.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v5.16b, v21.16b, v31.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v26.16b, v18.16b, v31.16b\n" + "sqadd v15.4s, v15.4s, v30.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v28.4s\n" "sshr v4.4s, v4.4s, #0x1f\n" - "and v2.16b, v23.16b, v9.16b\n" - "and v1.16b, v16.16b, v30.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "add v15.4s, v15.4s, v11.4s\n" - "sqadd v10.4s, v10.4s, v8.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqrdmulh v22.4s, v22.4s, v31.4s\n" - "sqadd v20.4s, v20.4s, v4.4s\n" - "smin v15.4s, v15.4s, v14.4s\n" - "srshl v10.4s, v10.4s, v9.4s\n" - "sqadd v23.4s, v23.4s, v2.4s\n" - "smax v15.4s, v15.4s, v19.4s\n" - "srshl v20.4s, v20.4s, v30.4s\n" - "add v10.4s, v10.4s, v11.4s\n" - "srshl v23.4s, v23.4s, v9.4s\n" - "sqadd v16.4s, v16.4s, v1.4s\n" - "smin v10.4s, v10.4s, v14.4s\n" - "add v20.4s, v20.4s, v11.4s\n" - "add v23.4s, v23.4s, v11.4s\n" - "smax v10.4s, v10.4s, v19.4s\n" - "smin v20.4s, v20.4s, v14.4s\n" - "smin v23.4s, v23.4s, v14.4s\n" - "uzp1 v15.16b, v15.16b, v10.16b\n" - "smax v20.4s, v20.4s, v19.4s\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "sshr 
v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v20.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v19.4s\n" + "srshl v9.4s, v9.4s, v0.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "sqadd v18.4s, v18.4s, v26.4s\n" + "srshl v10.4s, v10.4s, v31.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v16.4s, v16.4s, v31.4s\n" + "sqxtn v9.4h, v9.4s\n" + "srshl v21.4s, v21.4s, v31.4s\n" + "sqxtn v22.4h, v22.4s\n" + "srshl v18.4s, v18.4s, v31.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v15.8h, v10.4s\n" + "sqxtn2 v9.8h, v16.4s\n" + "sqxtn2 v22.8h, v21.4s\n" + "sqxtn2 v23.8h, v18.4s\n" + "sqadd v15.8h, v15.8h, v11.8h\n" + "sqadd v9.8h, v9.8h, v11.8h\n" + "sqadd v22.8h, v22.8h, v11.8h\n" + "sqadd v23.8h, v23.8h, v11.8h\n" + "smax v15.8h, v15.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smax v22.8h, v22.8h, v17.8h\n" + "smax v23.8h, v23.8h, v17.8h\n" + "smin v15.8h, v15.8h, v14.8h\n" + "smin v9.8h, v9.8h, v14.8h\n" + "smin v22.8h, v22.8h, v14.8h\n" + "smin v23.8h, v23.8h, v14.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" - "str d15, [x14, x7]\n" - "smax v23.4s, v23.4s, v19.4s\n" - "srshl v16.4s, v16.4s, v30.4s\n" - "and v24.16b, v22.16b, v9.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "uzp1 v20.16b, v20.16b, v23.16b\n" - "add v16.4s, v16.4s, v11.4s\n" - "sqrdmulh v17.4s, v17.4s, v21.4s\n" - "uzp1 v20.16b, v20.16b, v20.16b\n" - "str d20, [x13, x7]\n" - "smin v16.4s, v16.4s, v14.4s\n" - "sqrdmulh v18.4s, v18.4s, v31.4s\n" - "sqadd v22.4s, v22.4s, v24.4s\n" - "and v2.16b, v17.16b, v30.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "smax v16.4s, v16.4s, v19.4s\n" - "srshl v22.4s, v22.4s, v9.4s\n" - "and v31.16b, v18.16b, v9.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "add v22.4s, v22.4s, v11.4s\n" - "sqadd v17.4s, v17.4s, v2.4s\n" - "smin v22.4s, v22.4s, v14.4s\n" - "srshl v17.4s, v17.4s, v30.4s\n" - "sqadd v18.4s, v18.4s, v31.4s\n" - "smax v22.4s, v22.4s, v19.4s\n" - "uzp1 v16.16b, v16.16b, v22.16b\n" - "add v17.4s, v17.4s, v11.4s\n" - "srshl v18.4s, v18.4s, v9.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "str d16, [x12, x7]\n" - "smin v17.4s, v17.4s, v14.4s\n" - "add v18.4s, v18.4s, v11.4s\n" - "smax v17.4s, v17.4s, v19.4s\n" - "smin v18.4s, v18.4s, v14.4s\n" - "smax v18.4s, v18.4s, v19.4s\n" - "uzp1 v17.16b, v17.16b, v18.16b\n" - "uzp1 v17.16b, v17.16b, v17.16b\n" - "str d17, [x11, x7]\n" - "add x7, x7, #0x8\n" + "str d15, [x10, x14]\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str d9, [x9, x14]\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d22, [x28, x14]\n" + "str d23, [x27, x14]\n" + "add x14, x14, #0x8\n" "beq 88f\n" - "add x6, x6, #0x48\n" + "add x17, x17, #0x48\n" "3:" // Oddments "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" - "tbz x4, #2, 5f\n" + "tbz x8, #2, 5f\n" "ld1 { v15.4s }, [x19], #0x10\n" - "tbz x4, #1, 4f\n" + "tbz x8, #1, 4f\n" "ld1 { v10.d }[0], [x19], #0x8\n" - "tbz x4, #0, 7f\n" + "tbz x8, #0, 7f\n" "ld1 { v10.s }[2], [x19]\n" "b 7f\n" "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset - "tbz x4, #0, 7f\n" + "tbz x8, #0, 7f\n" "ld1 { v10.s }[0], [x19]\n" "b 7f\n" "5:" // Oddments: Load bias: Bit 2: Unset - "tbz x4, #1, 6f\n" + "tbz x8, #1, 6f\n" "ld1 { v15.d }[0], [x19], #0x8\n" - "tbz x4, #0, 7f\n" + "tbz x8, #0, 7f\n" "ld1 { v15.s }[2], [x19]\n" "b 7f\n" "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 7f\n" + "tbz x8, #0, 7f\n" "ld1 { v15.s }[0], [x19]\n" "7:" // Oddments: Load bias: Bit 2: End - 
"mov v20.16b, v15.16b\n" - "ldr d0, [x6, #0x0]\n" - "mov v23.16b, v10.16b\n" - "ldr d1, [x6, #0x8]\n" - "mov v16.16b, v15.16b\n" - "ldr d2, [x6, #0x10]\n" - "mov v22.16b, v10.16b\n" - "ldr d3, [x6, #0x18]\n" - "mov v17.16b, v15.16b\n" - "ldr d4, [x6, #0x20]\n" - "usubl v0.8h, v0.8b, v13.8b\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "mov v9.16b, v15.16b\n" + "mov v16.16b, v10.16b\n" + "ldr d2, [x17, #0x10]\n" + "ldr d3, [x17, #0x18]\n" + "mov v22.16b, v15.16b\n" + "mov v21.16b, v10.16b\n" + "ldr d4, [x17, #0x20]\n" + "ldr d5, [x17, #0x28]\n" + "mov v23.16b, v15.16b\n" "mov v18.16b, v10.16b\n" - "ldr d5, [x6, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "ldr d7, [x17, #0x38]\n" + "usubl v0.8h, v0.8b, v13.8b\n" "usubl v1.8h, v1.8b, v13.8b\n" - "ldr d6, [x6, #0x30]\n" + "ldr d8, [x17, #0x40]\n" + "ldp x26, x25, [x12, #0x0]\n" "usubl v2.8h, v2.8b, v13.8b\n" - "ldr d7, [x6, #0x38]\n" "usubl v3.8h, v3.8b, v13.8b\n" - "ldr d8, [x6, #0x40]\n" + "ldp x24, x23, [x12, #0x10]\n" + "ldp x22, x21, [x12, #0x20]\n" "usubl v4.8h, v4.8b, v13.8b\n" - "ldp x26, x25, [x8, #0x0]\n" "usubl v5.8h, v5.8b, v13.8b\n" - "ldp x24, x23, [x8, #0x10]\n" + "ldp x20, x19, [x12, #0x30]\n" "usubl v6.8h, v6.8b, v13.8b\n" "usubl v7.8h, v7.8b, v13.8b\n" - "ldp x22, x21, [x8, #0x20]\n" "usubl v8.8h, v8.8b, v13.8b\n" - "ldp x20, x19, [x8, #0x30]\n" - "add x26, x26, x5\n" - "add x25, x25, x5\n" - "add x24, x24, x5\n" - "add x23, x23, x5\n" - "add x22, x22, x5\n" - "add x21, x21, x5\n" - "add x20, x20, x5\n" - "add x19, x19, x5\n" - "tbz x4, #2, 9f\n" + "add x26, x26, x15\n" + "add x25, x25, x15\n" + "add x24, x24, x15\n" + "add x23, x23, x15\n" + "add x22, x22, x15\n" + "add x21, x21, x15\n" + "add x20, x20, x15\n" + "add x19, x19, x15\n" + "tbz x8, #2, 9f\n" "ld1 { v31.s }[0], [x26], #0x4\n" "ld1 { v30.s }[0], [x25], #0x4\n" "ld1 { v29.s }[0], [x24], #0x4\n" @@ -713,7 +697,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v26.s }[0], [x21], #0x4\n" "ld1 { v25.s }[0], [x20], #0x4\n" "ld1 { v24.s }[0], [x19], #0x4\n" - "tbz x4, #1, 8f\n" + "tbz x8, #1, 8f\n" "ld1 { v31.h }[2], [x26], #0x2\n" "ld1 { v30.h }[2], [x25], #0x2\n" "ld1 { v29.h }[2], [x24], #0x2\n" @@ -722,7 +706,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v26.h }[2], [x21], #0x2\n" "ld1 { v25.h }[2], [x20], #0x2\n" "ld1 { v24.h }[2], [x19], #0x2\n" - "tbz x4, #0, 11f\n" + "tbz x8, #0, 11f\n" "ld1 { v31.b }[6], [x26]\n" "ld1 { v30.b }[6], [x25]\n" "ld1 { v29.b }[6], [x24]\n" @@ -733,7 +717,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v24.b }[6], [x19]\n" "b 11f\n" "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset - "tbz x4, #0, 11f\n" + "tbz x8, #0, 11f\n" "ld1 { v31.b }[4], [x26]\n" "ld1 { v30.b }[4], [x25]\n" "ld1 { v29.b }[4], [x24]\n" @@ -744,7 +728,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v24.b }[4], [x19]\n" "b 11f\n" "9:" // Oddments: Initial loads: Bit 2: Unset - "tbz x4, #1, 10f\n" + "tbz x8, #1, 10f\n" "ld1 { v31.h }[0], [x26], #0x2\n" "ld1 { v30.h }[0], [x25], #0x2\n" "ld1 { v29.h }[0], [x24], #0x2\n" @@ -753,7 +737,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v26.h }[0], [x21], #0x2\n" "ld1 { v25.h }[0], [x20], #0x2\n" "ld1 { v24.h }[0], [x19], #0x2\n" - "tbz x4, #0, 11f\n" + "tbz x8, #0, 11f\n" "ld1 { v31.b }[2], [x26]\n" "ld1 { v30.b }[2], [x25]\n" "ld1 { v29.b }[2], [x24]\n" @@ -764,7 +748,7 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v24.b }[2], [x19]\n" "b 11f\n" "10:" // Oddments: Initial loads: Bit 2: Unset: 
Bit 1: Unset - "tbz x4, #0, 11f\n" + "tbz x8, #0, 11f\n" "ld1 { v31.b }[0], [x26]\n" "ld1 { v30.b }[0], [x25]\n" "ld1 { v29.b }[0], [x24]\n" @@ -774,646 +758,636 @@ void a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v25.b }[0], [x20]\n" "ld1 { v24.b }[0], [x19]\n" "11:" // Oddments: Initial loads: Bit 2: End - "ldr x23, [x8, #0x40]\n" "usubl v31.8h, v31.8b, v12.8b\n" "smlal v15.4s, v31.4h, v8.4h\n" - "usubl v30.8h, v30.8b, v12.8b\n" "smlal2 v10.4s, v31.8h, v8.8h\n" - "usubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v31.4h, v6.4h\n" - "usubl v28.8h, v28.8b, v12.8b\n" - "smlal2 v23.4s, v31.8h, v6.8h\n" - "usubl v27.8h, v27.8b, v12.8b\n" - "smlal v16.4s, v31.4h, v2.4h\n" - "usubl v26.8h, v26.8b, v12.8b\n" - "smlal2 v22.4s, v31.8h, v2.8h\n" - "usubl v25.8h, v25.8b, v12.8b\n" - "smlal v17.4s, v31.4h, v0.4h\n" - "usubl v24.8h, v24.8b, v12.8b\n" - "smlal2 v18.4s, v31.8h, v0.8h\n" - "add x23, x23, x5\n" + "ldr x24, [x12, #0x40]\n" + "usubl v30.8h, v30.8b, v12.8b\n" "smlal v15.4s, v30.4h, v0.4h\n" "smlal2 v10.4s, v30.8h, v0.8h\n" - "smlal v20.4s, v28.4h, v1.4h\n" - "smlal2 v23.4s, v28.8h, v1.8h\n" + "add x24, x24, x15\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v9.4s, v31.4h, v6.4h\n" + "smlal2 v16.4s, v31.8h, v6.8h\n" "smlal v15.4s, v29.4h, v1.4h\n" "smlal2 v10.4s, v29.8h, v1.8h\n" - "smlal v20.4s, v27.4h, v2.4h\n" - "smlal2 v23.4s, v27.8h, v2.8h\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" "smlal v15.4s, v26.4h, v3.4h\n" "smlal2 v10.4s, v26.8h, v3.8h\n" - "smlal v20.4s, v24.4h, v0.4h\n" - "smlal2 v23.4s, v24.8h, v0.8h\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v9.4s, v27.4h, v2.4h\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" "smlal v15.4s, v25.4h, v4.4h\n" "smlal2 v10.4s, v25.8h, v4.8h\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "smlal v22.4s, v31.4h, v2.4h\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" "smlal v15.4s, v24.4h, v2.4h\n" "smlal2 v10.4s, v24.8h, v2.8h\n" - "tbz x4, #2, 13f\n" - "ld1 { v29.s }[0], [x23], #0x4\n" - "tbz x4, #1, 12f\n" - "ld1 { v29.h }[2], [x23], #0x2\n" - "tbz x4, #0, 15f\n" - "ld1 { v29.b }[6], [x23]\n" + "smlal v9.4s, v24.4h, v0.4h\n" + "smlal2 v16.4s, v24.8h, v0.8h\n" + "tbz x8, #2, 13f\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "tbz x8, #1, 12f\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[6], [x24]\n" "b 15f\n" "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 15f\n" - "ld1 { v29.b }[4], [x23]\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[4], [x24]\n" "b 15f\n" "13:" // Oddments: Load (1, 3): Bit 2: Unset - "tbz x4, #1, 14f\n" - "ld1 { v29.h }[0], [x23], #0x2\n" - "tbz x4, #0, 15f\n" - "ld1 { v29.b }[2], [x23]\n" + "tbz x8, #1, 14f\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[2], [x24]\n" "b 15f\n" "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 15f\n" - "ld1 { v29.b }[0], [x23]\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[0], [x24]\n" "15:" // Oddments: Load (1, 3): Bit 2: End - "ldr x22, [x8, #0x48]\n" "usubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v29.4h, v4.4h\n" - "smlal2 v23.4s, v29.8h, v4.8h\n" - "add x22, x22, x5\n" - "tbz x4, #2, 17f\n" - "ld1 { v28.s }[0], [x22], #0x4\n" - "tbz x4, #1, 16f\n" - "ld1 { v28.h }[2], [x22], #0x2\n" - "tbz x4, #0, 19f\n" - "ld1 { v28.b }[6], [x22]\n" + "ldr x23, [x12, #0x48]\n" + "smlal v9.4s, v29.4h, v4.4h\n" + "smlal2 v16.4s, 
v29.8h, v4.8h\n" + "add x23, x23, x15\n" + "tbz x8, #2, 17f\n" + "ld1 { v28.s }[0], [x23], #0x4\n" + "tbz x8, #1, 16f\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[6], [x23]\n" "b 19f\n" "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 19f\n" - "ld1 { v28.b }[4], [x22]\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[4], [x23]\n" "b 19f\n" "17:" // Oddments: Load (1, 4): Bit 2: Unset - "tbz x4, #1, 18f\n" - "ld1 { v28.h }[0], [x22], #0x2\n" - "tbz x4, #0, 19f\n" - "ld1 { v28.b }[2], [x22]\n" + "tbz x8, #1, 18f\n" + "ld1 { v28.h }[0], [x23], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[2], [x23]\n" "b 19f\n" "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 19f\n" - "ld1 { v28.b }[0], [x22]\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[0], [x23]\n" "19:" // Oddments: Load (1, 4): Bit 2: End - "ldr x21, [x8, #0x50]\n" "usubl v28.8h, v28.8b, v12.8b\n" - "smlal v20.4s, v28.4h, v5.4h\n" - "smlal2 v23.4s, v28.8h, v5.8h\n" - "add x21, x21, x5\n" - "tbz x4, #2, 21f\n" + "ldr x21, [x12, #0x50]\n" + "smlal v9.4s, v28.4h, v5.4h\n" + "smlal2 v16.4s, v28.8h, v5.8h\n" + "add x21, x21, x15\n" + "tbz x8, #2, 21f\n" "ld1 { v27.s }[0], [x21], #0x4\n" - "tbz x4, #1, 20f\n" + "tbz x8, #1, 20f\n" "ld1 { v27.h }[2], [x21], #0x2\n" - "tbz x4, #0, 23f\n" + "tbz x8, #0, 23f\n" "ld1 { v27.b }[6], [x21]\n" "b 23f\n" "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 23f\n" + "tbz x8, #0, 23f\n" "ld1 { v27.b }[4], [x21]\n" "b 23f\n" "21:" // Oddments: Load (1, 2): Bit 2: Unset - "tbz x4, #1, 22f\n" + "tbz x8, #1, 22f\n" "ld1 { v27.h }[0], [x21], #0x2\n" - "tbz x4, #0, 23f\n" + "tbz x8, #0, 23f\n" "ld1 { v27.b }[2], [x21]\n" "b 23f\n" "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 23f\n" + "tbz x8, #0, 23f\n" "ld1 { v27.b }[0], [x21]\n" "23:" // Oddments: Load (1, 2): Bit 2: End - "ldr x20, [x8, #0x58]\n" "usubl v27.8h, v27.8b, v12.8b\n" + "ldr x19, [x12, #0x58]\n" "smlal v15.4s, v27.4h, v5.4h\n" "smlal2 v10.4s, v27.8h, v5.8h\n" - "add x20, x20, x5\n" - "smlal v20.4s, v27.4h, v3.4h\n" - "smlal2 v23.4s, v27.8h, v3.8h\n" - "tbz x4, #2, 25f\n" - "ld1 { v26.s }[0], [x20], #0x4\n" - "tbz x4, #1, 24f\n" - "ld1 { v26.h }[2], [x20], #0x2\n" - "tbz x4, #0, 27f\n" - "ld1 { v26.b }[6], [x20]\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 25f\n" + "ld1 { v26.s }[0], [x19], #0x4\n" + "tbz x8, #1, 24f\n" + "ld1 { v26.h }[2], [x19], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v26.b }[6], [x19]\n" "b 27f\n" "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 27f\n" - "ld1 { v26.b }[4], [x20]\n" + "tbz x8, #0, 27f\n" + "ld1 { v26.b }[4], [x19]\n" "b 27f\n" "25:" // Oddments: Load (3, 0): Bit 2: Unset - "tbz x4, #1, 26f\n" - "ld1 { v26.h }[0], [x20], #0x2\n" - "tbz x4, #0, 27f\n" - "ld1 { v26.b }[2], [x20]\n" + "tbz x8, #1, 26f\n" + "ld1 { v26.h }[0], [x19], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v26.b }[2], [x19]\n" "b 27f\n" "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 27f\n" - "ld1 { v26.b }[0], [x20]\n" + "tbz x8, #0, 27f\n" + "ld1 { v26.b }[0], [x19]\n" "27:" // Oddments: Load (3, 0): Bit 2: End - "ldr x19, [x8, #0x60]\n" "usubl v26.8h, v26.8b, v12.8b\n" - "smlal v16.4s, v26.4h, v3.4h\n" - "smlal2 v22.4s, v26.8h, v3.8h\n" - "add x19, x19, x5\n" - "tbz x4, #2, 29f\n" - "ld1 { v25.s }[0], [x19], #0x4\n" - "tbz x4, #1, 28f\n" - "ld1 { v25.h }[2], [x19], #0x2\n" - "tbz x4, #0, 31f\n" - "ld1 { v25.b }[6], [x19]\n" + "ldr x20, 
[x12, #0x60]\n" + "smlal v22.4s, v26.4h, v3.4h\n" + "smlal2 v21.4s, v26.8h, v3.8h\n" + "add x20, x20, x15\n" + "tbz x8, #2, 29f\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "tbz x8, #1, 28f\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[6], [x20]\n" "b 31f\n" "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 31f\n" - "ld1 { v25.b }[4], [x19]\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[4], [x20]\n" "b 31f\n" "29:" // Oddments: Load (2, 0): Bit 2: Unset - "tbz x4, #1, 30f\n" - "ld1 { v25.h }[0], [x19], #0x2\n" - "tbz x4, #0, 31f\n" - "ld1 { v25.b }[2], [x19]\n" + "tbz x8, #1, 30f\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[2], [x20]\n" "b 31f\n" "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 31f\n" - "ld1 { v25.b }[0], [x19]\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[0], [x20]\n" "31:" // Oddments: Load (2, 0): Bit 2: End - "ldr x10, [x8, #0x68]\n" "usubl v25.8h, v25.8b, v12.8b\n" + "ldr x19, [x12, #0x68]\n" "smlal v15.4s, v25.4h, v6.4h\n" "smlal2 v10.4s, v25.8h, v6.8h\n" - "add x10, x10, x5\n" - "smlal v16.4s, v25.4h, v0.4h\n" - "smlal2 v22.4s, v25.8h, v0.8h\n" - "tbz x4, #2, 33f\n" - "ld1 { v29.s }[0], [x10], #0x4\n" - "tbz x4, #1, 32f\n" - "ld1 { v29.h }[2], [x10], #0x2\n" - "tbz x4, #0, 35f\n" - "ld1 { v29.b }[6], [x10]\n" + "smlal v22.4s, v25.4h, v0.4h\n" + "smlal2 v21.4s, v25.8h, v0.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 33f\n" + "ld1 { v29.s }[0], [x19], #0x4\n" + "tbz x8, #1, 32f\n" + "ld1 { v29.h }[2], [x19], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[6], [x19]\n" "b 35f\n" "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 35f\n" - "ld1 { v29.b }[4], [x10]\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[4], [x19]\n" "b 35f\n" "33:" // Oddments: Load (3, 1): Bit 2: Unset - "tbz x4, #1, 34f\n" - "ld1 { v29.h }[0], [x10], #0x2\n" - "tbz x4, #0, 35f\n" - "ld1 { v29.b }[2], [x10]\n" + "tbz x8, #1, 34f\n" + "ld1 { v29.h }[0], [x19], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[2], [x19]\n" "b 35f\n" "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 35f\n" - "ld1 { v29.b }[0], [x10]\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[0], [x19]\n" "35:" // Oddments: Load (3, 1): Bit 2: End - "ldr x9, [x8, #0x70]\n" "usubl v29.8h, v29.8b, v12.8b\n" - "smlal v16.4s, v29.4h, v4.4h\n" - "smlal2 v22.4s, v29.8h, v4.8h\n" - "add x9, x9, x5\n" - "tbz x4, #2, 37f\n" - "ld1 { v24.s }[0], [x9], #0x4\n" - "tbz x4, #1, 36f\n" - "ld1 { v24.h }[2], [x9], #0x2\n" - "tbz x4, #0, 39f\n" - "ld1 { v24.b }[6], [x9]\n" + "ldr x19, [x12, #0x70]\n" + "smlal v22.4s, v29.4h, v4.4h\n" + "smlal2 v21.4s, v29.8h, v4.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 37f\n" + "ld1 { v24.s }[0], [x19], #0x4\n" + "tbz x8, #1, 36f\n" + "ld1 { v24.h }[2], [x19], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[6], [x19]\n" "b 39f\n" "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 39f\n" - "ld1 { v24.b }[4], [x9]\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[4], [x19]\n" "b 39f\n" "37:" // Oddments: Load (2, 1): Bit 2: Unset - "tbz x4, #1, 38f\n" - "ld1 { v24.h }[0], [x9], #0x2\n" - "tbz x4, #0, 39f\n" - "ld1 { v24.b }[2], [x9]\n" + "tbz x8, #1, 38f\n" + "ld1 { v24.h }[0], [x19], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[2], [x19]\n" "b 39f\n" "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 39f\n" - "ld1 { v24.b }[0], [x9]\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[0], [x19]\n" "39:" // Oddments: Load (2, 1): Bit 2: End - "ldr x28, [x8, #0x78]\n" "usubl 
v24.8h, v24.8b, v12.8b\n" + "ldr x22, [x12, #0x78]\n" "smlal v15.4s, v24.4h, v7.4h\n" "smlal2 v10.4s, v24.8h, v7.8h\n" - "add x28, x28, x5\n" - "smlal v16.4s, v24.4h, v1.4h\n" - "smlal2 v22.4s, v24.8h, v1.8h\n" - "tbz x4, #2, 41f\n" - "ld1 { v27.s }[0], [x28], #0x4\n" - "tbz x4, #1, 40f\n" - "ld1 { v27.h }[2], [x28], #0x2\n" - "tbz x4, #0, 43f\n" - "ld1 { v27.b }[6], [x28]\n" + "smlal v22.4s, v24.4h, v1.4h\n" + "smlal2 v21.4s, v24.8h, v1.8h\n" + "add x22, x22, x15\n" + "tbz x8, #2, 41f\n" + "ld1 { v27.s }[0], [x22], #0x4\n" + "tbz x8, #1, 40f\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[6], [x22]\n" "b 43f\n" "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 43f\n" - "ld1 { v27.b }[4], [x28]\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[4], [x22]\n" "b 43f\n" "41:" // Oddments: Load (3, 3): Bit 2: Unset - "tbz x4, #1, 42f\n" - "ld1 { v27.h }[0], [x28], #0x2\n" - "tbz x4, #0, 43f\n" - "ld1 { v27.b }[2], [x28]\n" + "tbz x8, #1, 42f\n" + "ld1 { v27.h }[0], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[2], [x22]\n" "b 43f\n" "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 43f\n" - "ld1 { v27.b }[0], [x28]\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[0], [x22]\n" "43:" // Oddments: Load (3, 3): Bit 2: End - "ldr x27, [x8, #0x80]\n" "usubl v27.8h, v27.8b, v12.8b\n" - "smlal v17.4s, v27.4h, v4.4h\n" + "ldr x21, [x12, #0x80]\n" + "smlal v23.4s, v27.4h, v4.4h\n" "smlal2 v18.4s, v27.8h, v4.8h\n" - "add x27, x27, x5\n" - "tbz x4, #2, 45f\n" - "ld1 { v28.s }[0], [x27], #0x4\n" - "tbz x4, #1, 44f\n" - "ld1 { v28.h }[2], [x27], #0x2\n" - "tbz x4, #0, 47f\n" - "ld1 { v28.b }[6], [x27]\n" + "add x21, x21, x15\n" + "tbz x8, #2, 45f\n" + "ld1 { v28.s }[0], [x21], #0x4\n" + "tbz x8, #1, 44f\n" + "ld1 { v28.h }[2], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[6], [x21]\n" "b 47f\n" "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 47f\n" - "ld1 { v28.b }[4], [x27]\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[4], [x21]\n" "b 47f\n" "45:" // Oddments: Load (2, 3): Bit 2: Unset - "tbz x4, #1, 46f\n" - "ld1 { v28.h }[0], [x27], #0x2\n" - "tbz x4, #0, 47f\n" - "ld1 { v28.b }[2], [x27]\n" + "tbz x8, #1, 46f\n" + "ld1 { v28.h }[0], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[2], [x21]\n" "b 47f\n" "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 47f\n" - "ld1 { v28.b }[0], [x27]\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[0], [x21]\n" "47:" // Oddments: Load (2, 3): Bit 2: End - "ldr x26, [x8, #0x88]\n" "usubl v28.8h, v28.8b, v12.8b\n" - "smlal v20.4s, v28.4h, v7.4h\n" - "smlal2 v23.4s, v28.8h, v7.8h\n" - "add x26, x26, x5\n" - "smlal v17.4s, v28.4h, v1.4h\n" + "ldr x20, [x12, #0x88]\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "smlal2 v16.4s, v28.8h, v7.8h\n" + "smlal v23.4s, v28.4h, v1.4h\n" "smlal2 v18.4s, v28.8h, v1.8h\n" - "tbz x4, #2, 49f\n" - "ld1 { v26.s }[0], [x26], #0x4\n" - "tbz x4, #1, 48f\n" - "ld1 { v26.h }[2], [x26], #0x2\n" - "tbz x4, #0, 51f\n" - "ld1 { v26.b }[6], [x26]\n" + "add x20, x20, x15\n" + "tbz x8, #2, 49f\n" + "ld1 { v26.s }[0], [x20], #0x4\n" + "tbz x8, #1, 48f\n" + "ld1 { v26.h }[2], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[6], [x20]\n" "b 51f\n" "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 51f\n" - "ld1 { v26.b }[4], [x26]\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[4], [x20]\n" "b 51f\n" "49:" // Oddments: Load (3, 4): Bit 2: Unset - "tbz x4, #1, 50f\n" - "ld1 { v26.h }[0], [x26], #0x2\n" - "tbz x4, #0, 51f\n" - "ld1 { v26.b 
}[2], [x26]\n" + "tbz x8, #1, 50f\n" + "ld1 { v26.h }[0], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[2], [x20]\n" "b 51f\n" "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 51f\n" - "ld1 { v26.b }[0], [x26]\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[0], [x20]\n" "51:" // Oddments: Load (3, 4): Bit 2: End - "ldr x25, [x8, #0x90]\n" "usubl v26.8h, v26.8b, v12.8b\n" - "smlal v17.4s, v26.4h, v5.4h\n" + "ldr x23, [x12, #0x90]\n" + "smlal v23.4s, v26.4h, v5.4h\n" "smlal2 v18.4s, v26.8h, v5.8h\n" - "add x25, x25, x5\n" - "tbz x4, #2, 53f\n" - "ld1 { v25.s }[0], [x25], #0x4\n" - "tbz x4, #1, 52f\n" - "ld1 { v25.h }[2], [x25], #0x2\n" - "tbz x4, #0, 55f\n" - "ld1 { v25.b }[6], [x25]\n" + "add x23, x23, x15\n" + "tbz x8, #2, 53f\n" + "ld1 { v25.s }[0], [x23], #0x4\n" + "tbz x8, #1, 52f\n" + "ld1 { v25.h }[2], [x23], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[6], [x23]\n" "b 55f\n" "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 55f\n" - "ld1 { v25.b }[4], [x25]\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[4], [x23]\n" "b 55f\n" "53:" // Oddments: Load (4, 0): Bit 2: Unset - "tbz x4, #1, 54f\n" - "ld1 { v25.h }[0], [x25], #0x2\n" - "tbz x4, #0, 55f\n" - "ld1 { v25.b }[2], [x25]\n" + "tbz x8, #1, 54f\n" + "ld1 { v25.h }[0], [x23], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[2], [x23]\n" "b 55f\n" "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 55f\n" - "ld1 { v25.b }[0], [x25]\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[0], [x23]\n" "55:" // Oddments: Load (4, 0): Bit 2: End - "ldr x24, [x8, #0x98]\n" "usubl v25.8h, v25.8b, v12.8b\n" - "smlal v16.4s, v25.4h, v6.4h\n" - "smlal2 v22.4s, v25.8h, v6.8h\n" - "add x24, x24, x5\n" - "tbz x4, #2, 57f\n" + "ldr x24, [x12, #0x98]\n" + "smlal v22.4s, v25.4h, v6.4h\n" + "smlal2 v21.4s, v25.8h, v6.8h\n" + "add x24, x24, x15\n" + "tbz x8, #2, 57f\n" "ld1 { v29.s }[0], [x24], #0x4\n" - "tbz x4, #1, 56f\n" + "tbz x8, #1, 56f\n" "ld1 { v29.h }[2], [x24], #0x2\n" - "tbz x4, #0, 59f\n" + "tbz x8, #0, 59f\n" "ld1 { v29.b }[6], [x24]\n" "b 59f\n" "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 59f\n" + "tbz x8, #0, 59f\n" "ld1 { v29.b }[4], [x24]\n" "b 59f\n" "57:" // Oddments: Load (2, 4): Bit 2: Unset - "tbz x4, #1, 58f\n" + "tbz x8, #1, 58f\n" "ld1 { v29.h }[0], [x24], #0x2\n" - "tbz x4, #0, 59f\n" + "tbz x8, #0, 59f\n" "ld1 { v29.b }[2], [x24]\n" "b 59f\n" "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 59f\n" + "tbz x8, #0, 59f\n" "ld1 { v29.b }[0], [x24]\n" "59:" // Oddments: Load (2, 4): Bit 2: End - "ldr x23, [x8, #0xa0]\n" "usubl v29.8h, v29.8b, v12.8b\n" - "smlal v20.4s, v29.4h, v8.4h\n" - "smlal2 v23.4s, v29.8h, v8.8h\n" - "add x23, x23, x5\n" - "smlal v17.4s, v29.4h, v2.4h\n" + "ldr x19, [x12, #0xa0]\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v16.4s, v29.8h, v8.8h\n" + "smlal v23.4s, v29.4h, v2.4h\n" "smlal2 v18.4s, v29.8h, v2.8h\n" - "tbz x4, #2, 61f\n" - "ld1 { v27.s }[0], [x23], #0x4\n" - "tbz x4, #1, 60f\n" - "ld1 { v27.h }[2], [x23], #0x2\n" - "tbz x4, #0, 63f\n" - "ld1 { v27.b }[6], [x23]\n" + "add x19, x19, x15\n" + "tbz x8, #2, 61f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x8, #1, 60f\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[6], [x19]\n" "b 63f\n" "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 63f\n" - "ld1 { v27.b }[4], [x23]\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[4], [x19]\n" "b 63f\n" "61:" // Oddments: Load (4, 1): Bit 2: Unset - "tbz x4, #1, 62f\n" - "ld1 { 
v27.h }[0], [x23], #0x2\n" - "tbz x4, #0, 63f\n" - "ld1 { v27.b }[2], [x23]\n" + "tbz x8, #1, 62f\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[2], [x19]\n" "b 63f\n" "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 63f\n" - "ld1 { v27.b }[0], [x23]\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[0], [x19]\n" "63:" // Oddments: Load (4, 1): Bit 2: End - "ldr x22, [x8, #0xa8]\n" "usubl v27.8h, v27.8b, v12.8b\n" - "smlal v16.4s, v27.4h, v7.4h\n" - "smlal2 v22.4s, v27.8h, v7.8h\n" - "add x22, x22, x5\n" - "tbz x4, #2, 65f\n" + "ldr x22, [x12, #0xa8]\n" + "smlal v22.4s, v27.4h, v7.4h\n" + "smlal2 v21.4s, v27.8h, v7.8h\n" + "add x22, x22, x15\n" + "tbz x8, #2, 65f\n" "ld1 { v24.s }[0], [x22], #0x4\n" - "tbz x4, #1, 64f\n" + "tbz x8, #1, 64f\n" "ld1 { v24.h }[2], [x22], #0x2\n" - "tbz x4, #0, 67f\n" + "tbz x8, #0, 67f\n" "ld1 { v24.b }[6], [x22]\n" "b 67f\n" "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 67f\n" + "tbz x8, #0, 67f\n" "ld1 { v24.b }[4], [x22]\n" "b 67f\n" "65:" // Oddments: Load (3, 2): Bit 2: Unset - "tbz x4, #1, 66f\n" + "tbz x8, #1, 66f\n" "ld1 { v24.h }[0], [x22], #0x2\n" - "tbz x4, #0, 67f\n" + "tbz x8, #0, 67f\n" "ld1 { v24.b }[2], [x22]\n" "b 67f\n" "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 67f\n" + "tbz x8, #0, 67f\n" "ld1 { v24.b }[0], [x22]\n" "67:" // Oddments: Load (3, 2): Bit 2: End - "ldr x21, [x8, #0xb0]\n" "usubl v24.8h, v24.8b, v12.8b\n" - "smlal v16.4s, v24.4h, v5.4h\n" - "smlal2 v22.4s, v24.8h, v5.8h\n" - "add x21, x21, x5\n" - "smlal v17.4s, v24.4h, v3.4h\n" + "ldr x21, [x12, #0xb0]\n" + "smlal v22.4s, v24.4h, v5.4h\n" + "smlal2 v21.4s, v24.8h, v5.8h\n" + "smlal v23.4s, v24.4h, v3.4h\n" "smlal2 v18.4s, v24.8h, v3.8h\n" - "tbz x4, #2, 69f\n" + "add x21, x21, x15\n" + "tbz x8, #2, 69f\n" "ld1 { v26.s }[0], [x21], #0x4\n" - "tbz x4, #1, 68f\n" + "tbz x8, #1, 68f\n" "ld1 { v26.h }[2], [x21], #0x2\n" - "tbz x4, #0, 71f\n" + "tbz x8, #0, 71f\n" "ld1 { v26.b }[6], [x21]\n" "b 71f\n" "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 71f\n" + "tbz x8, #0, 71f\n" "ld1 { v26.b }[4], [x21]\n" "b 71f\n" "69:" // Oddments: Load (4, 3): Bit 2: Unset - "tbz x4, #1, 70f\n" + "tbz x8, #1, 70f\n" "ld1 { v26.h }[0], [x21], #0x2\n" - "tbz x4, #0, 71f\n" + "tbz x8, #0, 71f\n" "ld1 { v26.b }[2], [x21]\n" "b 71f\n" "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 71f\n" + "tbz x8, #0, 71f\n" "ld1 { v26.b }[0], [x21]\n" "71:" // Oddments: Load (4, 3): Bit 2: End - "ldr x20, [x8, #0xb8]\n" "usubl v26.8h, v26.8b, v12.8b\n" - "smlal v17.4s, v26.4h, v7.4h\n" + "ldr x20, [x12, #0xb8]\n" + "smlal v23.4s, v26.4h, v7.4h\n" "smlal2 v18.4s, v26.8h, v7.8h\n" - "add x20, x20, x5\n" - "tbz x4, #2, 73f\n" + "add x20, x20, x15\n" + "tbz x8, #2, 73f\n" "ld1 { v25.s }[0], [x20], #0x4\n" - "tbz x4, #1, 72f\n" + "tbz x8, #1, 72f\n" "ld1 { v25.h }[2], [x20], #0x2\n" - "tbz x4, #0, 75f\n" + "tbz x8, #0, 75f\n" "ld1 { v25.b }[6], [x20]\n" "b 75f\n" "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 75f\n" + "tbz x8, #0, 75f\n" "ld1 { v25.b }[4], [x20]\n" "b 75f\n" "73:" // Oddments: Load (4, 2): Bit 2: Unset - "tbz x4, #1, 74f\n" + "tbz x8, #1, 74f\n" "ld1 { v25.h }[0], [x20], #0x2\n" - "tbz x4, #0, 75f\n" + "tbz x8, #0, 75f\n" "ld1 { v25.b }[2], [x20]\n" "b 75f\n" "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 75f\n" + "tbz x8, #0, 75f\n" "ld1 { v25.b }[0], [x20]\n" "75:" // Oddments: Load (4, 2): Bit 2: End - "ldr x19, 
[x8, #0xc0]\n" "usubl v25.8h, v25.8b, v12.8b\n" - "smlal v16.4s, v25.4h, v8.4h\n" - "smlal2 v22.4s, v25.8h, v8.8h\n" - "add x19, x19, x5\n" - "smlal v17.4s, v25.4h, v6.4h\n" + "ldr x19, [x12, #0xc0]\n" + "smlal v22.4s, v25.4h, v8.4h\n" + "smlal2 v21.4s, v25.8h, v8.8h\n" + "smlal v23.4s, v25.4h, v6.4h\n" "smlal2 v18.4s, v25.8h, v6.8h\n" - "tbz x4, #2, 77f\n" + "add x19, x19, x15\n" + "tbz x8, #2, 77f\n" "ld1 { v29.s }[0], [x19], #0x4\n" - "tbz x4, #1, 76f\n" + "tbz x8, #1, 76f\n" "ld1 { v29.h }[2], [x19], #0x2\n" - "tbz x4, #0, 79f\n" + "tbz x8, #0, 79f\n" "ld1 { v29.b }[6], [x19]\n" "b 79f\n" "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 79f\n" + "tbz x8, #0, 79f\n" "ld1 { v29.b }[4], [x19]\n" "b 79f\n" "77:" // Oddments: Load (4, 4): Bit 2: Unset - "tbz x4, #1, 78f\n" + "tbz x8, #1, 78f\n" "ld1 { v29.h }[0], [x19], #0x2\n" - "tbz x4, #0, 79f\n" + "tbz x8, #0, 79f\n" "ld1 { v29.b }[2], [x19]\n" "b 79f\n" "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 79f\n" + "tbz x8, #0, 79f\n" "ld1 { v29.b }[0], [x19]\n" "79:" // Oddments: Load (4, 4): Bit 2: End "usubl v29.8h, v29.8b, v12.8b\n" - "smlal v17.4s, v29.4h, v8.4h\n" + "smlal v23.4s, v29.4h, v8.4h\n" "smlal2 v18.4s, v29.8h, v8.8h\n" - "tbz x4, #2, 81f\n" - "ld1 { v21.4s }, [x17], #0x10\n" - "ld1 { v30.4s }, [x15], #0x10\n" - "tbz x4, #1, 80f\n" - "ld1 { v31.d }[0], [x17], #0x8\n" - "ld1 { v9.d }[0], [x15], #0x8\n" - "tbz x4, #0, 83f\n" - "ld1 { v31.s }[2], [x17]\n" - "ld1 { v9.s }[2], [x15]\n" + "tbz x8, #2, 81f\n" + "ld1 { v19.4s }, [x13], #0x10\n" + "ld1 { v0.4s }, [x11], #0x10\n" + "tbz x8, #1, 80f\n" + "ld1 { v4.d }[0], [x13], #0x8\n" + "ld1 { v31.d }[0], [x11], #0x8\n" + "tbz x8, #0, 83f\n" + "ld1 { v4.s }[2], [x13]\n" + "ld1 { v31.s }[2], [x11]\n" "b 83f\n" "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset - "tbz x4, #0, 83f\n" - "ld1 { v31.s }[0], [x17]\n" - "ld1 { v9.s }[0], [x15]\n" + "tbz x8, #0, 83f\n" + "ld1 { v4.s }[0], [x13]\n" + "ld1 { v31.s }[0], [x11]\n" "b 83f\n" "81:" // Oddments: Load requant params: Bit 2: Unset - "tbz x4, #1, 82f\n" - "ld1 { v21.d }[0], [x17], #0x8\n" - "ld1 { v30.d }[0], [x15], #0x8\n" - "tbz x4, #0, 83f\n" - "ld1 { v21.s }[2], [x17]\n" - "ld1 { v30.s }[2], [x15]\n" + "tbz x8, #1, 82f\n" + "ld1 { v19.d }[0], [x13], #0x8\n" + "ld1 { v0.d }[0], [x11], #0x8\n" + "tbz x8, #0, 83f\n" + "ld1 { v19.s }[2], [x13]\n" + "ld1 { v0.s }[2], [x11]\n" "b 83f\n" "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 83f\n" - "ld1 { v21.s }[0], [x17]\n" - "ld1 { v30.s }[0], [x15]\n" + "tbz x8, #0, 83f\n" + "ld1 { v19.s }[0], [x13]\n" + "ld1 { v0.s }[0], [x11]\n" "83:" // Oddments: Load requant params: Bit 2: End - "sqrdmulh v15.4s, v15.4s, v21.4s\n" - "add x14, x14, x7\n" - "sqrdmulh v10.4s, v10.4s, v31.4s\n" - "add x13, x13, x7\n" - "sqrdmulh v20.4s, v20.4s, v21.4s\n" - "add x12, x12, x7\n" - "sqrdmulh v23.4s, v23.4s, v31.4s\n" - "add x11, x11, x7\n" - "sqrdmulh v16.4s, v16.4s, v21.4s\n" - "and v26.16b, v15.16b, v30.16b\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "and v8.16b, v10.16b, v9.16b\n" - "and v4.16b, v20.16b, v30.16b\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "and v2.16b, v23.16b, v9.16b\n" - "and v1.16b, v16.16b, v30.16b\n" + "sqrdmulh v15.4s, v15.4s, v19.4s\n" + "sqrdmulh v9.4s, v9.4s, v19.4s\n" + "add x10, x10, x14\n" + "add x9, x9, x14\n" + "sqrdmulh v22.4s, v22.4s, v19.4s\n" + "sqrdmulh v23.4s, v23.4s, v19.4s\n" + "add x28, x28, x14\n" + "add x27, x27, x14\n" + "and v30.16b, v15.16b, v0.16b\n" + "sqrdmulh v10.4s, v10.4s, v4.4s\n" + "and
v28.16b, v9.16b, v0.16b\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "and v29.16b, v22.16b, v0.16b\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "and v20.16b, v23.16b, v0.16b\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "and v19.16b, v10.16b, v31.16b\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "and v4.16b, v16.16b, v31.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v5.16b, v21.16b, v31.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v26.16b, v18.16b, v31.16b\n" + "sqadd v15.4s, v15.4s, v30.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v28.4s\n" "sshr v4.4s, v4.4s, #0x1f\n" - "sqrdmulh v22.4s, v22.4s, v31.4s\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v26.4s\n" - "sqrdmulh v17.4s, v17.4s, v21.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqrdmulh v18.4s, v18.4s, v31.4s\n" - "sqadd v10.4s, v10.4s, v8.4s\n" - "sqadd v20.4s, v20.4s, v4.4s\n" - "srshl v15.4s, v15.4s, v30.4s\n" - "sqadd v23.4s, v23.4s, v2.4s\n" - "srshl v10.4s, v10.4s, v9.4s\n" - "srshl v20.4s, v20.4s, v30.4s\n" - "add v15.4s, v15.4s, v11.4s\n" - "srshl v23.4s, v23.4s, v9.4s\n" - "add v10.4s, v10.4s, v11.4s\n" - "smin v15.4s, v15.4s, v14.4s\n" - "add v20.4s, v20.4s, v11.4s\n" - "smin v10.4s, v10.4s, v14.4s\n" - "smax v15.4s, v15.4s, v19.4s\n" - "smin v20.4s, v20.4s, v14.4s\n" - "smax v10.4s, v10.4s, v19.4s\n" - "add v23.4s, v23.4s, v11.4s\n" - "smax v20.4s, v20.4s, v19.4s\n" - "uzp1 v15.16b, v15.16b, v10.16b\n" - "smin v23.4s, v23.4s, v14.4s\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v20.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v19.4s\n" + "srshl v9.4s, v9.4s, v0.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "sqadd v18.4s, v18.4s, v26.4s\n" + "srshl v10.4s, v10.4s, v31.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v16.4s, v16.4s, v31.4s\n" + "sqxtn v9.4h, v9.4s\n" + "srshl v21.4s, v21.4s, v31.4s\n" + "sqxtn v22.4h, v22.4s\n" + "srshl v18.4s, v18.4s, v31.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v15.8h, v10.4s\n" + "sqxtn2 v9.8h, v16.4s\n" + "sqxtn2 v22.8h, v21.4s\n" + "sqxtn2 v23.8h, v18.4s\n" + "sqadd v15.8h, v15.8h, v11.8h\n" + "sqadd v9.8h, v9.8h, v11.8h\n" + "sqadd v22.8h, v22.8h, v11.8h\n" + "sqadd v23.8h, v23.8h, v11.8h\n" + "smax v15.8h, v15.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smax v22.8h, v22.8h, v17.8h\n" + "smax v23.8h, v23.8h, v17.8h\n" + "smin v15.8h, v15.8h, v14.8h\n" + "smin v9.8h, v9.8h, v14.8h\n" + "smin v22.8h, v22.8h, v14.8h\n" + "smin v23.8h, v23.8h, v14.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" - "sqadd v16.4s, v16.4s, v1.4s\n" - "smax v23.4s, v23.4s, v19.4s\n" - "and v24.16b, v22.16b, v9.16b\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "uzp1 v20.16b, v20.16b, v23.16b\n" - "srshl v16.4s, v16.4s, v30.4s\n" - "and v2.16b, v17.16b, v30.16b\n" - "sshr v2.4s, v2.4s, #0x1f\n" - "uzp1 v20.16b, v20.16b, v20.16b\n" - "add v16.4s, v16.4s, v11.4s\n" - "sqadd v22.4s, v22.4s, v24.4s\n" - "and v31.16b, v18.16b, v9.16b\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "smin v16.4s, v16.4s, v14.4s\n" - "srshl v22.4s, v22.4s, v9.4s\n" - "sqadd v17.4s, v17.4s, v2.4s\n" - "smax v16.4s, v16.4s, v19.4s\n" - "add v22.4s, v22.4s, v11.4s\n" - "srshl v17.4s, v17.4s, v30.4s\n" - "sqadd v18.4s, v18.4s, v31.4s\n" - "smin v22.4s, v22.4s, v14.4s\n" - "add v17.4s, v17.4s, v11.4s\n" - "srshl v18.4s, v18.4s, v9.4s\n" - "smax v22.4s, v22.4s, v19.4s\n" - "smin v17.4s, v17.4s, v14.4s\n" - "uzp1 v16.16b, v16.16b,
v22.16b\n" - "add v18.4s, v18.4s, v11.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "smax v17.4s, v17.4s, v19.4s\n" - "smin v18.4s, v18.4s, v14.4s\n" - "smax v18.4s, v18.4s, v19.4s\n" - "uzp1 v17.16b, v17.16b, v18.16b\n" - "uzp1 v17.16b, v17.16b, v17.16b\n" - "tbz x4, #2, 85f\n" - "st1 { v15.s }[0], [x14], #0x4\n" - "st1 { v20.s }[0], [x13], #0x4\n" - "st1 { v16.s }[0], [x12], #0x4\n" - "st1 { v17.s }[0], [x11], #0x4\n" - "tbz x4, #1, 84f\n" - "st1 { v15.h }[2], [x14], #0x2\n" - "st1 { v20.h }[2], [x13], #0x2\n" - "st1 { v16.h }[2], [x12], #0x2\n" - "st1 { v17.h }[2], [x11], #0x2\n" - "tbz x4, #0, 87f\n" - "st1 { v15.b }[6], [x14], #0x1\n" - "st1 { v20.b }[6], [x13], #0x1\n" - "st1 { v16.b }[6], [x12], #0x1\n" - "st1 { v17.b }[6], [x11], #0x1\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "tbz x8, #2, 85f\n" + "st1 { v15.s }[0], [x10], #0x4\n" + "st1 { v9.s }[0], [x9], #0x4\n" + "st1 { v22.s }[0], [x28], #0x4\n" + "st1 { v23.s }[0], [x27], #0x4\n" + "tbz x8, #1, 84f\n" + "st1 { v15.h }[2], [x10], #0x2\n" + "st1 { v9.h }[2], [x9], #0x2\n" + "st1 { v22.h }[2], [x28], #0x2\n" + "st1 { v23.h }[2], [x27], #0x2\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[6], [x10], #0x1\n" + "st1 { v9.b }[6], [x9], #0x1\n" + "st1 { v22.b }[6], [x28], #0x1\n" + "st1 { v23.b }[6], [x27], #0x1\n" "b 87f\n" "84:" // Oddments: Bit 2: Bit 1: Unset - "tbz x4, #0, 87f\n" - "st1 { v15.b }[4], [x14], #0x1\n" - "st1 { v20.b }[4], [x13], #0x1\n" - "st1 { v16.b }[4], [x12], #0x1\n" - "st1 { v17.b }[4], [x11], #0x1\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[4], [x10], #0x1\n" + "st1 { v9.b }[4], [x9], #0x1\n" + "st1 { v22.b }[4], [x28], #0x1\n" + "st1 { v23.b }[4], [x27], #0x1\n" "b 87f\n" "85:" // Oddments: Bit 2: Unset - "tbz x4, #1, 86f\n" - "st1 { v15.h }[0], [x14], #0x2\n" - "st1 { v20.h }[0], [x13], #0x2\n" - "st1 { v16.h }[0], [x12], #0x2\n" - "st1 { v17.h }[0], [x11], #0x2\n" - "tbz x4, #0, 87f\n" - "st1 { v15.b }[2], [x14], #0x1\n" - "st1 { v20.b }[2], [x13], #0x1\n" - "st1 { v16.b }[2], [x12], #0x1\n" - "st1 { v17.b }[2], [x11], #0x1\n" + "tbz x8, #1, 86f\n" + "st1 { v15.h }[0], [x10], #0x2\n" + "st1 { v9.h }[0], [x9], #0x2\n" + "st1 { v22.h }[0], [x28], #0x2\n" + "st1 { v23.h }[0], [x27], #0x2\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[2], [x10], #0x1\n" + "st1 { v9.b }[2], [x9], #0x1\n" + "st1 { v22.b }[2], [x28], #0x1\n" + "st1 { v23.b }[2], [x27], #0x1\n" "b 87f\n" "86:" // Oddments: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 87f\n" - "st1 { v15.b }[0], [x14], #0x1\n" - "st1 { v20.b }[0], [x13], #0x1\n" - "st1 { v16.b }[0], [x12], #0x1\n" - "st1 { v17.b }[0], [x11], #0x1\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[0], [x10], #0x1\n" + "st1 { v9.b }[0], [x9], #0x1\n" + "st1 { v22.b }[0], [x28], #0x1\n" + "st1 { v23.b }[0], [x27], #0x1\n" "87:" // Oddments: Bit 2: End - "88:" // End - : : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), 
[offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp index 73de9650c3..11e993c51c 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,37 +36,24 @@ namespace depthwise { void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); -struct a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst +class a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t> { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>; + public: constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 6; - constexpr static unsigned int input_cols = 6; - - constexpr static parameter_packing_fn pack_parameters = interleave_a64_u8q_5x5_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_a64_u8q_5x5_mla::get_packed_size; + a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {} - kern_type kernel = a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } - a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel =
a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp index b42f29ab0d..6934dffc98 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -46,7 +46,7 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( struct Params { long unsigned int n_channels; - const uint8_t *weights; + const void *weights; const int32_t *bias; const arm_gemm::Requantize32 *requant; const int32_t *const requant_muls; @@ -57,7 +57,7 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( Params( long unsigned int n_channels, const uint8_t *const *inptrs_raw, - const uint8_t *const weights, + const void *const weights, const int32_t *const bias, const arm_gemm::Requantize32 &qp, const int32_t *const requant_muls, @@ -111,2096 +111,2070 @@ void a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( requant_muls, requant_shifts, outptrs); __asm__ __volatile__( - "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n" - "mov x10, #0x0\n" - "ldr x3, [%x[params], %[offsetof_Params_weights]]\n" - "mov x1, #0x0\n" - "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" - "add x25, %x[params], %[offsetof_Params_inptrs]\n" - "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n" - "lsr x19, x4, #0x3\n" - "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n" - "add x13, x22, %[offsetof_Requantize32_a_offset]\n" - "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" - "add x20, x22, %[offsetof_Requantize32_b_offset]\n" - "ld1r { v7.16b }, [x13]\n" - "add x8, x22, %[offsetof_Requantize32_c_offset]\n" - "ld1r { v13.16b }, [x20]\n" - "add x20, x22, %[offsetof_Requantize32_minval]\n" - "ld1r { v19.4s }, [x8]\n" - "add x8, x22, %[offsetof_Requantize32_maxval]\n" - "ld1r { v16.4s }, [x20]\n" - "ld1r { v12.4s }, [x8]\n" - "ldp x17, x16, [x21, #0x0]\n" - "ldp x6, x8, [x21, #0x10]\n" - "cbz x19, 3f\n" - "subs x19, x19, #0x1\n" - "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" - "ldr q15, [x12, #0x0]\n" - "mov v18.16b, v15.16b\n" - "ldr q20, [x12, #0x10]\n" - "add x12, x12, #0x20\n" - "mov v11.16b, v15.16b\n" - "str x12, [%x[params], %[offsetof_Params_bias]]\n" + "ldr x10, [%x[params], %[offsetof_Params_requant]]\n" + "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n" + "add x17, x10, %[offsetof_Requantize32_a_offset]\n" + "add x9, x10, %[offsetof_Requantize32_b_offset]\n" + "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x4, x10, %[offsetof_Requantize32_c_offset]\n" + "add x14, x10, %[offsetof_Requantize32_minval]\n" + "ldr x23, [%x[params], %[offsetof_Params_weights]]\n" + "add x5, x10, %[offsetof_Requantize32_maxval]\n" + "ld1r { v9.16b }, [x17]\n" + "ld1r { v14.16b }, [x9]\n" + "lsr x3, x0, #0x3\n" + "ld1r { v18.8h }, [x4]\n" + "ld1r { v11.8h }, [x14]\n" + "mov x24, #0x0\n" + "mov x22, #0x0\n" + "ld1r { v13.8h }, [x5]\n" + "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n" + "add x20, %x[params], 
%[offsetof_Params_inptrs]\n" + "ldr x1, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "ldp x16, x8, [x25, #0x0]\n" + "ldp x4, x7, [x25, #0x10]\n" + "cbz x3, 3f\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x19, #0x0]\n" + "subs x3, x3, #0x1\n" + "mov v17.16b, v15.16b\n" + "ldr q16, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr d0, [x23, #0x0]\n" + "ldr d1, [x23, #0x8]\n" + "ldr d2, [x23, #0x10]\n" + "mov v8.16b, v16.16b\n" "mov v10.16b, v15.16b\n" - "ldr d0, [x3, #0x0]\n" - "usubl v0.8h, v0.8b, v13.8b\n" - "mov v5.16b, v20.16b\n" - "ldr d1, [x3, #0x8]\n" - "mov v8.16b, v20.16b\n" - "ldr d2, [x3, #0x10]\n" - "usubl v1.8h, v1.8b, v13.8b\n" - "mov v9.16b, v20.16b\n" - "ldr d3, [x3, #0x18]\n" - "ldr d4, [x3, #0x20]\n" - "usubl v2.8h, v2.8b, v13.8b\n" - "ldp x28, x27, [x25, #0x0]\n" - "usubl v3.8h, v3.8b, v13.8b\n" - "ldp x26, x13, [x25, #0x10]\n" - "usubl v4.8h, v4.8b, v13.8b\n" - "ldp x24, x23, [x25, #0x20]\n" - "ldp x22, x21, [x25, #0x30]\n" - "ldp x20, x0, [x25, #0x40]\n" - "ldr d31, [x28, x10]\n" - "usubl v31.8h, v31.8b, v7.8b\n" - "ldr d30, [x27, x10]\n" - "ldr d29, [x26, x10]\n" - "usubl v30.8h, v30.8b, v7.8b\n" - "ldr d28, [x13, x10]\n" - "ldr d27, [x24, x10]\n" - "usubl v29.8h, v29.8b, v7.8b\n" - "ldr d23, [x23, x10]\n" - "usubl v28.8h, v28.8b, v7.8b\n" - "ldr d25, [x22, x10]\n" - "ldr d24, [x21, x10]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "ldr d26, [x20, x10]\n" - "usubl v23.8h, v23.8b, v7.8b\n" - "ldr d22, [x0, x10]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "usubl v26.8h, v26.8b, v7.8b\n" - "usubl v22.8h, v22.8b, v7.8b\n" + "ldr d3, [x23, #0x18]\n" + "ldr d4, [x23, #0x20]\n" + "mov v7.16b, v16.16b\n" + "mov v6.16b, v15.16b\n" + "ldp x28, x6, [x20, #0x0]\n" + "ldp x26, x25, [x20, #0x10]\n" + "mov v5.16b, v16.16b\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "ldp x5, x2, [x20, #0x20]\n" + "ldp x27, x21, [x20, #0x30]\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "ldp x12, x19, [x20, #0x40]\n" + "ldr d31, [x28, x24]\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "ldr d30, [x6, x24]\n" + "ldr d29, [x26, x24]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "ldr d28, [x25, x24]\n" + "ldr d27, [x5, x24]\n" + "usubl v29.8h, v29.8b, v9.8b\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "ldr d23, [x2, x24]\n" + "ldr d25, [x27, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "ldr d24, [x21, x24]\n" + "ldr d26, [x12, x24]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "ldr d22, [x19, x24]\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "usubl v22.8h, v22.8b, v9.8b\n" "beq 2f\n" "1:" // Loop "smlal v15.4s, v31.4h, v0.4h\n" - "ldr x20, [x25, #0x50]\n" - "subs x19, x19, #0x1\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ldr x28, [x25, #0x58]\n" - "smlal v18.4s, v30.4h, v0.4h\n" - "ldr x0, [x25, #0x60]\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "ldr d31, [x20, x10]\n" - "usubl v31.8h, v31.8b, v7.8b\n" - "smlal v11.4s, v29.4h, v0.4h\n" - "ldr x7, [x25, #0x68]\n" - "smlal2 v8.4s, v29.8h, v0.8h\n" - "ldr x26, [x25, #0x70]\n" - "smlal v10.4s, v28.4h, v0.4h\n" - "ldr x23, [x25, #0x78]\n" - "smlal2 v9.4s, v28.8h, v0.8h\n" - "ldr d0, [x3, #0x28]\n" - "usubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "ldr x19, [x20, #0x50]\n" + "ldr d31, [x19, x24]\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal v10.4s, v29.4h, v0.4h\n" + "ldr x15, [x20, #0x58]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + 
"smlal v6.4s, v28.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "ldr x19, [x20, #0x60]\n" + "ldr x27, [x20, #0x68]\n" + "smlal2 v7.4s, v29.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "ldr x20, [x25, #0x80]\n" - "smlal2 v20.4s, v30.8h, v1.8h\n" - "ldr d30, [x28, x10]\n" - "usubl v30.8h, v30.8b, v7.8b\n" - "smlal v18.4s, v27.4h, v1.4h\n" - "ldr x22, [x25, #0x88]\n" - "smlal2 v5.4s, v27.8h, v1.8h\n" - "ldr x13, [x25, #0x90]\n" - "smlal v11.4s, v28.4h, v1.4h\n" - "ldr x21, [x25, #0x98]\n" - "smlal2 v8.4s, v28.8h, v1.8h\n" - "ldr x14, [x25, #0xa0]\n" - "smlal v10.4s, v23.4h, v1.4h\n" - "ldr x11, [x25, #0xa8]\n" - "smlal2 v9.4s, v23.8h, v1.8h\n" - "ldr d1, [x3, #0x30]\n" - "usubl v1.8h, v1.8b, v13.8b\n" + "ldr x5, [x20, #0x70]\n" + "ldr x11, [x20, #0x78]\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "ldr d30, [x15, x24]\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v1.4h\n" + "smlal v10.4s, v28.4h, v1.4h\n" + "ldr d0, [x23, #0x28]\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v27.8h, v1.8h\n" + "ldr x12, [x20, #0x80]\n" + "ldr x26, [x20, #0x88]\n" + "smlal2 v7.4s, v28.8h, v1.8h\n" "smlal v15.4s, v27.4h, v2.4h\n" - "ldr x24, [x25, #0xb0]\n" - "smlal2 v20.4s, v27.8h, v2.8h\n" - "ldr d27, [x0, x10]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal v18.4s, v25.4h, v2.4h\n" - "ldr x0, [x25, #0xb8]\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "ldr x15, [x25, #0xc0]\n" - "smlal v11.4s, v23.4h, v2.4h\n" - "ldr x9, [x25, #0xc8]\n" - "smlal2 v8.4s, v23.8h, v2.8h\n" - "ldr x27, [x25, #0xd0]\n" - "smlal v10.4s, v31.4h, v2.4h\n" - "ldr x28, [x25, #0xd8]\n" - "smlal2 v9.4s, v31.8h, v2.8h\n" - "ldr d2, [x3, #0x38]\n" - "usubl v2.8h, v2.8b, v13.8b\n" + "ldr x14, [x20, #0x90]\n" + "ldr x15, [x20, #0x98]\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "ldr d27, [x19, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal v10.4s, v23.4h, v2.4h\n" + "ldr d1, [x23, #0x30]\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "ldr x21, [x20, #0xa0]\n" + "ldr x2, [x20, #0xa8]\n" + "smlal2 v7.4s, v23.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" - "ldr q6, [x2, #0x0]\n" - "smlal2 v20.4s, v25.8h, v3.8h\n" - "ldr d25, [x7, x10]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "ldr x12, [x25, #0xe0]\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "ldr q21, [x5, #0x0]\n" - "smlal v11.4s, v31.4h, v3.4h\n" - "ldr q17, [x2, #0x10]\n" - "add x2, x2, #0x20\n" - "smlal2 v8.4s, v31.8h, v3.8h\n" - "ldr q14, [x5, #0x10]\n" - "add x5, x5, #0x20\n" - "smlal v10.4s, v30.4h, v3.4h\n" - "smlal2 v9.4s, v30.8h, v3.8h\n" - "ldr d3, [x3, #0x40]\n" - "usubl v3.8h, v3.8b, v13.8b\n" + "ldr x13, [x20, #0xb0]\n" + "ldr x9, [x20, #0xb8]\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "ldr d25, [x27, x24]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v10.4s, v31.4h, v3.4h\n" + "ldr d2, [x23, #0x38]\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "ldr x19, [x20, #0xc0]\n" + "ldr x28, [x20, #0xc8]\n" + "smlal2 v7.4s, v31.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v20.4s, v24.8h, v4.8h\n" - "ldr d24, [x26, x10]\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v18.4s, v27.4h, v4.4h\n" - "ldr x7, [x25, #0xe8]\n" - "smlal2 v5.4s, v27.8h, v4.8h\n" - "ldr d27, [x23, x10]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal v11.4s, v30.4h, 
v4.4h\n" - "ldr x26, [x25, #0xf0]\n" - "smlal2 v8.4s, v30.8h, v4.8h\n" - "smlal v10.4s, v26.4h, v4.4h\n" - "smlal2 v9.4s, v26.8h, v4.8h\n" - "ldr d4, [x3, #0x48]\n" - "usubl v4.8h, v4.8b, v13.8b\n" + "ldr x6, [x20, #0xd0]\n" + "ldr x27, [x20, #0xd8]\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "ldr d24, [x5, x24]\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal v10.4s, v30.4h, v4.4h\n" + "ldr d3, [x23, #0x40]\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v27.8h, v4.8h\n" + "ldr d27, [x11, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v7.4s, v30.8h, v4.8h\n" "smlal v15.4s, v29.4h, v0.4h\n" - "smlal2 v20.4s, v29.8h, v0.8h\n" - "smlal v18.4s, v28.4h, v0.4h\n" - "smlal2 v5.4s, v28.8h, v0.8h\n" - "smlal v11.4s, v22.4h, v0.4h\n" - "smlal2 v8.4s, v22.8h, v0.8h\n" - "smlal v10.4s, v25.4h, v0.4h\n" - "smlal2 v9.4s, v25.8h, v0.8h\n" - "ldr d0, [x3, #0x50]\n" - "usubl v0.8h, v0.8b, v13.8b\n" + "ldr x11, [x20, #0xe0]\n" + "ldr x17, [x20, #0xe8]\n" + "smlal2 v16.4s, v29.8h, v0.8h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x23, #0x48]\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal v10.4s, v22.4h, v0.4h\n" + "ldr x5, [x20, #0xf0]\n" + "ldr q12, [x10, #0x0]\n" + "smlal v6.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v28.8h, v0.8h\n" + "ldr q19, [x1, #0x0]\n" + "ldr q20, [x10, #0x10]\n" + "smlal2 v7.4s, v22.8h, v0.8h\n" "smlal v15.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "ldr d28, [x22, x10]\n" - "usubl v28.8h, v28.8b, v7.8b\n" - "smlal v18.4s, v23.4h, v1.4h\n" - "ldr x23, [x25, #0xf8]\n" - "smlal2 v5.4s, v23.8h, v1.8h\n" - "smlal v11.4s, v25.4h, v1.4h\n" - "smlal2 v8.4s, v25.8h, v1.8h\n" - "smlal v10.4s, v24.4h, v1.4h\n" - "smlal2 v9.4s, v24.8h, v1.8h\n" - "ldr d1, [x3, #0x58]\n" - "usubl v1.8h, v1.8b, v13.8b\n" + "ldr q29, [x1, #0x10]\n" + "subs x3, x3, #0x1\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "ldr d28, [x26, x24]\n" + "ldr d0, [x23, #0x50]\n" + "smlal v17.4s, v23.4h, v1.4h\n" + "smlal v10.4s, v25.4h, v1.4h\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "ldr x25, [x20, #0xf8]\n" + "smlal v6.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "add x10, x10, #0x20\n" + "smlal2 v7.4s, v25.8h, v1.8h\n" "smlal v15.4s, v23.4h, v2.4h\n" - "smlal2 v20.4s, v23.8h, v2.8h\n" - "ldr d23, [x20, x10]\n" - "usubl v23.8h, v23.8b, v7.8b\n" - "smlal v18.4s, v31.4h, v2.4h\n" - "ldr x22, [x25, #0x100]\n" - "smlal2 v5.4s, v31.8h, v2.8h\n" - "smlal v11.4s, v24.4h, v2.4h\n" - "smlal2 v8.4s, v24.8h, v2.8h\n" - "smlal v10.4s, v27.4h, v2.4h\n" - "smlal2 v9.4s, v27.8h, v2.8h\n" - "ldr d2, [x3, #0x60]\n" - "usubl v2.8h, v2.8b, v13.8b\n" + "add x1, x1, #0x20\n" + "smlal2 v16.4s, v23.8h, v2.8h\n" + "ldr d23, [x12, x24]\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "smlal v17.4s, v31.4h, v2.4h\n" + "smlal v10.4s, v24.4h, v2.4h\n" + "ldr d1, [x23, #0x58]\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "ldr x26, [x20, #0x100]\n" + "smlal2 v7.4s, v24.8h, v2.8h\n" "smlal v15.4s, v31.4h, v3.4h\n" - "smlal2 v20.4s, v31.8h, v3.8h\n" - "ldr d31, [x13, x10]\n" - "usubl v31.8h, v31.8b, v7.8b\n" - "smlal v18.4s, v30.4h, v3.4h\n" - "ldr x20, [x25, #0x108]\n" - "smlal2 v5.4s, v30.8h, v3.8h\n" - "smlal v11.4s, v27.4h, v3.4h\n" - "smlal2 v8.4s, v27.8h, v3.8h\n" - "smlal v10.4s, v23.4h, v3.4h\n" - "smlal2 v9.4s, v23.8h, v3.8h\n" - "ldr d3, [x3, 
#0x68]\n" - "usubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v16.4s, v31.8h, v3.8h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "ldr d31, [x14, x24]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "smlal v17.4s, v30.4h, v3.4h\n" + "smlal v10.4s, v27.4h, v3.4h\n" + "ldr d2, [x23, #0x60]\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "ldr x12, [x20, #0x108]\n" + "smlal2 v7.4s, v27.8h, v3.8h\n" "smlal v15.4s, v30.4h, v4.4h\n" - "smlal2 v20.4s, v30.8h, v4.8h\n" - "ldr d30, [x21, x10]\n" - "usubl v30.8h, v30.8b, v7.8b\n" - "smlal v18.4s, v26.4h, v4.4h\n" - "ldr x13, [x25, #0x110]\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "ldr d26, [x14, x10]\n" - "usubl v26.8h, v26.8b, v7.8b\n" - "smlal v11.4s, v23.4h, v4.4h\n" - "ldr x21, [x25, #0x118]\n" - "smlal2 v8.4s, v23.8h, v4.8h\n" - "smlal v10.4s, v28.4h, v4.4h\n" - "smlal2 v9.4s, v28.8h, v4.8h\n" - "ldr d4, [x3, #0x70]\n" - "usubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v16.4s, v30.8h, v4.8h\n" + "ldr d30, [x15, x24]\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "smlal v10.4s, v23.4h, v4.4h\n" + "ldr d3, [x23, #0x68]\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "ldr d26, [x21, x24]\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "smlal2 v7.4s, v23.8h, v4.8h\n" "smlal v15.4s, v22.4h, v0.4h\n" - "smlal2 v20.4s, v22.8h, v0.8h\n" - "ldr d22, [x0, x10]\n" - "usubl v22.8h, v22.8b, v7.8b\n" - "smlal v18.4s, v25.4h, v0.4h\n" - "smlal2 v5.4s, v25.8h, v0.8h\n" - "smlal v11.4s, v31.4h, v0.4h\n" - "smlal2 v8.4s, v31.8h, v0.8h\n" - "smlal v10.4s, v30.4h, v0.4h\n" - "smlal2 v9.4s, v30.8h, v0.8h\n" - "ldr d0, [x3, #0x78]\n" - "usubl v0.8h, v0.8b, v13.8b\n" + "ldr x14, [x20, #0x110]\n" + "ldr x21, [x20, #0x118]\n" + "smlal2 v16.4s, v22.8h, v0.8h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ldr d4, [x23, #0x70]\n" + "ldr d22, [x9, x24]\n" + "smlal v17.4s, v25.4h, v0.4h\n" + "smlal v10.4s, v31.4h, v0.4h\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "usubl v22.8h, v22.8b, v9.8b\n" + "smlal2 v7.4s, v31.8h, v0.8h\n" "smlal v15.4s, v25.4h, v1.4h\n" - "smlal2 v20.4s, v25.8h, v1.8h\n" - "ldr d25, [x11, x10]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal v18.4s, v24.4h, v1.4h\n" - "smlal2 v5.4s, v24.8h, v1.8h\n" - "smlal v11.4s, v30.4h, v1.4h\n" - "smlal2 v8.4s, v30.8h, v1.8h\n" - "smlal v10.4s, v26.4h, v1.4h\n" - "smlal2 v9.4s, v26.8h, v1.8h\n" - "ldr d1, [x3, #0x80]\n" - "usubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v16.4s, v25.8h, v1.8h\n" + "ldr d25, [x2, x24]\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v1.4h\n" + "smlal v10.4s, v30.4h, v1.4h\n" + "ldr d0, [x23, #0x78]\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "smlal2 v7.4s, v30.8h, v1.8h\n" "smlal v15.4s, v24.4h, v2.4h\n" - "smlal2 v20.4s, v24.8h, v2.8h\n" - "ldr d24, [x24, x10]\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v18.4s, v27.4h, v2.4h\n" - "smlal2 v5.4s, v27.8h, v2.8h\n" - "smlal v11.4s, v26.4h, v2.4h\n" - "smlal2 v8.4s, v26.8h, v2.8h\n" - "smlal v10.4s, v25.4h, v2.4h\n" - "smlal2 v9.4s, v25.8h, v2.8h\n" - "ldr d2, [x3, #0x88]\n" - "usubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v16.4s, v24.8h, v2.8h\n" + "ldr d24, [x13, x24]\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v2.4h\n" + "smlal v10.4s, v26.4h, v2.4h\n" + "ldr d1, [x23, #0x80]\n" + "usubl v1.8h, v1.8b, v14.8b\n" 
+ "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "smlal2 v7.4s, v26.8h, v2.8h\n" "smlal v15.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" - "ldr d27, [x15, x10]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal v18.4s, v23.4h, v3.4h\n" - "smlal2 v5.4s, v23.8h, v3.8h\n" - "smlal v11.4s, v25.4h, v3.4h\n" - "smlal2 v8.4s, v25.8h, v3.8h\n" - "smlal v10.4s, v24.4h, v3.4h\n" - "smlal2 v9.4s, v24.8h, v3.8h\n" - "ldr d3, [x3, #0x90]\n" - "usubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ldr d27, [x19, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v17.4s, v23.4h, v3.4h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "ldr d2, [x23, #0x88]\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v8.4s, v23.8h, v3.8h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" "smlal v15.4s, v23.4h, v4.4h\n" - "smlal2 v20.4s, v23.8h, v4.8h\n" - "ldr d23, [x9, x10]\n" - "usubl v23.8h, v23.8b, v7.8b\n" - "smlal v18.4s, v28.4h, v4.4h\n" - "smlal2 v5.4s, v28.8h, v4.8h\n" - "ldr d28, [x12, x10]\n" - "usubl v28.8h, v28.8b, v7.8b\n" - "smlal v11.4s, v24.4h, v4.4h\n" - "smlal2 v8.4s, v24.8h, v4.8h\n" - "smlal v10.4s, v22.4h, v4.4h\n" - "smlal2 v9.4s, v22.8h, v4.8h\n" - "ldr d4, [x3, #0x98]\n" - "usubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v16.4s, v23.8h, v4.8h\n" + "ldr d23, [x28, x24]\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "ldr d3, [x23, #0x90]\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v22.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "ldr d28, [x11, x24]\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" "smlal v15.4s, v31.4h, v0.4h\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ldr d31, [x27, x10]\n" - "usubl v31.8h, v31.8b, v7.8b\n" - "smlal v18.4s, v30.4h, v0.4h\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "smlal v11.4s, v27.4h, v0.4h\n" - "smlal2 v8.4s, v27.8h, v0.8h\n" - "smlal v10.4s, v23.4h, v0.4h\n" - "smlal2 v9.4s, v23.8h, v0.8h\n" - "ldr d0, [x3, #0xa0]\n" - "usubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "ldr d31, [x6, x24]\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal v10.4s, v27.4h, v0.4h\n" + "ldr d4, [x23, #0x98]\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "smlal2 v7.4s, v27.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "smlal2 v20.4s, v30.8h, v1.8h\n" - "ldr d30, [x28, x10]\n" - "usubl v30.8h, v30.8b, v7.8b\n" - "smlal v18.4s, v26.4h, v1.4h\n" - "smlal2 v5.4s, v26.8h, v1.8h\n" - "smlal v11.4s, v23.4h, v1.4h\n" - "smlal2 v8.4s, v23.8h, v1.8h\n" - "smlal v10.4s, v31.4h, v1.4h\n" - "smlal2 v9.4s, v31.8h, v1.8h\n" - "ldr d1, [x3, #0xa8]\n" - "usubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "ldr d30, [x27, x24]\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v26.4h, v1.4h\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "ldr d0, [x23, #0xa0]\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v31.4h, v1.4h\n" + "smlal2 v8.4s, v26.8h, v1.8h\n" + "smlal2 v7.4s, v23.8h, v1.8h\n" "smlal v15.4s, v26.4h, v2.4h\n" - "smlal2 v20.4s, v26.8h, v2.8h\n" - "ldr d26, [x7, x10]\n" - "usubl v26.8h, v26.8b, v7.8b\n" - "smlal v18.4s, v25.4h, v2.4h\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "smlal v11.4s, v31.4h, v2.4h\n" - "smlal2 v8.4s, v31.8h, v2.8h\n" - "smlal v10.4s, v30.4h, v2.4h\n" - "smlal2 v9.4s, v30.8h, v2.8h\n" - "ldr d2, 
[x3, #0xb0]\n" - "usubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v16.4s, v26.8h, v2.8h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "ldr d26, [x17, x24]\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "ldr d1, [x23, #0xa8]\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "smlal2 v7.4s, v31.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" - "smlal2 v20.4s, v25.8h, v3.8h\n" - "ldr d25, [x26, x10]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "smlal v11.4s, v30.4h, v3.4h\n" - "smlal2 v8.4s, v30.8h, v3.8h\n" - "smlal v10.4s, v28.4h, v3.4h\n" - "smlal2 v9.4s, v28.8h, v3.8h\n" - "ldr d3, [x3, #0xb8]\n" - "usubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "ldr d25, [x5, x24]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "ldr d2, [x23, #0xb0]\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v28.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "smlal2 v7.4s, v30.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v20.4s, v24.8h, v4.8h\n" - "ldr d24, [x23, x10]\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v18.4s, v22.4h, v4.4h\n" - "smlal2 v5.4s, v22.8h, v4.8h\n" - "smlal v11.4s, v28.4h, v4.4h\n" - "smlal2 v8.4s, v28.8h, v4.8h\n" - "smlal v10.4s, v26.4h, v4.4h\n" - "smlal2 v9.4s, v26.8h, v4.8h\n" - "ldr d4, [x3, #0xc0]\n" - "add x3, x3, #0xc8\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "ldr d24, [x25, x24]\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v22.4h, v4.4h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "ldr d3, [x23, #0xb8]\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v7.4s, v28.8h, v4.8h\n" "smlal v15.4s, v27.4h, v0.4h\n" - "usubl v4.8h, v4.8b, v13.8b\n" - "smlal2 v20.4s, v27.8h, v0.8h\n" - "ldr d27, [x22, x10]\n" - "smlal v18.4s, v23.4h, v0.4h\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal2 v5.4s, v23.8h, v0.8h\n" - "smlal v11.4s, v25.4h, v0.4h\n" - "smlal2 v8.4s, v25.8h, v0.8h\n" - "ldr d25, [x20, x10]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal v10.4s, v24.4h, v0.4h\n" - "smlal2 v9.4s, v24.8h, v0.8h\n" + "smlal2 v16.4s, v27.8h, v0.8h\n" + "ldr d27, [x26, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v8.4s, v22.8h, v4.8h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x23, #0xc0]\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "smlal v17.4s, v23.4h, v0.4h\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "add x23, x23, #0xc8\n" + "smlal v6.4s, v24.4h, v0.4h\n" + "smlal2 v7.4s, v25.8h, v0.8h\n" + "ldr d25, [x12, x24]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal2 v8.4s, v23.8h, v0.8h\n" + "smlal2 v5.4s, v24.8h, v0.8h\n" "smlal v15.4s, v23.4h, v1.4h\n" - "smlal2 v20.4s, v23.8h, v1.8h\n" - "smlal v18.4s, v31.4h, v1.4h\n" - "smlal2 v5.4s, v31.8h, v1.8h\n" - "smlal v11.4s, v24.4h, v1.4h\n" - "smlal2 v8.4s, v24.8h, v1.8h\n" - "ldr d24, [x13, x10]\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v27.4h, v1.4h\n" - "smlal2 v9.4s, v27.8h, v1.8h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal v6.4s, v27.4h, v1.4h\n" + "smlal2 v7.4s, v24.8h, v1.8h\n" + "ldr d24, [x14, x24]\n" + "smlal2 v16.4s, v23.8h, v1.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal2 v8.4s, v31.8h, v1.8h\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" "smlal v15.4s, v31.4h, v2.4h\n" - "smlal2 v20.4s, v31.8h, v2.8h\n" - "smlal v18.4s, v30.4h, v2.4h\n" - "smlal2 v5.4s, v30.8h, 
v2.8h\n" - "smlal v11.4s, v27.4h, v2.4h\n" - "smlal2 v8.4s, v27.8h, v2.8h\n" - "ldr d27, [x21, x10]\n" - "add x10, x10, #0x8\n" - "smlal v10.4s, v25.4h, v2.4h\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal2 v9.4s, v25.8h, v2.8h\n" + "smlal v17.4s, v30.4h, v2.4h\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v7.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x24]\n" + "smlal2 v16.4s, v31.8h, v2.8h\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v8.4s, v30.8h, v2.8h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "add x24, x24, #0x8\n" "smlal v15.4s, v30.4h, v3.4h\n" - "smlal2 v20.4s, v30.8h, v3.8h\n" - "smlal v18.4s, v28.4h, v3.4h\n" - "smlal2 v5.4s, v28.8h, v3.8h\n" - "smlal v11.4s, v25.4h, v3.4h\n" - "smlal2 v8.4s, v25.8h, v3.8h\n" - "smlal v10.4s, v24.4h, v3.4h\n" - "smlal2 v9.4s, v24.8h, v3.8h\n" + "smlal v17.4s, v28.4h, v3.4h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v16.4s, v30.8h, v3.8h\n" + "smlal2 v8.4s, v28.8h, v3.8h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" "smlal v15.4s, v28.4h, v4.4h\n" - "smlal2 v20.4s, v28.8h, v4.8h\n" - "smlal v18.4s, v26.4h, v4.4h\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "smlal v11.4s, v24.4h, v4.4h\n" - "smlal2 v8.4s, v24.8h, v4.8h\n" - "smlal v10.4s, v27.4h, v4.4h\n" - "smlal2 v9.4s, v27.8h, v4.8h\n" - "sqrdmulh v15.4s, v15.4s, v6.4s\n" - "sqrdmulh v20.4s, v20.4s, v17.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v5.4s, v5.4s, v17.4s\n" - "and v1.16b, v15.16b, v21.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v29.16b, v20.16b, v14.16b\n" - "and v3.16b, v18.16b, v21.16b\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "and v2.16b, v5.16b, v14.16b\n" - "sqrdmulh v11.4s, v11.4s, v6.4s\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqrdmulh v8.4s, v8.4s, v17.4s\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "sqdmulh v15.4s, v15.4s, v12.4s\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "smlal v6.4s, v27.4h, v4.4h\n" + "sqdmulh v17.4s, v17.4s, v12.4s\n" + "smlal2 v16.4s, v28.8h, v4.8h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "sqdmulh v10.4s, v10.4s, v12.4s\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "sqdmulh v6.4s, v6.4s, v12.4s\n" + "and v23.16b, v15.16b, v19.16b\n" + "sqdmulh v16.4s, v16.4s, v20.4s\n" + "and v22.16b, v17.16b, v19.16b\n" + "sqdmulh v8.4s, v8.4s, v20.4s\n" + "and v21.16b, v10.16b, v19.16b\n" + "sqdmulh v7.4s, v7.4s, v20.4s\n" + "and v26.16b, v6.16b, v19.16b\n" + "sqdmulh v5.4s, v5.4s, v20.4s\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "and v4.16b, v16.16b, v29.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v2.16b, v8.16b, v29.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v3.16b, v7.16b, v29.16b\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "and v25.16b, v5.16b, v29.16b\n" + "sqadd v15.4s, v15.4s, v23.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v22.4s\n" "sshr v2.4s, v2.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v1.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "and v0.16b, v11.16b, v21.16b\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "srshl v15.4s, v15.4s, v21.4s\n" - "sqadd v20.4s, v20.4s, v29.4s\n" - "sqadd v18.4s, v18.4s, v3.4s\n" - "sqadd v5.4s, v5.4s, v2.4s\n" - "and v27.16b, v8.16b, v14.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "add v15.4s, v15.4s, v19.4s\n" - "srshl v20.4s, v20.4s, v14.4s\n" - "srshl v18.4s, v18.4s, v21.4s\n" - "srshl v5.4s, v5.4s, v14.4s\n" - "smin v15.4s, v15.4s, v12.4s\n" - "add v20.4s, v20.4s, v19.4s\n" - "add v18.4s, v18.4s, v19.4s\n" - "smax v15.4s, v15.4s, v16.4s\n" - "smin v20.4s, v20.4s, v12.4s\n" - "smin v18.4s, v18.4s, v12.4s\n" - "add v5.4s, 
v5.4s, v19.4s\n" - "smax v20.4s, v20.4s, v16.4s\n" - "smax v18.4s, v18.4s, v16.4s\n" - "smin v5.4s, v5.4s, v12.4s\n" - "uzp1 v15.16b, v15.16b, v20.16b\n" - "sqadd v11.4s, v11.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v21.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqadd v6.4s, v6.4s, v26.4s\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v19.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v17.4s, v17.4s, v19.4s\n" + "sqadd v8.4s, v8.4s, v2.4s\n" + "srshl v10.4s, v10.4s, v19.4s\n" + "sqadd v7.4s, v7.4s, v3.4s\n" + "srshl v6.4s, v6.4s, v19.4s\n" + "sqadd v5.4s, v5.4s, v25.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v8.4s, v8.4s, v29.4s\n" + "sqxtn v17.4h, v17.4s\n" + "srshl v7.4s, v7.4s, v29.4s\n" + "sqxtn v10.4h, v10.4s\n" + "srshl v5.4s, v5.4s, v29.4s\n" + "sqxtn v6.4h, v6.4s\n" + "sqxtn2 v15.8h, v16.4s\n" + "sqxtn2 v17.8h, v8.4s\n" + "sqxtn2 v10.8h, v7.4s\n" + "sqxtn2 v6.8h, v5.4s\n" + "sqadd v15.8h, v15.8h, v18.8h\n" + "sqadd v17.8h, v17.8h, v18.8h\n" + "sqadd v10.8h, v10.8h, v18.8h\n" + "sqadd v6.8h, v6.8h, v18.8h\n" + "smax v15.8h, v15.8h, v11.8h\n" + "smax v17.8h, v17.8h, v11.8h\n" + "smax v10.8h, v10.8h, v11.8h\n" + "smax v6.8h, v6.8h, v11.8h\n" + "smin v15.8h, v15.8h, v13.8h\n" + "smin v17.8h, v17.8h, v13.8h\n" + "smin v10.8h, v10.8h, v13.8h\n" + "smin v6.8h, v6.8h, v13.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" - "str d15, [x17, x1]\n" - "smax v5.4s, v5.4s, v16.4s\n" - "sqadd v8.4s, v8.4s, v27.4s\n" - "srshl v11.4s, v11.4s, v21.4s\n" - "and v30.16b, v10.16b, v21.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "uzp1 v18.16b, v18.16b, v5.16b\n" - "add v11.4s, v11.4s, v19.4s\n" - "srshl v8.4s, v8.4s, v14.4s\n" - "uzp1 v18.16b, v18.16b, v18.16b\n" - "str d18, [x16, x1]\n" - "smin v11.4s, v11.4s, v12.4s\n" - "sqrdmulh v9.4s, v9.4s, v17.4s\n" - "add v8.4s, v8.4s, v19.4s\n" - "sqadd v10.4s, v10.4s, v30.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" - "smin v8.4s, v8.4s, v12.4s\n" - "and v6.16b, v9.16b, v14.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "smax v8.4s, v8.4s, v16.4s\n" - "srshl v10.4s, v10.4s, v21.4s\n" - "uzp1 v11.16b, v11.16b, v8.16b\n" - "add v10.4s, v10.4s, v19.4s\n" - "uzp1 v11.16b, v11.16b, v11.16b\n" - "str d11, [x6, x1]\n" - "smin v10.4s, v10.4s, v12.4s\n" - "sqadd v9.4s, v9.4s, v6.4s\n" - "smax v10.4s, v10.4s, v16.4s\n" - "srshl v9.4s, v9.4s, v14.4s\n" - "add v9.4s, v9.4s, v19.4s\n" - "smin v9.4s, v9.4s, v12.4s\n" - "smax v9.4s, v9.4s, v16.4s\n" - "uzp1 v10.16b, v10.16b, v9.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d15, [x16, x22]\n" "uzp1 v10.16b, v10.16b, v10.16b\n" - "str d10, [x8, x1]\n" - "add x1, x1, #0x8\n" - "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" - "ldr q15, [x12, #0x0]\n" - "mov v18.16b, v15.16b\n" - "ldr q20, [x12, #0x10]\n" - "add x12, x12, #0x20\n" - "mov v11.16b, v15.16b\n" - "str x12, [%x[params], %[offsetof_Params_bias]]\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str d17, [x8, x22]\n" + "str d10, [x4, x22]\n" + "str d6, [x7, x22]\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x19, #0x0]\n" + "add x22, x22, #0x8\n" + "ldr q16, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr d0, [x23, #0x0]\n" + "ldr d1, [x23, #0x8]\n" + "ldr d2, [x23, #0x10]\n" + "mov v17.16b, v15.16b\n" + "mov v8.16b, v16.16b\n" + "ldr d3, [x23, #0x18]\n" + "ldr d4, [x23, #0x20]\n" "mov v10.16b, v15.16b\n" - "ldr d0, [x3, #0x0]\n" - "usubl v0.8h, v0.8b, v13.8b\n" - "mov v5.16b, v20.16b\n" - "ldr d1, [x3, #0x8]\n" - "mov v8.16b, v20.16b\n" - "ldr d2, [x3, #0x10]\n" - "usubl 
v1.8h, v1.8b, v13.8b\n" - "mov v9.16b, v20.16b\n" - "ldr d3, [x3, #0x18]\n" - "ldr d4, [x3, #0x20]\n" - "usubl v2.8h, v2.8b, v13.8b\n" - "ldp x28, x27, [x25, #0x0]\n" - "usubl v3.8h, v3.8b, v13.8b\n" - "ldp x26, x13, [x25, #0x10]\n" - "usubl v4.8h, v4.8b, v13.8b\n" - "ldp x24, x23, [x25, #0x20]\n" - "ldp x22, x21, [x25, #0x30]\n" - "ldp x20, x0, [x25, #0x40]\n" - "ldr d31, [x28, x10]\n" - "usubl v31.8h, v31.8b, v7.8b\n" - "ldr d30, [x27, x10]\n" - "ldr d29, [x26, x10]\n" - "usubl v30.8h, v30.8b, v7.8b\n" - "ldr d28, [x13, x10]\n" - "ldr d27, [x24, x10]\n" - "usubl v29.8h, v29.8b, v7.8b\n" - "ldr d23, [x23, x10]\n" - "usubl v28.8h, v28.8b, v7.8b\n" - "ldr d25, [x22, x10]\n" - "ldr d24, [x21, x10]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "ldr d26, [x20, x10]\n" - "usubl v23.8h, v23.8b, v7.8b\n" - "ldr d22, [x0, x10]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "usubl v26.8h, v26.8b, v7.8b\n" - "usubl v22.8h, v22.8b, v7.8b\n" + "mov v7.16b, v16.16b\n" + "ldp x28, x6, [x20, #0x0]\n" + "ldp x26, x25, [x20, #0x10]\n" + "mov v6.16b, v15.16b\n" + "mov v5.16b, v16.16b\n" + "ldp x5, x2, [x20, #0x20]\n" + "ldp x27, x21, [x20, #0x30]\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "ldp x12, x19, [x20, #0x40]\n" + "ldr d31, [x28, x24]\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "ldr d30, [x6, x24]\n" + "ldr d29, [x26, x24]\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "ldr d28, [x25, x24]\n" + "ldr d27, [x5, x24]\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "usubl v29.8h, v29.8b, v9.8b\n" + "ldr d23, [x2, x24]\n" + "ldr d25, [x27, x24]\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "ldr d24, [x21, x24]\n" + "ldr d26, [x12, x24]\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "ldr d22, [x19, x24]\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "usubl v22.8h, v22.8b, v9.8b\n" "bgt 1b\n" "2:" // Tail "smlal v15.4s, v31.4h, v0.4h\n" - "ldr x20, [x25, #0x50]\n" - "tst x4, #0x7\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ldr x28, [x25, #0x58]\n" - "smlal v18.4s, v30.4h, v0.4h\n" - "ldr x0, [x25, #0x60]\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "ldr d31, [x20, x10]\n" - "usubl v31.8h, v31.8b, v7.8b\n" - "smlal v11.4s, v29.4h, v0.4h\n" - "ldr x7, [x25, #0x68]\n" - "smlal2 v8.4s, v29.8h, v0.8h\n" - "ldr x26, [x25, #0x70]\n" - "smlal v10.4s, v28.4h, v0.4h\n" - "ldr x23, [x25, #0x78]\n" - "smlal2 v9.4s, v28.8h, v0.8h\n" - "ldr d0, [x3, #0x28]\n" - "usubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "ldr x19, [x20, #0x50]\n" + "ldr d31, [x19, x24]\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal v10.4s, v29.4h, v0.4h\n" + "ldr x15, [x20, #0x58]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "smlal v6.4s, v28.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "ldr x19, [x20, #0x60]\n" + "ldr x27, [x20, #0x68]\n" + "smlal2 v7.4s, v29.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "ldr x20, [x25, #0x80]\n" - "smlal2 v20.4s, v30.8h, v1.8h\n" - "ldr d30, [x28, x10]\n" - "usubl v30.8h, v30.8b, v7.8b\n" - "smlal v18.4s, v27.4h, v1.4h\n" - "ldr x22, [x25, #0x88]\n" - "smlal2 v5.4s, v27.8h, v1.8h\n" - "ldr x13, [x25, #0x90]\n" - "smlal v11.4s, v28.4h, v1.4h\n" - "ldr x21, [x25, #0x98]\n" - "smlal2 v8.4s, v28.8h, v1.8h\n" - "ldr x14, [x25, #0xa0]\n" - "smlal v10.4s, v23.4h, v1.4h\n" - "ldr x11, [x25, #0xa8]\n" - "smlal2 v9.4s, v23.8h, v1.8h\n" - "ldr d1, [x3, #0x30]\n" - "usubl v1.8h, v1.8b, v13.8b\n" + "ldr x5, [x20, #0x70]\n" + "ldr x11, [x20, #0x78]\n" + 
"smlal2 v16.4s, v30.8h, v1.8h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "ldr d30, [x15, x24]\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v1.4h\n" + "smlal v10.4s, v28.4h, v1.4h\n" + "ldr d0, [x23, #0x28]\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v27.8h, v1.8h\n" + "ldr x12, [x20, #0x80]\n" + "ldr x26, [x20, #0x88]\n" + "smlal2 v7.4s, v28.8h, v1.8h\n" "smlal v15.4s, v27.4h, v2.4h\n" - "ldr x24, [x25, #0xb0]\n" - "smlal2 v20.4s, v27.8h, v2.8h\n" - "ldr d27, [x0, x10]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal v18.4s, v25.4h, v2.4h\n" - "ldr x0, [x25, #0xb8]\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "ldr x15, [x25, #0xc0]\n" - "smlal v11.4s, v23.4h, v2.4h\n" - "ldr x9, [x25, #0xc8]\n" - "smlal2 v8.4s, v23.8h, v2.8h\n" - "ldr x27, [x25, #0xd0]\n" - "smlal v10.4s, v31.4h, v2.4h\n" - "ldr x28, [x25, #0xd8]\n" - "smlal2 v9.4s, v31.8h, v2.8h\n" - "ldr d2, [x3, #0x38]\n" - "usubl v2.8h, v2.8b, v13.8b\n" + "ldr x14, [x20, #0x90]\n" + "ldr x15, [x20, #0x98]\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "ldr d27, [x19, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal v10.4s, v23.4h, v2.4h\n" + "ldr d1, [x23, #0x30]\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "ldr x21, [x20, #0xa0]\n" + "ldr x2, [x20, #0xa8]\n" + "smlal2 v7.4s, v23.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" - "ldr x12, [x25, #0xe0]\n" - "smlal2 v20.4s, v25.8h, v3.8h\n" - "ldr d25, [x7, x10]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "ldr x7, [x25, #0xe8]\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "ldr q6, [x2, #0x0]\n" - "smlal v11.4s, v31.4h, v3.4h\n" - "ldr q21, [x5, #0x0]\n" - "smlal2 v8.4s, v31.8h, v3.8h\n" - "ldr q17, [x2, #0x10]\n" - "add x2, x2, #0x20\n" - "smlal v10.4s, v30.4h, v3.4h\n" - "ldr q14, [x5, #0x10]\n" - "add x5, x5, #0x20\n" - "smlal2 v9.4s, v30.8h, v3.8h\n" - "ldr d3, [x3, #0x40]\n" - "usubl v3.8h, v3.8b, v13.8b\n" + "ldr x13, [x20, #0xb0]\n" + "ldr x9, [x20, #0xb8]\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" + "ldr d25, [x27, x24]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v10.4s, v31.4h, v3.4h\n" + "ldr d2, [x23, #0x38]\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "ldr x19, [x20, #0xc0]\n" + "ldr x28, [x20, #0xc8]\n" + "smlal2 v7.4s, v31.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v20.4s, v24.8h, v4.8h\n" - "ldr d24, [x26, x10]\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v18.4s, v27.4h, v4.4h\n" - "ldr x26, [x25, #0xf0]\n" - "smlal2 v5.4s, v27.8h, v4.8h\n" - "ldr d27, [x23, x10]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal v11.4s, v30.4h, v4.4h\n" - "ldr x23, [x25, #0xf8]\n" - "smlal2 v8.4s, v30.8h, v4.8h\n" - "smlal v10.4s, v26.4h, v4.4h\n" - "smlal2 v9.4s, v26.8h, v4.8h\n" - "ldr d4, [x3, #0x48]\n" - "usubl v4.8h, v4.8b, v13.8b\n" + "ldr x6, [x20, #0xd0]\n" + "ldr x27, [x20, #0xd8]\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" + "ldr d24, [x5, x24]\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal v10.4s, v30.4h, v4.4h\n" + "ldr d3, [x23, #0x40]\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v27.8h, v4.8h\n" + "ldr d27, [x11, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v7.4s, v30.8h, v4.8h\n" "smlal v15.4s, v29.4h, v0.4h\n" - "smlal2 v20.4s, v29.8h, v0.8h\n" - 
"smlal v18.4s, v28.4h, v0.4h\n" - "smlal2 v5.4s, v28.8h, v0.8h\n" - "smlal v11.4s, v22.4h, v0.4h\n" - "smlal2 v8.4s, v22.8h, v0.8h\n" - "smlal v10.4s, v25.4h, v0.4h\n" - "smlal2 v9.4s, v25.8h, v0.8h\n" - "ldr d0, [x3, #0x50]\n" - "usubl v0.8h, v0.8b, v13.8b\n" + "ldr x11, [x20, #0xe0]\n" + "ldr x17, [x20, #0xe8]\n" + "smlal2 v16.4s, v29.8h, v0.8h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x23, #0x48]\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal v10.4s, v22.4h, v0.4h\n" + "ldr x5, [x20, #0xf0]\n" + "ldr x25, [x20, #0xf8]\n" + "smlal v6.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v28.8h, v0.8h\n" + "ldr q12, [x10, #0x0]\n" + "ldr q19, [x1, #0x0]\n" + "smlal2 v7.4s, v22.8h, v0.8h\n" "smlal v15.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "ldr d28, [x22, x10]\n" - "usubl v28.8h, v28.8b, v7.8b\n" - "smlal v18.4s, v23.4h, v1.4h\n" - "ldr x22, [x25, #0x100]\n" - "smlal2 v5.4s, v23.8h, v1.8h\n" - "smlal v11.4s, v25.4h, v1.4h\n" - "smlal2 v8.4s, v25.8h, v1.8h\n" - "smlal v10.4s, v24.4h, v1.4h\n" - "smlal2 v9.4s, v24.8h, v1.8h\n" - "ldr d1, [x3, #0x58]\n" - "usubl v1.8h, v1.8b, v13.8b\n" + "ldr q20, [x10, #0x10]\n" + "ldr q29, [x1, #0x10]\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "ldr d28, [x26, x24]\n" + "ldr d0, [x23, #0x50]\n" + "smlal v17.4s, v23.4h, v1.4h\n" + "smlal v10.4s, v25.4h, v1.4h\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "ldr x26, [x20, #0x100]\n" + "smlal v6.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "tst x0, #0x7\n" + "smlal2 v7.4s, v25.8h, v1.8h\n" "smlal v15.4s, v23.4h, v2.4h\n" - "smlal2 v20.4s, v23.8h, v2.8h\n" - "ldr d23, [x20, x10]\n" - "usubl v23.8h, v23.8b, v7.8b\n" - "smlal v18.4s, v31.4h, v2.4h\n" - "ldr x20, [x25, #0x108]\n" - "smlal2 v5.4s, v31.8h, v2.8h\n" - "smlal v11.4s, v24.4h, v2.4h\n" - "smlal2 v8.4s, v24.8h, v2.8h\n" - "smlal v10.4s, v27.4h, v2.4h\n" - "smlal2 v9.4s, v27.8h, v2.8h\n" - "ldr d2, [x3, #0x60]\n" - "usubl v2.8h, v2.8b, v13.8b\n" + "add x10, x10, #0x20\n" + "add x1, x1, #0x20\n" + "smlal2 v16.4s, v23.8h, v2.8h\n" + "ldr d23, [x12, x24]\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "smlal v17.4s, v31.4h, v2.4h\n" + "smlal v10.4s, v24.4h, v2.4h\n" + "ldr d1, [x23, #0x58]\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "ldr x12, [x20, #0x108]\n" + "smlal2 v7.4s, v24.8h, v2.8h\n" "smlal v15.4s, v31.4h, v3.4h\n" - "smlal2 v20.4s, v31.8h, v3.8h\n" - "ldr d31, [x13, x10]\n" - "usubl v31.8h, v31.8b, v7.8b\n" - "smlal v18.4s, v30.4h, v3.4h\n" - "ldr x13, [x25, #0x110]\n" - "smlal2 v5.4s, v30.8h, v3.8h\n" - "smlal v11.4s, v27.4h, v3.4h\n" - "smlal2 v8.4s, v27.8h, v3.8h\n" - "smlal v10.4s, v23.4h, v3.4h\n" - "smlal2 v9.4s, v23.8h, v3.8h\n" - "ldr d3, [x3, #0x68]\n" - "usubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v16.4s, v31.8h, v3.8h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "ldr d31, [x14, x24]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "smlal v17.4s, v30.4h, v3.4h\n" + "smlal v10.4s, v27.4h, v3.4h\n" + "ldr d2, [x23, #0x60]\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "ldr x14, [x20, #0x110]\n" + "smlal2 v7.4s, v27.8h, v3.8h\n" "smlal v15.4s, v30.4h, v4.4h\n" - "smlal2 v20.4s, v30.8h, v4.8h\n" - "ldr d30, [x21, x10]\n" - "usubl v30.8h, v30.8b, v7.8b\n" - "smlal v18.4s, v26.4h, v4.4h\n" - "ldr x21, [x25, #0x118]\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "ldr d26, [x14, x10]\n" - "usubl v26.8h, v26.8b, v7.8b\n" - 
"smlal v11.4s, v23.4h, v4.4h\n" - "smlal2 v8.4s, v23.8h, v4.8h\n" - "smlal v10.4s, v28.4h, v4.4h\n" - "smlal2 v9.4s, v28.8h, v4.8h\n" - "ldr d4, [x3, #0x70]\n" - "usubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v16.4s, v30.8h, v4.8h\n" + "ldr d30, [x15, x24]\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "smlal v10.4s, v23.4h, v4.4h\n" + "ldr d3, [x23, #0x68]\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "ldr d26, [x21, x24]\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "smlal2 v7.4s, v23.8h, v4.8h\n" "smlal v15.4s, v22.4h, v0.4h\n" - "smlal2 v20.4s, v22.8h, v0.8h\n" - "ldr d22, [x0, x10]\n" - "usubl v22.8h, v22.8b, v7.8b\n" - "smlal v18.4s, v25.4h, v0.4h\n" - "smlal2 v5.4s, v25.8h, v0.8h\n" - "smlal v11.4s, v31.4h, v0.4h\n" - "smlal2 v8.4s, v31.8h, v0.8h\n" - "smlal v10.4s, v30.4h, v0.4h\n" - "smlal2 v9.4s, v30.8h, v0.8h\n" - "ldr d0, [x3, #0x78]\n" - "usubl v0.8h, v0.8b, v13.8b\n" + "ldr x21, [x20, #0x118]\n" + "smlal2 v16.4s, v22.8h, v0.8h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ldr d4, [x23, #0x70]\n" + "ldr d22, [x9, x24]\n" + "smlal v17.4s, v25.4h, v0.4h\n" + "smlal v10.4s, v31.4h, v0.4h\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "usubl v22.8h, v22.8b, v9.8b\n" + "smlal2 v7.4s, v31.8h, v0.8h\n" "smlal v15.4s, v25.4h, v1.4h\n" - "smlal2 v20.4s, v25.8h, v1.8h\n" - "ldr d25, [x11, x10]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal v18.4s, v24.4h, v1.4h\n" - "smlal2 v5.4s, v24.8h, v1.8h\n" - "smlal v11.4s, v30.4h, v1.4h\n" - "smlal2 v8.4s, v30.8h, v1.8h\n" - "smlal v10.4s, v26.4h, v1.4h\n" - "smlal2 v9.4s, v26.8h, v1.8h\n" - "ldr d1, [x3, #0x80]\n" - "usubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v16.4s, v25.8h, v1.8h\n" + "ldr d25, [x2, x24]\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v1.4h\n" + "smlal v10.4s, v30.4h, v1.4h\n" + "ldr d0, [x23, #0x78]\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "smlal2 v7.4s, v30.8h, v1.8h\n" "smlal v15.4s, v24.4h, v2.4h\n" - "smlal2 v20.4s, v24.8h, v2.8h\n" - "ldr d24, [x24, x10]\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v18.4s, v27.4h, v2.4h\n" - "smlal2 v5.4s, v27.8h, v2.8h\n" - "smlal v11.4s, v26.4h, v2.4h\n" - "smlal2 v8.4s, v26.8h, v2.8h\n" - "smlal v10.4s, v25.4h, v2.4h\n" - "smlal2 v9.4s, v25.8h, v2.8h\n" - "ldr d2, [x3, #0x88]\n" - "usubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v16.4s, v24.8h, v2.8h\n" + "ldr d24, [x13, x24]\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v2.4h\n" + "smlal v10.4s, v26.4h, v2.4h\n" + "ldr d1, [x23, #0x80]\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "smlal2 v7.4s, v26.8h, v2.8h\n" "smlal v15.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" - "ldr d27, [x15, x10]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal v18.4s, v23.4h, v3.4h\n" - "smlal2 v5.4s, v23.8h, v3.8h\n" - "smlal v11.4s, v25.4h, v3.4h\n" - "smlal2 v8.4s, v25.8h, v3.8h\n" - "smlal v10.4s, v24.4h, v3.4h\n" - "smlal2 v9.4s, v24.8h, v3.8h\n" - "ldr d3, [x3, #0x90]\n" - "usubl v3.8h, v3.8b, v13.8b\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ldr d27, [x19, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v17.4s, v23.4h, v3.4h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "ldr d2, [x23, #0x88]\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v24.4h, 
v3.4h\n" + "smlal2 v8.4s, v23.8h, v3.8h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" "smlal v15.4s, v23.4h, v4.4h\n" - "smlal2 v20.4s, v23.8h, v4.8h\n" - "ldr d23, [x9, x10]\n" - "usubl v23.8h, v23.8b, v7.8b\n" - "smlal v18.4s, v28.4h, v4.4h\n" - "smlal2 v5.4s, v28.8h, v4.8h\n" - "ldr d28, [x12, x10]\n" - "usubl v28.8h, v28.8b, v7.8b\n" - "smlal v11.4s, v24.4h, v4.4h\n" - "smlal2 v8.4s, v24.8h, v4.8h\n" - "smlal v10.4s, v22.4h, v4.4h\n" - "smlal2 v9.4s, v22.8h, v4.8h\n" - "ldr d4, [x3, #0x98]\n" - "usubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v16.4s, v23.8h, v4.8h\n" + "ldr d23, [x28, x24]\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "ldr d3, [x23, #0x90]\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v22.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "ldr d28, [x11, x24]\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" "smlal v15.4s, v31.4h, v0.4h\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "ldr d31, [x27, x10]\n" - "usubl v31.8h, v31.8b, v7.8b\n" - "smlal v18.4s, v30.4h, v0.4h\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "smlal v11.4s, v27.4h, v0.4h\n" - "smlal2 v8.4s, v27.8h, v0.8h\n" - "smlal v10.4s, v23.4h, v0.4h\n" - "smlal2 v9.4s, v23.8h, v0.8h\n" - "ldr d0, [x3, #0xa0]\n" - "usubl v0.8h, v0.8b, v13.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "ldr d31, [x6, x24]\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal v10.4s, v27.4h, v0.4h\n" + "ldr d4, [x23, #0x98]\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "smlal2 v7.4s, v27.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "smlal2 v20.4s, v30.8h, v1.8h\n" - "ldr d30, [x28, x10]\n" - "usubl v30.8h, v30.8b, v7.8b\n" - "smlal v18.4s, v26.4h, v1.4h\n" - "smlal2 v5.4s, v26.8h, v1.8h\n" - "smlal v11.4s, v23.4h, v1.4h\n" - "smlal2 v8.4s, v23.8h, v1.8h\n" - "smlal v10.4s, v31.4h, v1.4h\n" - "smlal2 v9.4s, v31.8h, v1.8h\n" - "ldr d1, [x3, #0xa8]\n" - "usubl v1.8h, v1.8b, v13.8b\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "ldr d30, [x27, x24]\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v26.4h, v1.4h\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "ldr d0, [x23, #0xa0]\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v31.4h, v1.4h\n" + "smlal2 v8.4s, v26.8h, v1.8h\n" + "smlal2 v7.4s, v23.8h, v1.8h\n" "smlal v15.4s, v26.4h, v2.4h\n" - "smlal2 v20.4s, v26.8h, v2.8h\n" - "ldr d26, [x7, x10]\n" - "usubl v26.8h, v26.8b, v7.8b\n" - "smlal v18.4s, v25.4h, v2.4h\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "smlal v11.4s, v31.4h, v2.4h\n" - "smlal2 v8.4s, v31.8h, v2.8h\n" - "smlal v10.4s, v30.4h, v2.4h\n" - "smlal2 v9.4s, v30.8h, v2.8h\n" - "ldr d2, [x3, #0xb0]\n" - "usubl v2.8h, v2.8b, v13.8b\n" + "smlal2 v16.4s, v26.8h, v2.8h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "ldr d26, [x17, x24]\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "ldr d1, [x23, #0xa8]\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "smlal2 v7.4s, v31.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" - "smlal2 v20.4s, v25.8h, v3.8h\n" - "ldr d25, [x26, x10]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "smlal v11.4s, v30.4h, v3.4h\n" - "smlal2 v8.4s, v30.8h, v3.8h\n" - "smlal v10.4s, v28.4h, v3.4h\n" - "smlal2 v9.4s, v28.8h, v3.8h\n" - "ldr d3, [x3, #0xb8]\n" - "usubl 
v3.8h, v3.8b, v13.8b\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "ldr d25, [x5, x24]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "ldr d2, [x23, #0xb0]\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v28.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "smlal2 v7.4s, v30.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v20.4s, v24.8h, v4.8h\n" - "ldr d24, [x23, x10]\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v18.4s, v22.4h, v4.4h\n" - "smlal2 v5.4s, v22.8h, v4.8h\n" - "smlal v11.4s, v28.4h, v4.4h\n" - "smlal2 v8.4s, v28.8h, v4.8h\n" - "smlal v10.4s, v26.4h, v4.4h\n" - "smlal2 v9.4s, v26.8h, v4.8h\n" - "ldr d4, [x3, #0xc0]\n" - "usubl v4.8h, v4.8b, v13.8b\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "ldr d24, [x25, x24]\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v22.4h, v4.4h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "ldr d3, [x23, #0xb8]\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v7.4s, v28.8h, v4.8h\n" "smlal v15.4s, v27.4h, v0.4h\n" - "smlal2 v20.4s, v27.8h, v0.8h\n" - "ldr d27, [x22, x10]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal v18.4s, v23.4h, v0.4h\n" - "smlal2 v5.4s, v23.8h, v0.8h\n" - "smlal v11.4s, v25.4h, v0.4h\n" - "smlal2 v8.4s, v25.8h, v0.8h\n" - "ldr d25, [x20, x10]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal v10.4s, v24.4h, v0.4h\n" - "smlal2 v9.4s, v24.8h, v0.8h\n" + "smlal2 v16.4s, v27.8h, v0.8h\n" + "ldr d27, [x26, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v8.4s, v22.8h, v4.8h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x23, #0xc0]\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "smlal v17.4s, v23.4h, v0.4h\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "smlal v6.4s, v24.4h, v0.4h\n" + "smlal2 v7.4s, v25.8h, v0.8h\n" + "ldr d25, [x12, x24]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal2 v8.4s, v23.8h, v0.8h\n" + "smlal2 v5.4s, v24.8h, v0.8h\n" "smlal v15.4s, v23.4h, v1.4h\n" - "smlal2 v20.4s, v23.8h, v1.8h\n" - "smlal v18.4s, v31.4h, v1.4h\n" - "smlal2 v5.4s, v31.8h, v1.8h\n" - "smlal v11.4s, v24.4h, v1.4h\n" - "smlal2 v8.4s, v24.8h, v1.8h\n" - "ldr d24, [x13, x10]\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v27.4h, v1.4h\n" - "smlal2 v9.4s, v27.8h, v1.8h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal v6.4s, v27.4h, v1.4h\n" + "smlal2 v7.4s, v24.8h, v1.8h\n" + "ldr d24, [x14, x24]\n" + "smlal2 v16.4s, v23.8h, v1.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal2 v8.4s, v31.8h, v1.8h\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" "smlal v15.4s, v31.4h, v2.4h\n" - "smlal2 v20.4s, v31.8h, v2.8h\n" - "smlal v18.4s, v30.4h, v2.4h\n" - "smlal2 v5.4s, v30.8h, v2.8h\n" - "smlal v11.4s, v27.4h, v2.4h\n" - "smlal2 v8.4s, v27.8h, v2.8h\n" - "ldr d27, [x21, x10]\n" - "add x10, x10, #0x8\n" - "smlal v10.4s, v25.4h, v2.4h\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal2 v9.4s, v25.8h, v2.8h\n" + "smlal v17.4s, v30.4h, v2.4h\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v7.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x24]\n" + "smlal2 v16.4s, v31.8h, v2.8h\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v8.4s, v30.8h, v2.8h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "add x24, x24, #0x8\n" "smlal v15.4s, v30.4h, v3.4h\n" - "smlal2 v20.4s, v30.8h, v3.8h\n" - "smlal v18.4s, v28.4h, v3.4h\n" - "smlal2 v5.4s, v28.8h, v3.8h\n" - "smlal v11.4s, v25.4h, v3.4h\n" - "smlal2 v8.4s, v25.8h, v3.8h\n" - "smlal v10.4s, v24.4h, v3.4h\n" - "smlal2 v9.4s, v24.8h, v3.8h\n" + 
"smlal v17.4s, v28.4h, v3.4h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v16.4s, v30.8h, v3.8h\n" + "smlal2 v8.4s, v28.8h, v3.8h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" "smlal v15.4s, v28.4h, v4.4h\n" - "smlal2 v20.4s, v28.8h, v4.8h\n" - "smlal v18.4s, v26.4h, v4.4h\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "smlal v11.4s, v24.4h, v4.4h\n" - "smlal2 v8.4s, v24.8h, v4.8h\n" - "smlal v10.4s, v27.4h, v4.4h\n" - "smlal2 v9.4s, v27.8h, v4.8h\n" - "sqrdmulh v15.4s, v15.4s, v6.4s\n" - "sqrdmulh v20.4s, v20.4s, v17.4s\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "sqrdmulh v5.4s, v5.4s, v17.4s\n" - "and v1.16b, v15.16b, v21.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v29.16b, v20.16b, v14.16b\n" - "and v3.16b, v18.16b, v21.16b\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "and v2.16b, v5.16b, v14.16b\n" - "sqrdmulh v11.4s, v11.4s, v6.4s\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqrdmulh v8.4s, v8.4s, v17.4s\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "sqdmulh v15.4s, v15.4s, v12.4s\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "smlal v6.4s, v27.4h, v4.4h\n" + "sqdmulh v17.4s, v17.4s, v12.4s\n" + "smlal2 v16.4s, v28.8h, v4.8h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "sqdmulh v10.4s, v10.4s, v12.4s\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "sqdmulh v6.4s, v6.4s, v12.4s\n" + "and v23.16b, v15.16b, v19.16b\n" + "sqdmulh v16.4s, v16.4s, v20.4s\n" + "and v22.16b, v17.16b, v19.16b\n" + "sqdmulh v8.4s, v8.4s, v20.4s\n" + "and v21.16b, v10.16b, v19.16b\n" + "sqdmulh v7.4s, v7.4s, v20.4s\n" + "and v26.16b, v6.16b, v19.16b\n" + "sqdmulh v5.4s, v5.4s, v20.4s\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "and v4.16b, v16.16b, v29.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v2.16b, v8.16b, v29.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v3.16b, v7.16b, v29.16b\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "and v25.16b, v5.16b, v29.16b\n" + "sqadd v15.4s, v15.4s, v23.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v22.4s\n" "sshr v2.4s, v2.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v1.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "and v0.16b, v11.16b, v21.16b\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "srshl v15.4s, v15.4s, v21.4s\n" - "sqadd v20.4s, v20.4s, v29.4s\n" - "sqadd v18.4s, v18.4s, v3.4s\n" - "sqadd v5.4s, v5.4s, v2.4s\n" - "and v27.16b, v8.16b, v14.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "add v15.4s, v15.4s, v19.4s\n" - "srshl v20.4s, v20.4s, v14.4s\n" - "srshl v18.4s, v18.4s, v21.4s\n" - "srshl v5.4s, v5.4s, v14.4s\n" - "smin v15.4s, v15.4s, v12.4s\n" - "add v20.4s, v20.4s, v19.4s\n" - "add v18.4s, v18.4s, v19.4s\n" - "smax v15.4s, v15.4s, v16.4s\n" - "smin v20.4s, v20.4s, v12.4s\n" - "smin v18.4s, v18.4s, v12.4s\n" - "add v5.4s, v5.4s, v19.4s\n" - "smax v20.4s, v20.4s, v16.4s\n" - "smax v18.4s, v18.4s, v16.4s\n" - "smin v5.4s, v5.4s, v12.4s\n" - "uzp1 v15.16b, v15.16b, v20.16b\n" - "sqadd v11.4s, v11.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v21.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqadd v6.4s, v6.4s, v26.4s\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v19.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v17.4s, v17.4s, v19.4s\n" + "sqadd v8.4s, v8.4s, v2.4s\n" + "srshl v10.4s, v10.4s, v19.4s\n" + "sqadd v7.4s, v7.4s, v3.4s\n" + "srshl v6.4s, v6.4s, v19.4s\n" + "sqadd v5.4s, v5.4s, v25.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v8.4s, v8.4s, v29.4s\n" + "sqxtn v17.4h, v17.4s\n" + "srshl v7.4s, v7.4s, v29.4s\n" + "sqxtn v10.4h, v10.4s\n" + "srshl v5.4s, v5.4s, v29.4s\n" + "sqxtn v6.4h, v6.4s\n" 
+ "sqxtn2 v15.8h, v16.4s\n" + "sqxtn2 v17.8h, v8.4s\n" + "sqxtn2 v10.8h, v7.4s\n" + "sqxtn2 v6.8h, v5.4s\n" + "sqadd v15.8h, v15.8h, v18.8h\n" + "sqadd v17.8h, v17.8h, v18.8h\n" + "sqadd v10.8h, v10.8h, v18.8h\n" + "sqadd v6.8h, v6.8h, v18.8h\n" + "smax v15.8h, v15.8h, v11.8h\n" + "smax v17.8h, v17.8h, v11.8h\n" + "smax v10.8h, v10.8h, v11.8h\n" + "smax v6.8h, v6.8h, v11.8h\n" + "smin v15.8h, v15.8h, v13.8h\n" + "smin v17.8h, v17.8h, v13.8h\n" + "smin v10.8h, v10.8h, v13.8h\n" + "smin v6.8h, v6.8h, v13.8h\n" "uzp1 v15.16b, v15.16b, v15.16b\n" - "str d15, [x17, x1]\n" - "smax v5.4s, v5.4s, v16.4s\n" - "sqadd v8.4s, v8.4s, v27.4s\n" - "srshl v11.4s, v11.4s, v21.4s\n" - "and v30.16b, v10.16b, v21.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "uzp1 v18.16b, v18.16b, v5.16b\n" - "add v11.4s, v11.4s, v19.4s\n" - "srshl v8.4s, v8.4s, v14.4s\n" - "uzp1 v18.16b, v18.16b, v18.16b\n" - "str d18, [x16, x1]\n" - "smin v11.4s, v11.4s, v12.4s\n" - "sqrdmulh v9.4s, v9.4s, v17.4s\n" - "add v8.4s, v8.4s, v19.4s\n" - "sqadd v10.4s, v10.4s, v30.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" - "smin v8.4s, v8.4s, v12.4s\n" - "and v6.16b, v9.16b, v14.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "smax v8.4s, v8.4s, v16.4s\n" - "srshl v10.4s, v10.4s, v21.4s\n" - "uzp1 v11.16b, v11.16b, v8.16b\n" - "add v10.4s, v10.4s, v19.4s\n" - "uzp1 v11.16b, v11.16b, v11.16b\n" - "str d11, [x6, x1]\n" - "smin v10.4s, v10.4s, v12.4s\n" - "sqadd v9.4s, v9.4s, v6.4s\n" - "smax v10.4s, v10.4s, v16.4s\n" - "srshl v9.4s, v9.4s, v14.4s\n" - "add v9.4s, v9.4s, v19.4s\n" - "smin v9.4s, v9.4s, v12.4s\n" - "smax v9.4s, v9.4s, v16.4s\n" - "uzp1 v10.16b, v10.16b, v9.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d15, [x16, x22]\n" "uzp1 v10.16b, v10.16b, v10.16b\n" - "str d10, [x8, x1]\n" - "add x1, x1, #0x8\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str d17, [x8, x22]\n" + "str d10, [x4, x22]\n" + "str d6, [x7, x22]\n" + "add x22, x22, #0x8\n" "beq 124f\n" - "add x3, x3, #0xc8\n" + "add x23, x23, #0xc8\n" "3:" // Oddments - "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" - "tbz x4, #2, 5f\n" - "ld1 { v15.4s }, [x12], #0x10\n" - "tbz x4, #1, 4f\n" - "ld1 { v20.d }[0], [x12], #0x8\n" - "tbz x4, #0, 7f\n" - "ld1 { v20.s }[2], [x12]\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x0, #2, 5f\n" + "ld1 { v15.4s }, [x19], #0x10\n" + "tbz x0, #1, 4f\n" + "ld1 { v16.d }[0], [x19], #0x8\n" + "tbz x0, #0, 7f\n" + "ld1 { v16.s }[2], [x19]\n" "b 7f\n" "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset - "tbz x4, #0, 7f\n" - "ld1 { v20.s }[0], [x12]\n" + "tbz x0, #0, 7f\n" + "ld1 { v16.s }[0], [x19]\n" "b 7f\n" "5:" // Oddments: Load bias: Bit 2: Unset - "tbz x4, #1, 6f\n" - "ld1 { v15.d }[0], [x12], #0x8\n" - "tbz x4, #0, 7f\n" - "ld1 { v15.s }[2], [x12]\n" + "tbz x0, #1, 6f\n" + "ld1 { v15.d }[0], [x19], #0x8\n" + "tbz x0, #0, 7f\n" + "ld1 { v15.s }[2], [x19]\n" "b 7f\n" "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 7f\n" - "ld1 { v15.s }[0], [x12]\n" + "tbz x0, #0, 7f\n" + "ld1 { v15.s }[0], [x19]\n" "7:" // Oddments: Load bias: Bit 2: End - "mov v18.16b, v15.16b\n" - "ldr d0, [x3, #0x0]\n" - "mov v5.16b, v20.16b\n" - "ldr d1, [x3, #0x8]\n" - "mov v11.16b, v15.16b\n" - "ldr d2, [x3, #0x10]\n" - "mov v8.16b, v20.16b\n" - "ldr d3, [x3, #0x18]\n" + "ldr d0, [x23, #0x0]\n" + "ldr d1, [x23, #0x8]\n" + "mov v17.16b, v15.16b\n" + "mov v8.16b, v16.16b\n" + "ldr d2, [x23, #0x10]\n" + "ldr d3, [x23, #0x18]\n" "mov v10.16b, v15.16b\n" - "ldr d4, [x3, #0x20]\n" - "usubl v0.8h, v0.8b, v13.8b\n" - "mov v9.16b, v20.16b\n" - "ldp x28, x27, 
[x25, #0x0]\n" - "usubl v1.8h, v1.8b, v13.8b\n" - "ldp x26, x13, [x25, #0x10]\n" - "usubl v2.8h, v2.8b, v13.8b\n" - "usubl v3.8h, v3.8b, v13.8b\n" - "ldp x24, x23, [x25, #0x20]\n" - "usubl v4.8h, v4.8b, v13.8b\n" - "ldp x22, x21, [x25, #0x30]\n" - "ldp x20, x0, [x25, #0x40]\n" - "add x28, x28, x10\n" - "add x27, x27, x10\n" - "add x26, x26, x10\n" - "add x13, x13, x10\n" - "add x24, x24, x10\n" - "add x23, x23, x10\n" - "add x22, x22, x10\n" - "add x21, x21, x10\n" - "add x20, x20, x10\n" - "add x0, x0, x10\n" - "tbz x4, #2, 9f\n" + "mov v7.16b, v16.16b\n" + "ldr d4, [x23, #0x20]\n" + "ldp x28, x6, [x20, #0x0]\n" + "mov v6.16b, v15.16b\n" + "mov v5.16b, v16.16b\n" + "ldp x26, x25, [x20, #0x10]\n" + "ldp x5, x2, [x20, #0x20]\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "ldp x27, x21, [x20, #0x30]\n" + "ldp x12, x19, [x20, #0x40]\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "add x28, x28, x24\n" + "add x6, x6, x24\n" + "add x26, x26, x24\n" + "add x25, x25, x24\n" + "add x5, x5, x24\n" + "add x2, x2, x24\n" + "add x27, x27, x24\n" + "add x21, x21, x24\n" + "add x12, x12, x24\n" + "add x19, x19, x24\n" + "tbz x0, #2, 9f\n" "ld1 { v31.s }[0], [x28], #0x4\n" - "ld1 { v30.s }[0], [x27], #0x4\n" + "ld1 { v30.s }[0], [x6], #0x4\n" "ld1 { v29.s }[0], [x26], #0x4\n" - "ld1 { v28.s }[0], [x13], #0x4\n" - "ld1 { v27.s }[0], [x24], #0x4\n" - "ld1 { v23.s }[0], [x23], #0x4\n" - "ld1 { v25.s }[0], [x22], #0x4\n" + "ld1 { v28.s }[0], [x25], #0x4\n" + "ld1 { v27.s }[0], [x5], #0x4\n" + "ld1 { v23.s }[0], [x2], #0x4\n" + "ld1 { v25.s }[0], [x27], #0x4\n" "ld1 { v24.s }[0], [x21], #0x4\n" - "ld1 { v26.s }[0], [x20], #0x4\n" - "ld1 { v22.s }[0], [x0], #0x4\n" - "tbz x4, #1, 8f\n" + "ld1 { v26.s }[0], [x12], #0x4\n" + "ld1 { v22.s }[0], [x19], #0x4\n" + "tbz x0, #1, 8f\n" "ld1 { v31.h }[2], [x28], #0x2\n" - "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v30.h }[2], [x6], #0x2\n" "ld1 { v29.h }[2], [x26], #0x2\n" - "ld1 { v28.h }[2], [x13], #0x2\n" - "ld1 { v27.h }[2], [x24], #0x2\n" - "ld1 { v23.h }[2], [x23], #0x2\n" - "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v27.h }[2], [x5], #0x2\n" + "ld1 { v23.h }[2], [x2], #0x2\n" + "ld1 { v25.h }[2], [x27], #0x2\n" "ld1 { v24.h }[2], [x21], #0x2\n" - "ld1 { v26.h }[2], [x20], #0x2\n" - "ld1 { v22.h }[2], [x0], #0x2\n" - "tbz x4, #0, 11f\n" + "ld1 { v26.h }[2], [x12], #0x2\n" + "ld1 { v22.h }[2], [x19], #0x2\n" + "tbz x0, #0, 11f\n" "ld1 { v31.b }[6], [x28]\n" - "ld1 { v30.b }[6], [x27]\n" + "ld1 { v30.b }[6], [x6]\n" "ld1 { v29.b }[6], [x26]\n" - "ld1 { v28.b }[6], [x13]\n" - "ld1 { v27.b }[6], [x24]\n" - "ld1 { v23.b }[6], [x23]\n" - "ld1 { v25.b }[6], [x22]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v27.b }[6], [x5]\n" + "ld1 { v23.b }[6], [x2]\n" + "ld1 { v25.b }[6], [x27]\n" "ld1 { v24.b }[6], [x21]\n" - "ld1 { v26.b }[6], [x20]\n" - "ld1 { v22.b }[6], [x0]\n" + "ld1 { v26.b }[6], [x12]\n" + "ld1 { v22.b }[6], [x19]\n" "b 11f\n" "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset - "tbz x4, #0, 11f\n" + "tbz x0, #0, 11f\n" "ld1 { v31.b }[4], [x28]\n" - "ld1 { v30.b }[4], [x27]\n" + "ld1 { v30.b }[4], [x6]\n" "ld1 { v29.b }[4], [x26]\n" - "ld1 { v28.b }[4], [x13]\n" - "ld1 { v27.b }[4], [x24]\n" - "ld1 { v23.b }[4], [x23]\n" - "ld1 { v25.b }[4], [x22]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v27.b }[4], [x5]\n" + "ld1 { v23.b }[4], [x2]\n" + "ld1 { v25.b }[4], [x27]\n" "ld1 { v24.b }[4], [x21]\n" - "ld1 { v26.b }[4], [x20]\n" - "ld1 { v22.b }[4], 
[x0]\n" + "ld1 { v26.b }[4], [x12]\n" + "ld1 { v22.b }[4], [x19]\n" "b 11f\n" "9:" // Oddments: Initial loads: Bit 2: Unset - "tbz x4, #1, 10f\n" + "tbz x0, #1, 10f\n" "ld1 { v31.h }[0], [x28], #0x2\n" - "ld1 { v30.h }[0], [x27], #0x2\n" + "ld1 { v30.h }[0], [x6], #0x2\n" "ld1 { v29.h }[0], [x26], #0x2\n" - "ld1 { v28.h }[0], [x13], #0x2\n" - "ld1 { v27.h }[0], [x24], #0x2\n" - "ld1 { v23.h }[0], [x23], #0x2\n" - "ld1 { v25.h }[0], [x22], #0x2\n" + "ld1 { v28.h }[0], [x25], #0x2\n" + "ld1 { v27.h }[0], [x5], #0x2\n" + "ld1 { v23.h }[0], [x2], #0x2\n" + "ld1 { v25.h }[0], [x27], #0x2\n" "ld1 { v24.h }[0], [x21], #0x2\n" - "ld1 { v26.h }[0], [x20], #0x2\n" - "ld1 { v22.h }[0], [x0], #0x2\n" - "tbz x4, #0, 11f\n" + "ld1 { v26.h }[0], [x12], #0x2\n" + "ld1 { v22.h }[0], [x19], #0x2\n" + "tbz x0, #0, 11f\n" "ld1 { v31.b }[2], [x28]\n" - "ld1 { v30.b }[2], [x27]\n" + "ld1 { v30.b }[2], [x6]\n" "ld1 { v29.b }[2], [x26]\n" - "ld1 { v28.b }[2], [x13]\n" - "ld1 { v27.b }[2], [x24]\n" - "ld1 { v23.b }[2], [x23]\n" - "ld1 { v25.b }[2], [x22]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v27.b }[2], [x5]\n" + "ld1 { v23.b }[2], [x2]\n" + "ld1 { v25.b }[2], [x27]\n" "ld1 { v24.b }[2], [x21]\n" - "ld1 { v26.b }[2], [x20]\n" - "ld1 { v22.b }[2], [x0]\n" + "ld1 { v26.b }[2], [x12]\n" + "ld1 { v22.b }[2], [x19]\n" "b 11f\n" "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 11f\n" + "tbz x0, #0, 11f\n" "ld1 { v31.b }[0], [x28]\n" - "ld1 { v30.b }[0], [x27]\n" + "ld1 { v30.b }[0], [x6]\n" "ld1 { v29.b }[0], [x26]\n" - "ld1 { v28.b }[0], [x13]\n" - "ld1 { v27.b }[0], [x24]\n" - "ld1 { v23.b }[0], [x23]\n" - "ld1 { v25.b }[0], [x22]\n" + "ld1 { v28.b }[0], [x25]\n" + "ld1 { v27.b }[0], [x5]\n" + "ld1 { v23.b }[0], [x2]\n" + "ld1 { v25.b }[0], [x27]\n" "ld1 { v24.b }[0], [x21]\n" - "ld1 { v26.b }[0], [x20]\n" - "ld1 { v22.b }[0], [x0]\n" + "ld1 { v26.b }[0], [x12]\n" + "ld1 { v22.b }[0], [x19]\n" "11:" // Oddments: Initial loads: Bit 2: End - "ldr x20, [x25, #0x50]\n" - "usubl v31.8h, v31.8b, v7.8b\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "usubl v30.8h, v30.8b, v9.8b\n" "smlal v15.4s, v31.4h, v0.4h\n" - "usubl v30.8h, v30.8b, v7.8b\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "usubl v29.8h, v29.8b, v7.8b\n" - "usubl v28.8h, v28.8b, v7.8b\n" - "smlal v18.4s, v30.4h, v0.4h\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "usubl v23.8h, v23.8b, v7.8b\n" - "smlal v11.4s, v29.4h, v0.4h\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal2 v8.4s, v29.8h, v0.8h\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v28.4h, v0.4h\n" - "usubl v26.8h, v26.8b, v7.8b\n" - "smlal2 v9.4s, v28.8h, v0.8h\n" - "usubl v22.8h, v22.8b, v7.8b\n" + "ldr x19, [x20, #0x50]\n" + "usubl v29.8h, v29.8b, v9.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "smlal v10.4s, v29.4h, v0.4h\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "add x19, x19, x24\n" + "smlal2 v7.4s, v29.8h, v0.8h\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v6.4s, v28.4h, v0.4h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "smlal2 v20.4s, v30.8h, v1.8h\n" - "add x20, x20, x10\n" - "smlal v18.4s, v27.4h, v1.4h\n" - "smlal2 v5.4s, v27.8h, v1.8h\n" - "smlal v11.4s, v28.4h, v1.4h\n" - "smlal2 v8.4s, v28.8h, v1.8h\n" - "smlal v10.4s, v23.4h, v1.4h\n" - "smlal2 v9.4s, v23.8h, v1.8h\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "smlal v17.4s, v27.4h, v1.4h\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal2 v8.4s, v27.8h, v1.8h\n" + "smlal 
v10.4s, v28.4h, v1.4h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal2 v7.4s, v28.8h, v1.8h\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "smlal v6.4s, v23.4h, v1.4h\n" + "usubl v22.8h, v22.8b, v9.8b\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" "smlal v15.4s, v27.4h, v2.4h\n" - "smlal2 v20.4s, v27.8h, v2.8h\n" - "smlal v18.4s, v25.4h, v2.4h\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "smlal v11.4s, v23.4h, v2.4h\n" - "smlal2 v8.4s, v23.8h, v2.8h\n" - "tbz x4, #2, 13f\n" - "ld1 { v31.s }[0], [x20], #0x4\n" - "tbz x4, #1, 12f\n" - "ld1 { v31.h }[2], [x20], #0x2\n" - "tbz x4, #0, 15f\n" - "ld1 { v31.b }[6], [x20]\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "smlal v10.4s, v23.4h, v2.4h\n" + "smlal2 v7.4s, v23.8h, v2.8h\n" + "tbz x0, #2, 13f\n" + "ld1 { v31.s }[0], [x19], #0x4\n" + "tbz x0, #1, 12f\n" + "ld1 { v31.h }[2], [x19], #0x2\n" + "tbz x0, #0, 15f\n" + "ld1 { v31.b }[6], [x19]\n" "b 15f\n" "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 15f\n" - "ld1 { v31.b }[4], [x20]\n" + "tbz x0, #0, 15f\n" + "ld1 { v31.b }[4], [x19]\n" "b 15f\n" "13:" // Oddments: Load (1, 3): Bit 2: Unset - "tbz x4, #1, 14f\n" - "ld1 { v31.h }[0], [x20], #0x2\n" - "tbz x4, #0, 15f\n" - "ld1 { v31.b }[2], [x20]\n" + "tbz x0, #1, 14f\n" + "ld1 { v31.h }[0], [x19], #0x2\n" + "tbz x0, #0, 15f\n" + "ld1 { v31.b }[2], [x19]\n" "b 15f\n" "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 15f\n" - "ld1 { v31.b }[0], [x20]\n" + "tbz x0, #0, 15f\n" + "ld1 { v31.b }[0], [x19]\n" "15:" // Oddments: Load (1, 3): Bit 2: End + "usubl v31.8h, v31.8b, v9.8b\n" + "ldr x15, [x20, #0x58]\n" + "smlal v6.4s, v31.4h, v2.4h\n" + "smlal2 v5.4s, v31.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" - "ldr x28, [x25, #0x58]\n" - "usubl v31.8h, v31.8b, v7.8b\n" - "smlal2 v20.4s, v25.8h, v3.8h\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "add x28, x28, x10\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "smlal v10.4s, v31.4h, v2.4h\n" - "smlal2 v9.4s, v31.8h, v2.8h\n" - "smlal v11.4s, v31.4h, v3.4h\n" - "smlal2 v8.4s, v31.8h, v3.8h\n" - "tbz x4, #2, 17f\n" - "ld1 { v30.s }[0], [x28], #0x4\n" - "tbz x4, #1, 16f\n" - "ld1 { v30.h }[2], [x28], #0x2\n" - "tbz x4, #0, 19f\n" - "ld1 { v30.b }[6], [x28]\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "add x15, x15, x24\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "smlal v10.4s, v31.4h, v3.4h\n" + "smlal2 v7.4s, v31.8h, v3.8h\n" + "tbz x0, #2, 17f\n" + "ld1 { v30.s }[0], [x15], #0x4\n" + "tbz x0, #1, 16f\n" + "ld1 { v30.h }[2], [x15], #0x2\n" + "tbz x0, #0, 19f\n" + "ld1 { v30.b }[6], [x15]\n" "b 19f\n" "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 19f\n" - "ld1 { v30.b }[4], [x28]\n" + "tbz x0, #0, 19f\n" + "ld1 { v30.b }[4], [x15]\n" "b 19f\n" "17:" // Oddments: Load (1, 4): Bit 2: Unset - "tbz x4, #1, 18f\n" - "ld1 { v30.h }[0], [x28], #0x2\n" - "tbz x4, #0, 19f\n" - "ld1 { v30.b }[2], [x28]\n" + "tbz x0, #1, 18f\n" + "ld1 { v30.h }[0], [x15], #0x2\n" + "tbz x0, #0, 19f\n" + "ld1 { v30.b }[2], [x15]\n" "b 19f\n" "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 19f\n" - "ld1 { v30.b }[0], [x28]\n" + "tbz x0, #0, 19f\n" + "ld1 { v30.b }[0], [x15]\n" "19:" // Oddments: Load (1, 4): Bit 2: End + "usubl v30.8h, v30.8b, v9.8b\n" + "ldr x19, [x20, #0x60]\n" + "smlal v6.4s, v30.4h, v3.4h\n" + "smlal2 v5.4s, v30.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "ldr x0, [x25, #0x60]\n" - "usubl v30.8h, v30.8b, v7.8b\n" - "smlal2 v20.4s, v24.8h, v4.8h\n" - "add x0, x0, x10\n" - 
"smlal v10.4s, v30.4h, v3.4h\n" - "smlal2 v9.4s, v30.8h, v3.8h\n" - "tbz x4, #2, 21f\n" - "ld1 { v27.s }[0], [x0], #0x4\n" - "tbz x4, #1, 20f\n" - "ld1 { v27.h }[2], [x0], #0x2\n" - "tbz x4, #0, 23f\n" - "ld1 { v27.b }[6], [x0]\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "add x19, x19, x24\n" + "tbz x0, #2, 21f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x0, #1, 20f\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x0, #0, 23f\n" + "ld1 { v27.b }[6], [x19]\n" "b 23f\n" "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 23f\n" - "ld1 { v27.b }[4], [x0]\n" + "tbz x0, #0, 23f\n" + "ld1 { v27.b }[4], [x19]\n" "b 23f\n" "21:" // Oddments: Load (0, 5): Bit 2: Unset - "tbz x4, #1, 22f\n" - "ld1 { v27.h }[0], [x0], #0x2\n" - "tbz x4, #0, 23f\n" - "ld1 { v27.b }[2], [x0]\n" + "tbz x0, #1, 22f\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x0, #0, 23f\n" + "ld1 { v27.b }[2], [x19]\n" "b 23f\n" "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 23f\n" - "ld1 { v27.b }[0], [x0]\n" + "tbz x0, #0, 23f\n" + "ld1 { v27.b }[0], [x19]\n" "23:" // Oddments: Load (0, 5): Bit 2: End - "smlal v11.4s, v30.4h, v4.4h\n" - "ldr d0, [x3, #0x28]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal2 v8.4s, v30.8h, v4.8h\n" - "ldr x7, [x25, #0x68]\n" - "smlal v10.4s, v26.4h, v4.4h\n" - "usubl v0.8h, v0.8b, v13.8b\n" - "smlal2 v9.4s, v26.8h, v4.8h\n" - "add x7, x7, x10\n" - "smlal v18.4s, v27.4h, v4.4h\n" - "smlal2 v5.4s, v27.8h, v4.8h\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "ldr d0, [x23, #0x28]\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal2 v8.4s, v27.8h, v4.8h\n" + "smlal v10.4s, v30.4h, v4.4h\n" + "smlal2 v7.4s, v30.8h, v4.8h\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "ldr x27, [x20, #0x68]\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "add x27, x27, x24\n" "smlal v15.4s, v29.4h, v0.4h\n" - "smlal2 v20.4s, v29.8h, v0.8h\n" - "smlal v18.4s, v28.4h, v0.4h\n" - "smlal2 v5.4s, v28.8h, v0.8h\n" - "smlal v11.4s, v22.4h, v0.4h\n" - "smlal2 v8.4s, v22.8h, v0.8h\n" - "tbz x4, #2, 25f\n" - "ld1 { v25.s }[0], [x7], #0x4\n" - "tbz x4, #1, 24f\n" - "ld1 { v25.h }[2], [x7], #0x2\n" - "tbz x4, #0, 27f\n" - "ld1 { v25.b }[6], [x7]\n" + "smlal2 v16.4s, v29.8h, v0.8h\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal2 v8.4s, v28.8h, v0.8h\n" + "smlal v10.4s, v22.4h, v0.4h\n" + "smlal2 v7.4s, v22.8h, v0.8h\n" + "tbz x0, #2, 25f\n" + "ld1 { v25.s }[0], [x27], #0x4\n" + "tbz x0, #1, 24f\n" + "ld1 { v25.h }[2], [x27], #0x2\n" + "tbz x0, #0, 27f\n" + "ld1 { v25.b }[6], [x27]\n" "b 27f\n" "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 27f\n" - "ld1 { v25.b }[4], [x7]\n" + "tbz x0, #0, 27f\n" + "ld1 { v25.b }[4], [x27]\n" "b 27f\n" "25:" // Oddments: Load (2, 1): Bit 2: Unset - "tbz x4, #1, 26f\n" - "ld1 { v25.h }[0], [x7], #0x2\n" - "tbz x4, #0, 27f\n" - "ld1 { v25.b }[2], [x7]\n" + "tbz x0, #1, 26f\n" + "ld1 { v25.h }[0], [x27], #0x2\n" + "tbz x0, #0, 27f\n" + "ld1 { v25.b }[2], [x27]\n" "b 27f\n" "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 27f\n" - "ld1 { v25.b }[0], [x7]\n" + "tbz x0, #0, 27f\n" + "ld1 { v25.b }[0], [x27]\n" "27:" // Oddments: Load (2, 1): Bit 2: End - "ldr d1, [x3, #0x30]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal v10.4s, v25.4h, v0.4h\n" - "ldr x26, [x25, #0x70]\n" - "usubl v1.8h, v1.8b, v13.8b\n" - "smlal2 v9.4s, v25.8h, v0.8h\n" - "add x26, x26, x10\n" + "ldr d1, [x23, #0x30]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "ldr x5, [x20, #0x70]\n" + "smlal v6.4s, v25.4h, v0.4h\n" + "smlal2 v5.4s, 
v25.8h, v0.8h\n" + "add x5, x5, x24\n" "smlal v15.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "smlal v18.4s, v23.4h, v1.4h\n" - "smlal2 v5.4s, v23.8h, v1.8h\n" - "smlal v11.4s, v25.4h, v1.4h\n" - "smlal2 v8.4s, v25.8h, v1.8h\n" - "tbz x4, #2, 29f\n" - "ld1 { v24.s }[0], [x26], #0x4\n" - "tbz x4, #1, 28f\n" - "ld1 { v24.h }[2], [x26], #0x2\n" - "tbz x4, #0, 31f\n" - "ld1 { v24.b }[6], [x26]\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "smlal v17.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "smlal v10.4s, v25.4h, v1.4h\n" + "smlal2 v7.4s, v25.8h, v1.8h\n" + "tbz x0, #2, 29f\n" + "ld1 { v24.s }[0], [x5], #0x4\n" + "tbz x0, #1, 28f\n" + "ld1 { v24.h }[2], [x5], #0x2\n" + "tbz x0, #0, 31f\n" + "ld1 { v24.b }[6], [x5]\n" "b 31f\n" "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 31f\n" - "ld1 { v24.b }[4], [x26]\n" + "tbz x0, #0, 31f\n" + "ld1 { v24.b }[4], [x5]\n" "b 31f\n" "29:" // Oddments: Load (2, 2): Bit 2: Unset - "tbz x4, #1, 30f\n" - "ld1 { v24.h }[0], [x26], #0x2\n" - "tbz x4, #0, 31f\n" - "ld1 { v24.b }[2], [x26]\n" + "tbz x0, #1, 30f\n" + "ld1 { v24.h }[0], [x5], #0x2\n" + "tbz x0, #0, 31f\n" + "ld1 { v24.b }[2], [x5]\n" "b 31f\n" "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 31f\n" - "ld1 { v24.b }[0], [x26]\n" + "tbz x0, #0, 31f\n" + "ld1 { v24.b }[0], [x5]\n" "31:" // Oddments: Load (2, 2): Bit 2: End - "ldr d2, [x3, #0x38]\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v24.4h, v1.4h\n" - "ldr x23, [x25, #0x78]\n" - "usubl v2.8h, v2.8b, v13.8b\n" - "smlal2 v9.4s, v24.8h, v1.8h\n" - "add x23, x23, x10\n" + "ldr d2, [x23, #0x38]\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "ldr x11, [x20, #0x78]\n" + "smlal v6.4s, v24.4h, v1.4h\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "add x11, x11, x24\n" "smlal v15.4s, v23.4h, v2.4h\n" - "smlal2 v20.4s, v23.8h, v2.8h\n" - "smlal v18.4s, v31.4h, v2.4h\n" - "smlal2 v5.4s, v31.8h, v2.8h\n" - "smlal v11.4s, v24.4h, v2.4h\n" - "smlal2 v8.4s, v24.8h, v2.8h\n" - "tbz x4, #2, 33f\n" - "ld1 { v27.s }[0], [x23], #0x4\n" - "tbz x4, #1, 32f\n" - "ld1 { v27.h }[2], [x23], #0x2\n" - "tbz x4, #0, 35f\n" - "ld1 { v27.b }[6], [x23]\n" + "smlal2 v16.4s, v23.8h, v2.8h\n" + "smlal v17.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "smlal v10.4s, v24.4h, v2.4h\n" + "smlal2 v7.4s, v24.8h, v2.8h\n" + "tbz x0, #2, 33f\n" + "ld1 { v27.s }[0], [x11], #0x4\n" + "tbz x0, #1, 32f\n" + "ld1 { v27.h }[2], [x11], #0x2\n" + "tbz x0, #0, 35f\n" + "ld1 { v27.b }[6], [x11]\n" "b 35f\n" "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 35f\n" - "ld1 { v27.b }[4], [x23]\n" + "tbz x0, #0, 35f\n" + "ld1 { v27.b }[4], [x11]\n" "b 35f\n" "33:" // Oddments: Load (2, 3): Bit 2: Unset - "tbz x4, #1, 34f\n" - "ld1 { v27.h }[0], [x23], #0x2\n" - "tbz x4, #0, 35f\n" - "ld1 { v27.b }[2], [x23]\n" + "tbz x0, #1, 34f\n" + "ld1 { v27.h }[0], [x11], #0x2\n" + "tbz x0, #0, 35f\n" + "ld1 { v27.b }[2], [x11]\n" "b 35f\n" "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 35f\n" - "ld1 { v27.b }[0], [x23]\n" + "tbz x0, #0, 35f\n" + "ld1 { v27.b }[0], [x11]\n" "35:" // Oddments: Load (2, 3): Bit 2: End - "ldr d3, [x3, #0x40]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal v10.4s, v27.4h, v2.4h\n" - "ldr x20, [x25, #0x80]\n" - "usubl v3.8h, v3.8b, v13.8b\n" - "smlal2 v9.4s, v27.8h, v2.8h\n" - "add x20, x20, x10\n" + "ldr d3, [x23, #0x40]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "ldr x12, [x20, #0x80]\n" + "smlal v6.4s, v27.4h, 
v2.4h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "add x12, x12, x24\n" "smlal v15.4s, v31.4h, v3.4h\n" - "smlal2 v20.4s, v31.8h, v3.8h\n" - "smlal v18.4s, v30.4h, v3.4h\n" - "smlal2 v5.4s, v30.8h, v3.8h\n" - "smlal v11.4s, v27.4h, v3.4h\n" - "smlal2 v8.4s, v27.8h, v3.8h\n" - "tbz x4, #2, 37f\n" - "ld1 { v23.s }[0], [x20], #0x4\n" - "tbz x4, #1, 36f\n" - "ld1 { v23.h }[2], [x20], #0x2\n" - "tbz x4, #0, 39f\n" - "ld1 { v23.b }[6], [x20]\n" + "smlal2 v16.4s, v31.8h, v3.8h\n" + "smlal v17.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "smlal v10.4s, v27.4h, v3.4h\n" + "smlal2 v7.4s, v27.8h, v3.8h\n" + "tbz x0, #2, 37f\n" + "ld1 { v23.s }[0], [x12], #0x4\n" + "tbz x0, #1, 36f\n" + "ld1 { v23.h }[2], [x12], #0x2\n" + "tbz x0, #0, 39f\n" + "ld1 { v23.b }[6], [x12]\n" "b 39f\n" "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 39f\n" - "ld1 { v23.b }[4], [x20]\n" + "tbz x0, #0, 39f\n" + "ld1 { v23.b }[4], [x12]\n" "b 39f\n" "37:" // Oddments: Load (2, 4): Bit 2: Unset - "tbz x4, #1, 38f\n" - "ld1 { v23.h }[0], [x20], #0x2\n" - "tbz x4, #0, 39f\n" - "ld1 { v23.b }[2], [x20]\n" + "tbz x0, #1, 38f\n" + "ld1 { v23.h }[0], [x12], #0x2\n" + "tbz x0, #0, 39f\n" + "ld1 { v23.b }[2], [x12]\n" "b 39f\n" "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 39f\n" - "ld1 { v23.b }[0], [x20]\n" + "tbz x0, #0, 39f\n" + "ld1 { v23.b }[0], [x12]\n" "39:" // Oddments: Load (2, 4): Bit 2: End - "ldr d4, [x3, #0x48]\n" - "usubl v23.8h, v23.8b, v7.8b\n" - "smlal v10.4s, v23.4h, v3.4h\n" - "ldr x22, [x25, #0x88]\n" - "usubl v4.8h, v4.8b, v13.8b\n" - "smlal2 v9.4s, v23.8h, v3.8h\n" - "add x22, x22, x10\n" + "ldr d4, [x23, #0x48]\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "ldr x26, [x20, #0x88]\n" + "smlal v6.4s, v23.4h, v3.4h\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "add x26, x26, x24\n" "smlal v15.4s, v30.4h, v4.4h\n" - "smlal2 v20.4s, v30.8h, v4.8h\n" - "smlal v18.4s, v26.4h, v4.4h\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "smlal v11.4s, v23.4h, v4.4h\n" - "smlal2 v8.4s, v23.8h, v4.8h\n" - "tbz x4, #2, 41f\n" - "ld1 { v28.s }[0], [x22], #0x4\n" - "tbz x4, #1, 40f\n" - "ld1 { v28.h }[2], [x22], #0x2\n" - "tbz x4, #0, 43f\n" - "ld1 { v28.b }[6], [x22]\n" + "smlal2 v16.4s, v30.8h, v4.8h\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "smlal v10.4s, v23.4h, v4.4h\n" + "smlal2 v7.4s, v23.8h, v4.8h\n" + "tbz x0, #2, 41f\n" + "ld1 { v28.s }[0], [x26], #0x4\n" + "tbz x0, #1, 40f\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "tbz x0, #0, 43f\n" + "ld1 { v28.b }[6], [x26]\n" "b 43f\n" "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 43f\n" - "ld1 { v28.b }[4], [x22]\n" + "tbz x0, #0, 43f\n" + "ld1 { v28.b }[4], [x26]\n" "b 43f\n" "41:" // Oddments: Load (2, 5): Bit 2: Unset - "tbz x4, #1, 42f\n" - "ld1 { v28.h }[0], [x22], #0x2\n" - "tbz x4, #0, 43f\n" - "ld1 { v28.b }[2], [x22]\n" + "tbz x0, #1, 42f\n" + "ld1 { v28.h }[0], [x26], #0x2\n" + "tbz x0, #0, 43f\n" + "ld1 { v28.b }[2], [x26]\n" "b 43f\n" "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 43f\n" - "ld1 { v28.b }[0], [x22]\n" + "tbz x0, #0, 43f\n" + "ld1 { v28.b }[0], [x26]\n" "43:" // Oddments: Load (2, 5): Bit 2: End - "ldr d0, [x3, #0x50]\n" - "usubl v28.8h, v28.8b, v7.8b\n" - "smlal v10.4s, v28.4h, v4.4h\n" - "ldr x13, [x25, #0x90]\n" - "usubl v0.8h, v0.8b, v13.8b\n" - "smlal2 v9.4s, v28.8h, v4.8h\n" - "add x13, x13, x10\n" + "ldr d0, [x23, #0x50]\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "ldr x14, [x20, 
#0x90]\n" + "smlal v6.4s, v28.4h, v4.4h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "add x14, x14, x24\n" "smlal v15.4s, v22.4h, v0.4h\n" - "smlal2 v20.4s, v22.8h, v0.8h\n" - "smlal v18.4s, v25.4h, v0.4h\n" - "smlal2 v5.4s, v25.8h, v0.8h\n" - "tbz x4, #2, 45f\n" - "ld1 { v31.s }[0], [x13], #0x4\n" - "tbz x4, #1, 44f\n" - "ld1 { v31.h }[2], [x13], #0x2\n" - "tbz x4, #0, 47f\n" - "ld1 { v31.b }[6], [x13]\n" + "smlal2 v16.4s, v22.8h, v0.8h\n" + "smlal v17.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "tbz x0, #2, 45f\n" + "ld1 { v31.s }[0], [x14], #0x4\n" + "tbz x0, #1, 44f\n" + "ld1 { v31.h }[2], [x14], #0x2\n" + "tbz x0, #0, 47f\n" + "ld1 { v31.b }[6], [x14]\n" "b 47f\n" "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 47f\n" - "ld1 { v31.b }[4], [x13]\n" + "tbz x0, #0, 47f\n" + "ld1 { v31.b }[4], [x14]\n" "b 47f\n" "45:" // Oddments: Load (3, 0): Bit 2: Unset - "tbz x4, #1, 46f\n" - "ld1 { v31.h }[0], [x13], #0x2\n" - "tbz x4, #0, 47f\n" - "ld1 { v31.b }[2], [x13]\n" + "tbz x0, #1, 46f\n" + "ld1 { v31.h }[0], [x14], #0x2\n" + "tbz x0, #0, 47f\n" + "ld1 { v31.b }[2], [x14]\n" "b 47f\n" "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 47f\n" - "ld1 { v31.b }[0], [x13]\n" + "tbz x0, #0, 47f\n" + "ld1 { v31.b }[0], [x14]\n" "47:" // Oddments: Load (3, 0): Bit 2: End - "ldr x21, [x25, #0x98]\n" - "usubl v31.8h, v31.8b, v7.8b\n" - "smlal v11.4s, v31.4h, v0.4h\n" - "smlal2 v8.4s, v31.8h, v0.8h\n" - "add x21, x21, x10\n" - "tbz x4, #2, 49f\n" - "ld1 { v30.s }[0], [x21], #0x4\n" - "tbz x4, #1, 48f\n" - "ld1 { v30.h }[2], [x21], #0x2\n" - "tbz x4, #0, 51f\n" - "ld1 { v30.b }[6], [x21]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "ldr x15, [x20, #0x98]\n" + "smlal v10.4s, v31.4h, v0.4h\n" + "smlal2 v7.4s, v31.8h, v0.8h\n" + "add x15, x15, x24\n" + "tbz x0, #2, 49f\n" + "ld1 { v30.s }[0], [x15], #0x4\n" + "tbz x0, #1, 48f\n" + "ld1 { v30.h }[2], [x15], #0x2\n" + "tbz x0, #0, 51f\n" + "ld1 { v30.b }[6], [x15]\n" "b 51f\n" "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 51f\n" - "ld1 { v30.b }[4], [x21]\n" + "tbz x0, #0, 51f\n" + "ld1 { v30.b }[4], [x15]\n" "b 51f\n" "49:" // Oddments: Load (3, 1): Bit 2: Unset - "tbz x4, #1, 50f\n" - "ld1 { v30.h }[0], [x21], #0x2\n" - "tbz x4, #0, 51f\n" - "ld1 { v30.b }[2], [x21]\n" + "tbz x0, #1, 50f\n" + "ld1 { v30.h }[0], [x15], #0x2\n" + "tbz x0, #0, 51f\n" + "ld1 { v30.b }[2], [x15]\n" "b 51f\n" "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 51f\n" - "ld1 { v30.b }[0], [x21]\n" + "tbz x0, #0, 51f\n" + "ld1 { v30.b }[0], [x15]\n" "51:" // Oddments: Load (3, 1): Bit 2: End - "ldr d1, [x3, #0x58]\n" - "usubl v30.8h, v30.8b, v7.8b\n" - "smlal v10.4s, v30.4h, v0.4h\n" - "ldr x14, [x25, #0xa0]\n" - "usubl v1.8h, v1.8b, v13.8b\n" - "smlal2 v9.4s, v30.8h, v0.8h\n" - "add x14, x14, x10\n" + "ldr d1, [x23, #0x58]\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "ldr x21, [x20, #0xa0]\n" + "smlal v6.4s, v30.4h, v0.4h\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "add x21, x21, x24\n" "smlal v15.4s, v25.4h, v1.4h\n" - "smlal2 v20.4s, v25.8h, v1.8h\n" - "smlal v18.4s, v24.4h, v1.4h\n" - "smlal2 v5.4s, v24.8h, v1.8h\n" - "smlal v11.4s, v30.4h, v1.4h\n" - "smlal2 v8.4s, v30.8h, v1.8h\n" - "tbz x4, #2, 53f\n" - "ld1 { v26.s }[0], [x14], #0x4\n" - "tbz x4, #1, 52f\n" - "ld1 { v26.h }[2], [x14], #0x2\n" - "tbz x4, #0, 55f\n" - "ld1 { v26.b }[6], [x14]\n" + "smlal2 v16.4s, v25.8h, v1.8h\n" + "smlal v17.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "smlal v10.4s, 
v30.4h, v1.4h\n" + "smlal2 v7.4s, v30.8h, v1.8h\n" + "tbz x0, #2, 53f\n" + "ld1 { v26.s }[0], [x21], #0x4\n" + "tbz x0, #1, 52f\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "tbz x0, #0, 55f\n" + "ld1 { v26.b }[6], [x21]\n" "b 55f\n" "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 55f\n" - "ld1 { v26.b }[4], [x14]\n" + "tbz x0, #0, 55f\n" + "ld1 { v26.b }[4], [x21]\n" "b 55f\n" "53:" // Oddments: Load (3, 2): Bit 2: Unset - "tbz x4, #1, 54f\n" - "ld1 { v26.h }[0], [x14], #0x2\n" - "tbz x4, #0, 55f\n" - "ld1 { v26.b }[2], [x14]\n" + "tbz x0, #1, 54f\n" + "ld1 { v26.h }[0], [x21], #0x2\n" + "tbz x0, #0, 55f\n" + "ld1 { v26.b }[2], [x21]\n" "b 55f\n" "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 55f\n" - "ld1 { v26.b }[0], [x14]\n" + "tbz x0, #0, 55f\n" + "ld1 { v26.b }[0], [x21]\n" "55:" // Oddments: Load (3, 2): Bit 2: End - "ldr d2, [x3, #0x60]\n" - "usubl v26.8h, v26.8b, v7.8b\n" - "smlal v10.4s, v26.4h, v1.4h\n" - "ldr x11, [x25, #0xa8]\n" - "usubl v2.8h, v2.8b, v13.8b\n" - "smlal2 v9.4s, v26.8h, v1.8h\n" - "add x11, x11, x10\n" + "ldr d2, [x23, #0x60]\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "ldr x2, [x20, #0xa8]\n" + "smlal v6.4s, v26.4h, v1.4h\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "add x2, x2, x24\n" "smlal v15.4s, v24.4h, v2.4h\n" - "smlal2 v20.4s, v24.8h, v2.8h\n" - "smlal v18.4s, v27.4h, v2.4h\n" - "smlal2 v5.4s, v27.8h, v2.8h\n" - "smlal v11.4s, v26.4h, v2.4h\n" - "smlal2 v8.4s, v26.8h, v2.8h\n" - "tbz x4, #2, 57f\n" - "ld1 { v25.s }[0], [x11], #0x4\n" - "tbz x4, #1, 56f\n" - "ld1 { v25.h }[2], [x11], #0x2\n" - "tbz x4, #0, 59f\n" - "ld1 { v25.b }[6], [x11]\n" + "smlal2 v16.4s, v24.8h, v2.8h\n" + "smlal v17.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "smlal v10.4s, v26.4h, v2.4h\n" + "smlal2 v7.4s, v26.8h, v2.8h\n" + "tbz x0, #2, 57f\n" + "ld1 { v25.s }[0], [x2], #0x4\n" + "tbz x0, #1, 56f\n" + "ld1 { v25.h }[2], [x2], #0x2\n" + "tbz x0, #0, 59f\n" + "ld1 { v25.b }[6], [x2]\n" "b 59f\n" "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 59f\n" - "ld1 { v25.b }[4], [x11]\n" + "tbz x0, #0, 59f\n" + "ld1 { v25.b }[4], [x2]\n" "b 59f\n" "57:" // Oddments: Load (3, 3): Bit 2: Unset - "tbz x4, #1, 58f\n" - "ld1 { v25.h }[0], [x11], #0x2\n" - "tbz x4, #0, 59f\n" - "ld1 { v25.b }[2], [x11]\n" + "tbz x0, #1, 58f\n" + "ld1 { v25.h }[0], [x2], #0x2\n" + "tbz x0, #0, 59f\n" + "ld1 { v25.b }[2], [x2]\n" "b 59f\n" "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 59f\n" - "ld1 { v25.b }[0], [x11]\n" + "tbz x0, #0, 59f\n" + "ld1 { v25.b }[0], [x2]\n" "59:" // Oddments: Load (3, 3): Bit 2: End - "ldr d3, [x3, #0x68]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal v10.4s, v25.4h, v2.4h\n" - "ldr x24, [x25, #0xb0]\n" - "usubl v3.8h, v3.8b, v13.8b\n" - "smlal2 v9.4s, v25.8h, v2.8h\n" - "add x24, x24, x10\n" + "ldr d3, [x23, #0x68]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "ldr x13, [x20, #0xb0]\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "add x13, x13, x24\n" "smlal v15.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" - "smlal v18.4s, v23.4h, v3.4h\n" - "smlal2 v5.4s, v23.8h, v3.8h\n" - "smlal v11.4s, v25.4h, v3.4h\n" - "smlal2 v8.4s, v25.8h, v3.8h\n" - "tbz x4, #2, 61f\n" - "ld1 { v24.s }[0], [x24], #0x4\n" - "tbz x4, #1, 60f\n" - "ld1 { v24.h }[2], [x24], #0x2\n" - "tbz x4, #0, 63f\n" - "ld1 { v24.b }[6], [x24]\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "smlal v17.4s, v23.4h, v3.4h\n" + "smlal2 v8.4s, v23.8h, 
v3.8h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" + "tbz x0, #2, 61f\n" + "ld1 { v24.s }[0], [x13], #0x4\n" + "tbz x0, #1, 60f\n" + "ld1 { v24.h }[2], [x13], #0x2\n" + "tbz x0, #0, 63f\n" + "ld1 { v24.b }[6], [x13]\n" "b 63f\n" "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 63f\n" - "ld1 { v24.b }[4], [x24]\n" + "tbz x0, #0, 63f\n" + "ld1 { v24.b }[4], [x13]\n" "b 63f\n" "61:" // Oddments: Load (3, 4): Bit 2: Unset - "tbz x4, #1, 62f\n" - "ld1 { v24.h }[0], [x24], #0x2\n" - "tbz x4, #0, 63f\n" - "ld1 { v24.b }[2], [x24]\n" + "tbz x0, #1, 62f\n" + "ld1 { v24.h }[0], [x13], #0x2\n" + "tbz x0, #0, 63f\n" + "ld1 { v24.b }[2], [x13]\n" "b 63f\n" "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 63f\n" - "ld1 { v24.b }[0], [x24]\n" + "tbz x0, #0, 63f\n" + "ld1 { v24.b }[0], [x13]\n" "63:" // Oddments: Load (3, 4): Bit 2: End - "ldr d4, [x3, #0x70]\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v24.4h, v3.4h\n" - "ldr x0, [x25, #0xb8]\n" - "usubl v4.8h, v4.8b, v13.8b\n" - "smlal2 v9.4s, v24.8h, v3.8h\n" - "add x0, x0, x10\n" + "ldr d4, [x23, #0x70]\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "ldr x9, [x20, #0xb8]\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "add x9, x9, x24\n" "smlal v15.4s, v23.4h, v4.4h\n" - "smlal2 v20.4s, v23.8h, v4.8h\n" - "smlal v18.4s, v28.4h, v4.4h\n" - "smlal2 v5.4s, v28.8h, v4.8h\n" - "smlal v11.4s, v24.4h, v4.4h\n" - "smlal2 v8.4s, v24.8h, v4.8h\n" - "tbz x4, #2, 65f\n" - "ld1 { v22.s }[0], [x0], #0x4\n" - "tbz x4, #1, 64f\n" - "ld1 { v22.h }[2], [x0], #0x2\n" - "tbz x4, #0, 67f\n" - "ld1 { v22.b }[6], [x0]\n" + "smlal2 v16.4s, v23.8h, v4.8h\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" + "tbz x0, #2, 65f\n" + "ld1 { v22.s }[0], [x9], #0x4\n" + "tbz x0, #1, 64f\n" + "ld1 { v22.h }[2], [x9], #0x2\n" + "tbz x0, #0, 67f\n" + "ld1 { v22.b }[6], [x9]\n" "b 67f\n" "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 67f\n" - "ld1 { v22.b }[4], [x0]\n" + "tbz x0, #0, 67f\n" + "ld1 { v22.b }[4], [x9]\n" "b 67f\n" "65:" // Oddments: Load (3, 5): Bit 2: Unset - "tbz x4, #1, 66f\n" - "ld1 { v22.h }[0], [x0], #0x2\n" - "tbz x4, #0, 67f\n" - "ld1 { v22.b }[2], [x0]\n" + "tbz x0, #1, 66f\n" + "ld1 { v22.h }[0], [x9], #0x2\n" + "tbz x0, #0, 67f\n" + "ld1 { v22.b }[2], [x9]\n" "b 67f\n" "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 67f\n" - "ld1 { v22.b }[0], [x0]\n" + "tbz x0, #0, 67f\n" + "ld1 { v22.b }[0], [x9]\n" "67:" // Oddments: Load (3, 5): Bit 2: End - "ldr d0, [x3, #0x78]\n" - "usubl v22.8h, v22.8b, v7.8b\n" - "smlal v10.4s, v22.4h, v4.4h\n" - "ldr x15, [x25, #0xc0]\n" - "usubl v0.8h, v0.8b, v13.8b\n" - "smlal2 v9.4s, v22.8h, v4.8h\n" - "add x15, x15, x10\n" + "ldr d0, [x23, #0x78]\n" + "usubl v22.8h, v22.8b, v9.8b\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "ldr x19, [x20, #0xc0]\n" + "smlal v6.4s, v22.4h, v4.4h\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "add x19, x19, x24\n" "smlal v15.4s, v31.4h, v0.4h\n" - "smlal2 v20.4s, v31.8h, v0.8h\n" - "smlal v18.4s, v30.4h, v0.4h\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "tbz x4, #2, 69f\n" - "ld1 { v27.s }[0], [x15], #0x4\n" - "tbz x4, #1, 68f\n" - "ld1 { v27.h }[2], [x15], #0x2\n" - "tbz x4, #0, 71f\n" - "ld1 { v27.b }[6], [x15]\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "tbz x0, #2, 69f\n" + "ld1 { v27.s }[0], 
[x19], #0x4\n" + "tbz x0, #1, 68f\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x0, #0, 71f\n" + "ld1 { v27.b }[6], [x19]\n" "b 71f\n" "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 71f\n" - "ld1 { v27.b }[4], [x15]\n" + "tbz x0, #0, 71f\n" + "ld1 { v27.b }[4], [x19]\n" "b 71f\n" "69:" // Oddments: Load (4, 0): Bit 2: Unset - "tbz x4, #1, 70f\n" - "ld1 { v27.h }[0], [x15], #0x2\n" - "tbz x4, #0, 71f\n" - "ld1 { v27.b }[2], [x15]\n" + "tbz x0, #1, 70f\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x0, #0, 71f\n" + "ld1 { v27.b }[2], [x19]\n" "b 71f\n" "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 71f\n" - "ld1 { v27.b }[0], [x15]\n" + "tbz x0, #0, 71f\n" + "ld1 { v27.b }[0], [x19]\n" "71:" // Oddments: Load (4, 0): Bit 2: End - "ldr x9, [x25, #0xc8]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal v11.4s, v27.4h, v0.4h\n" - "smlal2 v8.4s, v27.8h, v0.8h\n" - "add x9, x9, x10\n" - "tbz x4, #2, 73f\n" - "ld1 { v23.s }[0], [x9], #0x4\n" - "tbz x4, #1, 72f\n" - "ld1 { v23.h }[2], [x9], #0x2\n" - "tbz x4, #0, 75f\n" - "ld1 { v23.b }[6], [x9]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "ldr x28, [x20, #0xc8]\n" + "smlal v10.4s, v27.4h, v0.4h\n" + "smlal2 v7.4s, v27.8h, v0.8h\n" + "add x28, x28, x24\n" + "tbz x0, #2, 73f\n" + "ld1 { v23.s }[0], [x28], #0x4\n" + "tbz x0, #1, 72f\n" + "ld1 { v23.h }[2], [x28], #0x2\n" + "tbz x0, #0, 75f\n" + "ld1 { v23.b }[6], [x28]\n" "b 75f\n" "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 75f\n" - "ld1 { v23.b }[4], [x9]\n" + "tbz x0, #0, 75f\n" + "ld1 { v23.b }[4], [x28]\n" "b 75f\n" "73:" // Oddments: Load (4, 1): Bit 2: Unset - "tbz x4, #1, 74f\n" - "ld1 { v23.h }[0], [x9], #0x2\n" - "tbz x4, #0, 75f\n" - "ld1 { v23.b }[2], [x9]\n" + "tbz x0, #1, 74f\n" + "ld1 { v23.h }[0], [x28], #0x2\n" + "tbz x0, #0, 75f\n" + "ld1 { v23.b }[2], [x28]\n" "b 75f\n" "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 75f\n" - "ld1 { v23.b }[0], [x9]\n" + "tbz x0, #0, 75f\n" + "ld1 { v23.b }[0], [x28]\n" "75:" // Oddments: Load (4, 1): Bit 2: End - "ldr d1, [x3, #0x80]\n" - "usubl v23.8h, v23.8b, v7.8b\n" - "smlal v10.4s, v23.4h, v0.4h\n" - "ldr x27, [x25, #0xd0]\n" - "usubl v1.8h, v1.8b, v13.8b\n" - "smlal2 v9.4s, v23.8h, v0.8h\n" - "add x27, x27, x10\n" + "ldr d1, [x23, #0x80]\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "ldr x6, [x20, #0xd0]\n" + "smlal v6.4s, v23.4h, v0.4h\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "add x6, x6, x24\n" "smlal v15.4s, v30.4h, v1.4h\n" - "smlal2 v20.4s, v30.8h, v1.8h\n" - "smlal v18.4s, v26.4h, v1.4h\n" - "smlal2 v5.4s, v26.8h, v1.8h\n" - "smlal v11.4s, v23.4h, v1.4h\n" - "smlal2 v8.4s, v23.8h, v1.8h\n" - "tbz x4, #2, 77f\n" - "ld1 { v31.s }[0], [x27], #0x4\n" - "tbz x4, #1, 76f\n" - "ld1 { v31.h }[2], [x27], #0x2\n" - "tbz x4, #0, 79f\n" - "ld1 { v31.b }[6], [x27]\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "smlal v17.4s, v26.4h, v1.4h\n" + "smlal2 v8.4s, v26.8h, v1.8h\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "smlal2 v7.4s, v23.8h, v1.8h\n" + "tbz x0, #2, 77f\n" + "ld1 { v31.s }[0], [x6], #0x4\n" + "tbz x0, #1, 76f\n" + "ld1 { v31.h }[2], [x6], #0x2\n" + "tbz x0, #0, 79f\n" + "ld1 { v31.b }[6], [x6]\n" "b 79f\n" "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 79f\n" - "ld1 { v31.b }[4], [x27]\n" + "tbz x0, #0, 79f\n" + "ld1 { v31.b }[4], [x6]\n" "b 79f\n" "77:" // Oddments: Load (4, 2): Bit 2: Unset - "tbz x4, #1, 78f\n" - "ld1 { v31.h }[0], [x27], #0x2\n" - "tbz x4, #0, 79f\n" - "ld1 { v31.b }[2], [x27]\n" + "tbz x0, #1, 78f\n" 
+ "ld1 { v31.h }[0], [x6], #0x2\n" + "tbz x0, #0, 79f\n" + "ld1 { v31.b }[2], [x6]\n" "b 79f\n" "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 79f\n" - "ld1 { v31.b }[0], [x27]\n" + "tbz x0, #0, 79f\n" + "ld1 { v31.b }[0], [x6]\n" "79:" // Oddments: Load (4, 2): Bit 2: End - "ldr d2, [x3, #0x88]\n" - "usubl v31.8h, v31.8b, v7.8b\n" - "smlal v10.4s, v31.4h, v1.4h\n" - "ldr x28, [x25, #0xd8]\n" - "usubl v2.8h, v2.8b, v13.8b\n" - "smlal2 v9.4s, v31.8h, v1.8h\n" - "add x28, x28, x10\n" + "ldr d2, [x23, #0x88]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "ldr x27, [x20, #0xd8]\n" + "smlal v6.4s, v31.4h, v1.4h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "add x27, x27, x24\n" "smlal v15.4s, v26.4h, v2.4h\n" - "smlal2 v20.4s, v26.8h, v2.8h\n" - "smlal v18.4s, v25.4h, v2.4h\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "smlal v11.4s, v31.4h, v2.4h\n" - "smlal2 v8.4s, v31.8h, v2.8h\n" - "tbz x4, #2, 81f\n" - "ld1 { v30.s }[0], [x28], #0x4\n" - "tbz x4, #1, 80f\n" - "ld1 { v30.h }[2], [x28], #0x2\n" - "tbz x4, #0, 83f\n" - "ld1 { v30.b }[6], [x28]\n" + "smlal2 v16.4s, v26.8h, v2.8h\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "smlal2 v7.4s, v31.8h, v2.8h\n" + "tbz x0, #2, 81f\n" + "ld1 { v30.s }[0], [x27], #0x4\n" + "tbz x0, #1, 80f\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "tbz x0, #0, 83f\n" + "ld1 { v30.b }[6], [x27]\n" "b 83f\n" "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 83f\n" - "ld1 { v30.b }[4], [x28]\n" + "tbz x0, #0, 83f\n" + "ld1 { v30.b }[4], [x27]\n" "b 83f\n" "81:" // Oddments: Load (4, 3): Bit 2: Unset - "tbz x4, #1, 82f\n" - "ld1 { v30.h }[0], [x28], #0x2\n" - "tbz x4, #0, 83f\n" - "ld1 { v30.b }[2], [x28]\n" + "tbz x0, #1, 82f\n" + "ld1 { v30.h }[0], [x27], #0x2\n" + "tbz x0, #0, 83f\n" + "ld1 { v30.b }[2], [x27]\n" "b 83f\n" "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 83f\n" - "ld1 { v30.b }[0], [x28]\n" + "tbz x0, #0, 83f\n" + "ld1 { v30.b }[0], [x27]\n" "83:" // Oddments: Load (4, 3): Bit 2: End - "ldr d3, [x3, #0x90]\n" - "usubl v30.8h, v30.8b, v7.8b\n" - "smlal v10.4s, v30.4h, v2.4h\n" - "ldr x12, [x25, #0xe0]\n" - "usubl v3.8h, v3.8b, v13.8b\n" - "smlal2 v9.4s, v30.8h, v2.8h\n" - "add x12, x12, x10\n" + "ldr d3, [x23, #0x90]\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "ldr x11, [x20, #0xe0]\n" + "smlal v6.4s, v30.4h, v2.4h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "add x11, x11, x24\n" "smlal v15.4s, v25.4h, v3.4h\n" - "smlal2 v20.4s, v25.8h, v3.8h\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "smlal v11.4s, v30.4h, v3.4h\n" - "smlal2 v8.4s, v30.8h, v3.8h\n" - "tbz x4, #2, 85f\n" - "ld1 { v28.s }[0], [x12], #0x4\n" - "tbz x4, #1, 84f\n" - "ld1 { v28.h }[2], [x12], #0x2\n" - "tbz x4, #0, 87f\n" - "ld1 { v28.b }[6], [x12]\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "smlal2 v7.4s, v30.8h, v3.8h\n" + "tbz x0, #2, 85f\n" + "ld1 { v28.s }[0], [x11], #0x4\n" + "tbz x0, #1, 84f\n" + "ld1 { v28.h }[2], [x11], #0x2\n" + "tbz x0, #0, 87f\n" + "ld1 { v28.b }[6], [x11]\n" "b 87f\n" "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 87f\n" - "ld1 { v28.b }[4], [x12]\n" + "tbz x0, #0, 87f\n" + "ld1 { v28.b }[4], [x11]\n" "b 87f\n" "85:" // Oddments: Load (4, 4): Bit 2: Unset - "tbz x4, #1, 86f\n" - "ld1 { v28.h }[0], [x12], #0x2\n" - "tbz x4, #0, 87f\n" - "ld1 { v28.b }[2], 
[x12]\n" + "tbz x0, #1, 86f\n" + "ld1 { v28.h }[0], [x11], #0x2\n" + "tbz x0, #0, 87f\n" + "ld1 { v28.b }[2], [x11]\n" "b 87f\n" "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 87f\n" - "ld1 { v28.b }[0], [x12]\n" + "tbz x0, #0, 87f\n" + "ld1 { v28.b }[0], [x11]\n" "87:" // Oddments: Load (4, 4): Bit 2: End - "ldr d4, [x3, #0x98]\n" - "usubl v28.8h, v28.8b, v7.8b\n" - "smlal v10.4s, v28.4h, v3.4h\n" - "ldr x7, [x25, #0xe8]\n" - "usubl v4.8h, v4.8b, v13.8b\n" - "smlal2 v9.4s, v28.8h, v3.8h\n" - "add x7, x7, x10\n" + "ldr d4, [x23, #0x98]\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "ldr x17, [x20, #0xe8]\n" + "smlal v6.4s, v28.4h, v3.4h\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "add x17, x17, x24\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v20.4s, v24.8h, v4.8h\n" - "smlal v18.4s, v22.4h, v4.4h\n" - "smlal2 v5.4s, v22.8h, v4.8h\n" - "smlal v11.4s, v28.4h, v4.4h\n" - "smlal2 v8.4s, v28.8h, v4.8h\n" - "tbz x4, #2, 89f\n" - "ld1 { v26.s }[0], [x7], #0x4\n" - "tbz x4, #1, 88f\n" - "ld1 { v26.h }[2], [x7], #0x2\n" - "tbz x4, #0, 91f\n" - "ld1 { v26.b }[6], [x7]\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "smlal v17.4s, v22.4h, v4.4h\n" + "smlal2 v8.4s, v22.8h, v4.8h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "smlal2 v7.4s, v28.8h, v4.8h\n" + "tbz x0, #2, 89f\n" + "ld1 { v26.s }[0], [x17], #0x4\n" + "tbz x0, #1, 88f\n" + "ld1 { v26.h }[2], [x17], #0x2\n" + "tbz x0, #0, 91f\n" + "ld1 { v26.b }[6], [x17]\n" "b 91f\n" "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 91f\n" - "ld1 { v26.b }[4], [x7]\n" + "tbz x0, #0, 91f\n" + "ld1 { v26.b }[4], [x17]\n" "b 91f\n" "89:" // Oddments: Load (4, 5): Bit 2: Unset - "tbz x4, #1, 90f\n" - "ld1 { v26.h }[0], [x7], #0x2\n" - "tbz x4, #0, 91f\n" - "ld1 { v26.b }[2], [x7]\n" + "tbz x0, #1, 90f\n" + "ld1 { v26.h }[0], [x17], #0x2\n" + "tbz x0, #0, 91f\n" + "ld1 { v26.b }[2], [x17]\n" "b 91f\n" "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 91f\n" - "ld1 { v26.b }[0], [x7]\n" + "tbz x0, #0, 91f\n" + "ld1 { v26.b }[0], [x17]\n" "91:" // Oddments: Load (4, 5): Bit 2: End - "ldr d0, [x3, #0xa0]\n" - "usubl v26.8h, v26.8b, v7.8b\n" - "smlal v10.4s, v26.4h, v4.4h\n" - "ldr x26, [x25, #0xf0]\n" - "usubl v0.8h, v0.8b, v13.8b\n" - "smlal2 v9.4s, v26.8h, v4.8h\n" - "add x26, x26, x10\n" + "ldr d0, [x23, #0xa0]\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "usubl v0.8h, v0.8b, v14.8b\n" + "ldr x5, [x20, #0xf0]\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "add x5, x5, x24\n" "smlal v15.4s, v27.4h, v0.4h\n" - "smlal2 v20.4s, v27.8h, v0.8h\n" - "smlal v18.4s, v23.4h, v0.4h\n" - "smlal2 v5.4s, v23.8h, v0.8h\n" - "tbz x4, #2, 93f\n" - "ld1 { v25.s }[0], [x26], #0x4\n" - "tbz x4, #1, 92f\n" - "ld1 { v25.h }[2], [x26], #0x2\n" - "tbz x4, #0, 95f\n" - "ld1 { v25.b }[6], [x26]\n" + "smlal2 v16.4s, v27.8h, v0.8h\n" + "smlal v17.4s, v23.4h, v0.4h\n" + "smlal2 v8.4s, v23.8h, v0.8h\n" + "tbz x0, #2, 93f\n" + "ld1 { v25.s }[0], [x5], #0x4\n" + "tbz x0, #1, 92f\n" + "ld1 { v25.h }[2], [x5], #0x2\n" + "tbz x0, #0, 95f\n" + "ld1 { v25.b }[6], [x5]\n" "b 95f\n" "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 95f\n" - "ld1 { v25.b }[4], [x26]\n" + "tbz x0, #0, 95f\n" + "ld1 { v25.b }[4], [x5]\n" "b 95f\n" "93:" // Oddments: Load (5, 0): Bit 2: Unset - "tbz x4, #1, 94f\n" - "ld1 { v25.h }[0], [x26], #0x2\n" - "tbz x4, #0, 95f\n" - "ld1 { v25.b }[2], [x26]\n" + "tbz x0, #1, 94f\n" + "ld1 { v25.h }[0], [x5], #0x2\n" + "tbz x0, #0, 95f\n" + "ld1 { v25.b }[2], [x5]\n" "b 
95f\n" "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 95f\n" - "ld1 { v25.b }[0], [x26]\n" + "tbz x0, #0, 95f\n" + "ld1 { v25.b }[0], [x5]\n" "95:" // Oddments: Load (5, 0): Bit 2: End - "ldr x23, [x25, #0xf8]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal v11.4s, v25.4h, v0.4h\n" - "smlal2 v8.4s, v25.8h, v0.8h\n" - "add x23, x23, x10\n" - "tbz x4, #2, 97f\n" - "ld1 { v24.s }[0], [x23], #0x4\n" - "tbz x4, #1, 96f\n" - "ld1 { v24.h }[2], [x23], #0x2\n" - "tbz x4, #0, 99f\n" - "ld1 { v24.b }[6], [x23]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "ldr x25, [x20, #0xf8]\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "smlal2 v7.4s, v25.8h, v0.8h\n" + "add x25, x25, x24\n" + "tbz x0, #2, 97f\n" + "ld1 { v24.s }[0], [x25], #0x4\n" + "tbz x0, #1, 96f\n" + "ld1 { v24.h }[2], [x25], #0x2\n" + "tbz x0, #0, 99f\n" + "ld1 { v24.b }[6], [x25]\n" "b 99f\n" "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 99f\n" - "ld1 { v24.b }[4], [x23]\n" + "tbz x0, #0, 99f\n" + "ld1 { v24.b }[4], [x25]\n" "b 99f\n" "97:" // Oddments: Load (5, 1): Bit 2: Unset - "tbz x4, #1, 98f\n" - "ld1 { v24.h }[0], [x23], #0x2\n" - "tbz x4, #0, 99f\n" - "ld1 { v24.b }[2], [x23]\n" + "tbz x0, #1, 98f\n" + "ld1 { v24.h }[0], [x25], #0x2\n" + "tbz x0, #0, 99f\n" + "ld1 { v24.b }[2], [x25]\n" "b 99f\n" "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 99f\n" - "ld1 { v24.b }[0], [x23]\n" + "tbz x0, #0, 99f\n" + "ld1 { v24.b }[0], [x25]\n" "99:" // Oddments: Load (5, 1): Bit 2: End - "ldr d1, [x3, #0xa8]\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v24.4h, v0.4h\n" - "ldr x22, [x25, #0x100]\n" - "usubl v1.8h, v1.8b, v13.8b\n" - "smlal2 v9.4s, v24.8h, v0.8h\n" - "add x22, x22, x10\n" + "ldr d1, [x23, #0xa8]\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "usubl v1.8h, v1.8b, v14.8b\n" + "ldr x26, [x20, #0x100]\n" + "smlal v6.4s, v24.4h, v0.4h\n" + "smlal2 v5.4s, v24.8h, v0.8h\n" + "add x26, x26, x24\n" "smlal v15.4s, v23.4h, v1.4h\n" - "smlal2 v20.4s, v23.8h, v1.8h\n" - "smlal v18.4s, v31.4h, v1.4h\n" - "smlal2 v5.4s, v31.8h, v1.8h\n" - "smlal v11.4s, v24.4h, v1.4h\n" - "smlal2 v8.4s, v24.8h, v1.8h\n" - "tbz x4, #2, 101f\n" - "ld1 { v27.s }[0], [x22], #0x4\n" - "tbz x4, #1, 100f\n" - "ld1 { v27.h }[2], [x22], #0x2\n" - "tbz x4, #0, 103f\n" - "ld1 { v27.b }[6], [x22]\n" + "smlal2 v16.4s, v23.8h, v1.8h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal2 v8.4s, v31.8h, v1.8h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal2 v7.4s, v24.8h, v1.8h\n" + "tbz x0, #2, 101f\n" + "ld1 { v27.s }[0], [x26], #0x4\n" + "tbz x0, #1, 100f\n" + "ld1 { v27.h }[2], [x26], #0x2\n" + "tbz x0, #0, 103f\n" + "ld1 { v27.b }[6], [x26]\n" "b 103f\n" "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 103f\n" - "ld1 { v27.b }[4], [x22]\n" + "tbz x0, #0, 103f\n" + "ld1 { v27.b }[4], [x26]\n" "b 103f\n" "101:" // Oddments: Load (5, 2): Bit 2: Unset - "tbz x4, #1, 102f\n" - "ld1 { v27.h }[0], [x22], #0x2\n" - "tbz x4, #0, 103f\n" - "ld1 { v27.b }[2], [x22]\n" + "tbz x0, #1, 102f\n" + "ld1 { v27.h }[0], [x26], #0x2\n" + "tbz x0, #0, 103f\n" + "ld1 { v27.b }[2], [x26]\n" "b 103f\n" "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 103f\n" - "ld1 { v27.b }[0], [x22]\n" + "tbz x0, #0, 103f\n" + "ld1 { v27.b }[0], [x26]\n" "103:" // Oddments: Load (5, 2): Bit 2: End - "ldr d2, [x3, #0xb0]\n" - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal v10.4s, v27.4h, v1.4h\n" - "ldr x20, [x25, #0x108]\n" - "usubl v2.8h, v2.8b, v13.8b\n" - "smlal2 v9.4s, v27.8h, v1.8h\n" - "add x20, x20, x10\n" + "ldr d2, 
[x23, #0xb0]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "usubl v2.8h, v2.8b, v14.8b\n" + "ldr x12, [x20, #0x108]\n" + "smlal v6.4s, v27.4h, v1.4h\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" + "add x12, x12, x24\n" "smlal v15.4s, v31.4h, v2.4h\n" - "smlal2 v20.4s, v31.8h, v2.8h\n" - "smlal v18.4s, v30.4h, v2.4h\n" - "smlal2 v5.4s, v30.8h, v2.8h\n" - "smlal v11.4s, v27.4h, v2.4h\n" - "smlal2 v8.4s, v27.8h, v2.8h\n" - "tbz x4, #2, 105f\n" - "ld1 { v25.s }[0], [x20], #0x4\n" - "tbz x4, #1, 104f\n" - "ld1 { v25.h }[2], [x20], #0x2\n" - "tbz x4, #0, 107f\n" - "ld1 { v25.b }[6], [x20]\n" + "smlal2 v16.4s, v31.8h, v2.8h\n" + "smlal v17.4s, v30.4h, v2.4h\n" + "smlal2 v8.4s, v30.8h, v2.8h\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "smlal2 v7.4s, v27.8h, v2.8h\n" + "tbz x0, #2, 105f\n" + "ld1 { v25.s }[0], [x12], #0x4\n" + "tbz x0, #1, 104f\n" + "ld1 { v25.h }[2], [x12], #0x2\n" + "tbz x0, #0, 107f\n" + "ld1 { v25.b }[6], [x12]\n" "b 107f\n" "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 107f\n" - "ld1 { v25.b }[4], [x20]\n" + "tbz x0, #0, 107f\n" + "ld1 { v25.b }[4], [x12]\n" "b 107f\n" "105:" // Oddments: Load (5, 3): Bit 2: Unset - "tbz x4, #1, 106f\n" - "ld1 { v25.h }[0], [x20], #0x2\n" - "tbz x4, #0, 107f\n" - "ld1 { v25.b }[2], [x20]\n" + "tbz x0, #1, 106f\n" + "ld1 { v25.h }[0], [x12], #0x2\n" + "tbz x0, #0, 107f\n" + "ld1 { v25.b }[2], [x12]\n" "b 107f\n" "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 107f\n" - "ld1 { v25.b }[0], [x20]\n" + "tbz x0, #0, 107f\n" + "ld1 { v25.b }[0], [x12]\n" "107:" // Oddments: Load (5, 3): Bit 2: End - "ldr d3, [x3, #0xb8]\n" - "usubl v25.8h, v25.8b, v7.8b\n" - "smlal v10.4s, v25.4h, v2.4h\n" - "ldr x13, [x25, #0x110]\n" - "usubl v3.8h, v3.8b, v13.8b\n" - "smlal2 v9.4s, v25.8h, v2.8h\n" - "add x13, x13, x10\n" + "ldr d3, [x23, #0xb8]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "usubl v3.8h, v3.8b, v14.8b\n" + "ldr x14, [x20, #0x110]\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "add x14, x14, x24\n" "smlal v15.4s, v30.4h, v3.4h\n" - "smlal2 v20.4s, v30.8h, v3.8h\n" - "smlal v18.4s, v28.4h, v3.4h\n" - "smlal2 v5.4s, v28.8h, v3.8h\n" - "smlal v11.4s, v25.4h, v3.4h\n" - "smlal2 v8.4s, v25.8h, v3.8h\n" - "tbz x4, #2, 109f\n" - "ld1 { v24.s }[0], [x13], #0x4\n" - "tbz x4, #1, 108f\n" - "ld1 { v24.h }[2], [x13], #0x2\n" - "tbz x4, #0, 111f\n" - "ld1 { v24.b }[6], [x13]\n" + "smlal2 v16.4s, v30.8h, v3.8h\n" + "smlal v17.4s, v28.4h, v3.4h\n" + "smlal2 v8.4s, v28.8h, v3.8h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" + "tbz x0, #2, 109f\n" + "ld1 { v24.s }[0], [x14], #0x4\n" + "tbz x0, #1, 108f\n" + "ld1 { v24.h }[2], [x14], #0x2\n" + "tbz x0, #0, 111f\n" + "ld1 { v24.b }[6], [x14]\n" "b 111f\n" "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 111f\n" - "ld1 { v24.b }[4], [x13]\n" + "tbz x0, #0, 111f\n" + "ld1 { v24.b }[4], [x14]\n" "b 111f\n" "109:" // Oddments: Load (5, 4): Bit 2: Unset - "tbz x4, #1, 110f\n" - "ld1 { v24.h }[0], [x13], #0x2\n" - "tbz x4, #0, 111f\n" - "ld1 { v24.b }[2], [x13]\n" + "tbz x0, #1, 110f\n" + "ld1 { v24.h }[0], [x14], #0x2\n" + "tbz x0, #0, 111f\n" + "ld1 { v24.b }[2], [x14]\n" "b 111f\n" "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 111f\n" - "ld1 { v24.b }[0], [x13]\n" + "tbz x0, #0, 111f\n" + "ld1 { v24.b }[0], [x14]\n" "111:" // Oddments: Load (5, 4): Bit 2: End - "ldr d4, [x3, #0xc0]\n" - "usubl v24.8h, v24.8b, v7.8b\n" - "smlal v10.4s, v24.4h, v3.4h\n" - "ldr x21, [x25, #0x118]\n" - "usubl v4.8h, v4.8b, 
v13.8b\n" - "smlal2 v9.4s, v24.8h, v3.8h\n" - "add x21, x21, x10\n" + "ldr d4, [x23, #0xc0]\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "usubl v4.8h, v4.8b, v14.8b\n" + "ldr x21, [x20, #0x118]\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "add x21, x21, x24\n" "smlal v15.4s, v28.4h, v4.4h\n" - "smlal2 v20.4s, v28.8h, v4.8h\n" - "smlal v18.4s, v26.4h, v4.4h\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "smlal v11.4s, v24.4h, v4.4h\n" - "smlal2 v8.4s, v24.8h, v4.8h\n" - "tbz x4, #2, 113f\n" + "smlal2 v16.4s, v28.8h, v4.8h\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" + "tbz x0, #2, 113f\n" "ld1 { v27.s }[0], [x21], #0x4\n" - "tbz x4, #1, 112f\n" + "tbz x0, #1, 112f\n" "ld1 { v27.h }[2], [x21], #0x2\n" - "tbz x4, #0, 115f\n" + "tbz x0, #0, 115f\n" "ld1 { v27.b }[6], [x21]\n" "b 115f\n" "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 115f\n" + "tbz x0, #0, 115f\n" "ld1 { v27.b }[4], [x21]\n" "b 115f\n" "113:" // Oddments: Load (5, 5): Bit 2: Unset - "tbz x4, #1, 114f\n" + "tbz x0, #1, 114f\n" "ld1 { v27.h }[0], [x21], #0x2\n" - "tbz x4, #0, 115f\n" + "tbz x0, #0, 115f\n" "ld1 { v27.b }[2], [x21]\n" "b 115f\n" "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 115f\n" + "tbz x0, #0, 115f\n" "ld1 { v27.b }[0], [x21]\n" "115:" // Oddments: Load (5, 5): Bit 2: End - "usubl v27.8h, v27.8b, v7.8b\n" - "smlal v10.4s, v27.4h, v4.4h\n" - "smlal2 v9.4s, v27.8h, v4.8h\n" - "tbz x4, #2, 117f\n" - "ld1 { v6.4s }, [x2], #0x10\n" - "ld1 { v21.4s }, [x5], #0x10\n" - "tbz x4, #1, 116f\n" - "ld1 { v17.d }[0], [x2], #0x8\n" - "ld1 { v14.d }[0], [x5], #0x8\n" - "tbz x4, #0, 119f\n" - "ld1 { v17.s }[2], [x2]\n" - "ld1 { v14.s }[2], [x5]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v6.4s, v27.4h, v4.4h\n" + "smlal2 v5.4s, v27.8h, v4.8h\n" + "tbz x0, #2, 117f\n" + "ld1 { v12.4s }, [x10], #0x10\n" + "ld1 { v19.4s }, [x1], #0x10\n" + "tbz x0, #1, 116f\n" + "ld1 { v20.d }[0], [x10], #0x8\n" + "ld1 { v29.d }[0], [x1], #0x8\n" + "tbz x0, #0, 119f\n" + "ld1 { v20.s }[2], [x10]\n" + "ld1 { v29.s }[2], [x1]\n" "b 119f\n" "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset - "tbz x4, #0, 119f\n" - "ld1 { v17.s }[0], [x2]\n" - "ld1 { v14.s }[0], [x5]\n" + "tbz x0, #0, 119f\n" + "ld1 { v20.s }[0], [x10]\n" + "ld1 { v29.s }[0], [x1]\n" "b 119f\n" "117:" // Oddments: Load requant params: Bit 2: Unset - "tbz x4, #1, 118f\n" - "ld1 { v6.d }[0], [x2], #0x8\n" - "ld1 { v21.d }[0], [x5], #0x8\n" - "tbz x4, #0, 119f\n" - "ld1 { v6.s }[2], [x2]\n" - "ld1 { v21.s }[2], [x5]\n" + "tbz x0, #1, 118f\n" + "ld1 { v12.d }[0], [x10], #0x8\n" + "ld1 { v19.d }[0], [x1], #0x8\n" + "tbz x0, #0, 119f\n" + "ld1 { v12.s }[2], [x10]\n" + "ld1 { v19.s }[2], [x1]\n" "b 119f\n" "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 119f\n" - "ld1 { v6.s }[0], [x2]\n" - "ld1 { v21.s }[0], [x5]\n" + "tbz x0, #0, 119f\n" + "ld1 { v12.s }[0], [x10]\n" + "ld1 { v19.s }[0], [x1]\n" "119:" // Oddments: Load requant params: Bit 2: End - "sqrdmulh v15.4s, v15.4s, v6.4s\n" - "add x17, x17, x1\n" - "sqrdmulh v20.4s, v20.4s, v17.4s\n" - "add x16, x16, x1\n" - "sqrdmulh v18.4s, v18.4s, v6.4s\n" - "add x6, x6, x1\n" - "sqrdmulh v5.4s, v5.4s, v17.4s\n" - "add x8, x8, x1\n" - "sqrdmulh v11.4s, v11.4s, v6.4s\n" - "and v1.16b, v15.16b, v21.16b\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "and v29.16b, v20.16b, v14.16b\n" - "and v3.16b, v18.16b, v21.16b\n" - "sshr v29.4s, v29.4s, #0x1f\n" - "and 
v2.16b, v5.16b, v14.16b\n" - "and v0.16b, v11.16b, v21.16b\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "sqrdmulh v8.4s, v8.4s, v17.4s\n" + "sqrdmulh v15.4s, v15.4s, v12.4s\n" + "sqrdmulh v17.4s, v17.4s, v12.4s\n" + "add x16, x16, x22\n" + "add x8, x8, x22\n" + "sqrdmulh v10.4s, v10.4s, v12.4s\n" + "sqrdmulh v6.4s, v6.4s, v12.4s\n" + "add x4, x4, x22\n" + "add x7, x7, x22\n" + "and v23.16b, v15.16b, v19.16b\n" + "sqrdmulh v16.4s, v16.4s, v20.4s\n" + "and v22.16b, v17.16b, v19.16b\n" + "sqrdmulh v8.4s, v8.4s, v20.4s\n" + "and v21.16b, v10.16b, v19.16b\n" + "sqrdmulh v7.4s, v7.4s, v20.4s\n" + "and v26.16b, v6.16b, v19.16b\n" + "sqrdmulh v5.4s, v5.4s, v20.4s\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "and v4.16b, v16.16b, v29.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v2.16b, v8.16b, v29.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v3.16b, v7.16b, v29.16b\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "and v25.16b, v5.16b, v29.16b\n" + "sqadd v15.4s, v15.4s, v23.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v17.4s, v17.4s, v22.4s\n" "sshr v2.4s, v2.4s, #0x1f\n" - "sqadd v15.4s, v15.4s, v1.4s\n" - "sqrdmulh v10.4s, v10.4s, v6.4s\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "sqrdmulh v9.4s, v9.4s, v17.4s\n" - "sqadd v20.4s, v20.4s, v29.4s\n" - "sqadd v18.4s, v18.4s, v3.4s\n" - "srshl v15.4s, v15.4s, v21.4s\n" - "sqadd v5.4s, v5.4s, v2.4s\n" - "srshl v20.4s, v20.4s, v14.4s\n" - "srshl v18.4s, v18.4s, v21.4s\n" - "add v15.4s, v15.4s, v19.4s\n" - "srshl v5.4s, v5.4s, v14.4s\n" - "add v20.4s, v20.4s, v19.4s\n" - "smin v15.4s, v15.4s, v12.4s\n" - "add v18.4s, v18.4s, v19.4s\n" - "smin v20.4s, v20.4s, v12.4s\n" - "smax v15.4s, v15.4s, v16.4s\n" - "smin v18.4s, v18.4s, v12.4s\n" - "smax v20.4s, v20.4s, v16.4s\n" - "add v5.4s, v5.4s, v19.4s\n" - "smax v18.4s, v18.4s, v16.4s\n" - "uzp1 v15.16b, v15.16b, v20.16b\n" - "smin v5.4s, v5.4s, v12.4s\n" + "sqadd v10.4s, v10.4s, v21.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqadd v6.4s, v6.4s, v26.4s\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v19.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v17.4s, v17.4s, v19.4s\n" + "sqadd v8.4s, v8.4s, v2.4s\n" + "srshl v10.4s, v10.4s, v19.4s\n" + "sqadd v7.4s, v7.4s, v3.4s\n" + "srshl v6.4s, v6.4s, v19.4s\n" + "sqadd v5.4s, v5.4s, v25.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v8.4s, v8.4s, v29.4s\n" + "sqxtn v17.4h, v17.4s\n" + "srshl v7.4s, v7.4s, v29.4s\n" + "sqxtn v10.4h, v10.4s\n" + "srshl v5.4s, v5.4s, v29.4s\n" + "sqxtn v6.4h, v6.4s\n" + "sqxtn2 v15.8h, v16.4s\n" + "sqxtn2 v17.8h, v8.4s\n" + "sqxtn2 v10.8h, v7.4s\n" + "sqxtn2 v6.8h, v5.4s\n" + "sqadd v15.8h, v15.8h, v18.8h\n" + "sqadd v17.8h, v17.8h, v18.8h\n" + "sqadd v10.8h, v10.8h, v18.8h\n" + "sqadd v6.8h, v6.8h, v18.8h\n" + "smax v15.8h, v15.8h, v11.8h\n" + "smax v17.8h, v17.8h, v11.8h\n" + "smax v10.8h, v10.8h, v11.8h\n" + "smax v6.8h, v6.8h, v11.8h\n" + "smin v15.8h, v15.8h, v13.8h\n" + "smin v17.8h, v17.8h, v13.8h\n" + "smin v10.8h, v10.8h, v13.8h\n" + "smin v6.8h, v6.8h, v13.8h\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" "uzp1 v15.16b, v15.16b, v15.16b\n" - "sqadd v11.4s, v11.4s, v0.4s\n" - "smax v5.4s, v5.4s, v16.4s\n" - "and v27.16b, v8.16b, v14.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "uzp1 v18.16b, v18.16b, v5.16b\n" - "srshl v11.4s, v11.4s, v21.4s\n" - "and v30.16b, v10.16b, v21.16b\n" - "sshr v30.4s, v30.4s, #0x1f\n" - "uzp1 v18.16b, v18.16b, v18.16b\n" - "add v11.4s, v11.4s, v19.4s\n" - "sqadd v8.4s, v8.4s, v27.4s\n" - "and v6.16b, v9.16b, v14.16b\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "smin v11.4s, v11.4s, v12.4s\n" - "srshl v8.4s, v8.4s, v14.4s\n" - "sqadd v10.4s, v10.4s,
v30.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" - "add v8.4s, v8.4s, v19.4s\n" - "srshl v10.4s, v10.4s, v21.4s\n" - "sqadd v9.4s, v9.4s, v6.4s\n" - "smin v8.4s, v8.4s, v12.4s\n" - "add v10.4s, v10.4s, v19.4s\n" - "srshl v9.4s, v9.4s, v14.4s\n" - "smax v8.4s, v8.4s, v16.4s\n" - "smin v10.4s, v10.4s, v12.4s\n" - "uzp1 v11.16b, v11.16b, v8.16b\n" - "add v9.4s, v9.4s, v19.4s\n" - "uzp1 v11.16b, v11.16b, v11.16b\n" - "smax v10.4s, v10.4s, v16.4s\n" - "smin v9.4s, v9.4s, v12.4s\n" - "smax v9.4s, v9.4s, v16.4s\n" - "uzp1 v10.16b, v10.16b, v9.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" "uzp1 v10.16b, v10.16b, v10.16b\n" - "tbz x4, #2, 121f\n" - "st1 { v15.s }[0], [x17], #0x4\n" - "st1 { v18.s }[0], [x16], #0x4\n" - "st1 { v11.s }[0], [x6], #0x4\n" - "st1 { v10.s }[0], [x8], #0x4\n" - "tbz x4, #1, 120f\n" - "st1 { v15.h }[2], [x17], #0x2\n" - "st1 { v18.h }[2], [x16], #0x2\n" - "st1 { v11.h }[2], [x6], #0x2\n" - "st1 { v10.h }[2], [x8], #0x2\n" - "tbz x4, #0, 123f\n" - "st1 { v15.b }[6], [x17], #0x1\n" - "st1 { v18.b }[6], [x16], #0x1\n" - "st1 { v11.b }[6], [x6], #0x1\n" - "st1 { v10.b }[6], [x8], #0x1\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "tbz x0, #2, 121f\n" + "st1 { v15.s }[0], [x16], #0x4\n" + "st1 { v17.s }[0], [x8], #0x4\n" + "st1 { v10.s }[0], [x4], #0x4\n" + "st1 { v6.s }[0], [x7], #0x4\n" + "tbz x0, #1, 120f\n" + "st1 { v15.h }[2], [x16], #0x2\n" + "st1 { v17.h }[2], [x8], #0x2\n" + "st1 { v10.h }[2], [x4], #0x2\n" + "st1 { v6.h }[2], [x7], #0x2\n" + "tbz x0, #0, 123f\n" + "st1 { v15.b }[6], [x16], #0x1\n" + "st1 { v17.b }[6], [x8], #0x1\n" + "st1 { v10.b }[6], [x4], #0x1\n" + "st1 { v6.b }[6], [x7], #0x1\n" "b 123f\n" "120:" // Oddments: Bit 2: Bit 1: Unset - "tbz x4, #0, 123f\n" - "st1 { v15.b }[4], [x17], #0x1\n" - "st1 { v18.b }[4], [x16], #0x1\n" - "st1 { v11.b }[4], [x6], #0x1\n" - "st1 { v10.b }[4], [x8], #0x1\n" + "tbz x0, #0, 123f\n" + "st1 { v15.b }[4], [x16], #0x1\n" + "st1 { v17.b }[4], [x8], #0x1\n" + "st1 { v10.b }[4], [x4], #0x1\n" + "st1 { v6.b }[4], [x7], #0x1\n" "b 123f\n" "121:" // Oddments: Bit 2: Unset - "tbz x4, #1, 122f\n" - "st1 { v15.h }[0], [x17], #0x2\n" - "st1 { v18.h }[0], [x16], #0x2\n" - "st1 { v11.h }[0], [x6], #0x2\n" - "st1 { v10.h }[0], [x8], #0x2\n" - "tbz x4, #0, 123f\n" - "st1 { v15.b }[2], [x17], #0x1\n" - "st1 { v18.b }[2], [x16], #0x1\n" - "st1 { v11.b }[2], [x6], #0x1\n" - "st1 { v10.b }[2], [x8], #0x1\n" + "tbz x0, #1, 122f\n" + "st1 { v15.h }[0], [x16], #0x2\n" + "st1 { v17.h }[0], [x8], #0x2\n" + "st1 { v10.h }[0], [x4], #0x2\n" + "st1 { v6.h }[0], [x7], #0x2\n" + "tbz x0, #0, 123f\n" + "st1 { v15.b }[2], [x16], #0x1\n" + "st1 { v17.b }[2], [x8], #0x1\n" + "st1 { v10.b }[2], [x4], #0x1\n" + "st1 { v6.b }[2], [x7], #0x1\n" "b 123f\n" "122:" // Oddments: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 123f\n" - "st1 { v15.b }[0], [x17], #0x1\n" - "st1 { v18.b }[0], [x16], #0x1\n" - "st1 { v11.b }[0], [x6], #0x1\n" - "st1 { v10.b }[0], [x8], #0x1\n" + "tbz x0, #0, 123f\n" + "st1 { v15.b }[0], [x16], #0x1\n" + "st1 { v17.b }[0], [x8], #0x1\n" + "st1 { v10.b }[0], [x4], #0x1\n" + "st1 { v6.b }[0], [x7], #0x1\n" "123:" // Oddments: Bit 2: End - "124:" // End - : : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" 
(offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (¶ms) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp index f5459c2ac1..b859978b1e 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_nhwc_generic_output9_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -28,28 +28,23 @@ #pragma once +#if defined(__aarch64__) + namespace arm_conv { namespace depthwise { void a64_u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int); -struct a64_u8q_nhwc_generic_output9_mla_depthfirst +class a64_u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - constexpr static unsigned int n_output_points = 9; + KernelType kernel = a64_u8q_nhwc_generic_output9_mla_depthfirst_impl; - kern_type kernel = a64_u8q_nhwc_generic_output9_mla_depthfirst_impl; + public: + a64_u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy(9, arm_gemm::VLType::None) {} - a64_u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {} + KernelType get_kernel() const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp index e8ac603928..134f657fb8 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
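(The a64_u8q_nhwc_generic_output9 hunk just above is representative of the structural change running through these headers: the per-kernel structs full of typedefs and constexpr metadata become classes deriving from a shared strategy base, and each kernel only supplies its function pointer through a virtual getter. A self-contained sketch of the pattern, with invented names rather than the library's actual classes:)

#include <cstdint>

namespace sketch {

// Stand-in for the kernel signature; the real strategies template this.
using KernelFn = void (*)(const std::uint8_t *const *, std::uint8_t *const *, const void *, unsigned int);

// Common metadata lives in the base (cf. GenericDepthfirstKernelStrategy).
class KernelStrategyBase
{
    unsigned int m_n_output_points;
  public:
    explicit KernelStrategyBase(unsigned int n_output_points) : m_n_output_points(n_output_points) {}
    virtual ~KernelStrategyBase() = default;
    unsigned int n_output_points() const { return m_n_output_points; }
    virtual KernelFn get_kernel() const = 0;
};

inline void example_kernel_impl(const std::uint8_t *const *, std::uint8_t *const *, const void *, unsigned int) {}

// Each concrete kernel shrinks to a constructor call plus one override,
// matching the shape of the refactored class above.
class ExampleKernel : public KernelStrategyBase
{
    KernelFn kernel = example_kernel_impl;
  public:
    ExampleKernel() : KernelStrategyBase(9) {}
    KernelFn get_kernel() const override { return kernel; }
};

} // namespace sketch

(The driver can then hold every kernel behind one base-class pointer instead of re-instantiating templated glue per struct, which is what lets the duplicated boilerplate in these headers be deleted.)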
* * SPDX-License-Identifier: MIT * @@ -28,39 +28,33 @@ #pragma once +#if defined(__aarch64__) + namespace arm_conv { namespace depthwise { void a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); -struct a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst +struct a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy { - typedef uint32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - + using Parent = DepthfirstMultiplierStrategy; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 2; constexpr static unsigned int stride_cols = 2; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 4; - - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 9; - constexpr static unsigned int input_col_quads = 1; + a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) + : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols) + { + } - kern_type kernel = a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl; + arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; } - a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp index c5e0417c20..b575a5d169 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -28,39 +28,33 @@ #pragma once +#if defined(__aarch64__) + namespace arm_conv { namespace depthwise { void a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); -struct a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst +struct a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy { - typedef uint32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - + using Parent = DepthfirstMultiplierStrategy; constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 4; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 8; - constexpr static unsigned int input_cols = 6; - constexpr static unsigned int input_col_quads = 1; + a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) + : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols) + { + } - kern_type kernel = a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl; + arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::None; } - a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp index 6b52017ce1..13f903b95d 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
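(In these multiplier-strategy hunks the deleted input_rows/input_cols constants are recoverable from the values now passed to the base constructor, via the usual relation input size = (output size - 1) * stride + kernel size. A compile-time check of that relation against the constants removed in the surrounding hunks, as an illustration only:)

constexpr unsigned int input_dim(unsigned int out, unsigned int stride, unsigned int kern)
{
    return (out - 1) * stride + kern;
}
// 5x5, stride 1, output 4x2: matches the removed input_rows = 8, input_cols = 6.
static_assert(input_dim(4, 1, 5) == 8 && input_dim(2, 1, 5) == 6, "5x5/s1 output 4x2");
// 3x3, stride 2, output 2x4: matches the removed input_rows = 5, input_cols = 9.
static_assert(input_dim(2, 2, 3) == 5 && input_dim(4, 2, 3) == 9, "3x3/s2 output 2x4");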
* * SPDX-License-Identifier: MIT * @@ -28,31 +28,25 @@ #pragma once +#if defined(__aarch64__) + namespace arm_conv { namespace depthwise { void a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&); -struct a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst +struct a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const uint8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - constexpr static unsigned int output_rows(void) { return 2; }; - constexpr static unsigned int output_cols(void) { return 8; }; - - constexpr static unsigned int output_col_regs(void) { return 2; }; - - kern_type kernel = a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; - - a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {} + using Parent = GenericDepthfirstMultiplierKernelStrategy; + a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) + : Parent(2, 8, arm_gemm::VLType::None) + { + } + Parent::KernelType kernel = a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..2d2b452630 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + +class a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy +{ + using Parent = DepthwiseDepthfirstStrategy; + + public: + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 1; + constexpr static unsigned int stride_cols = 1; + + a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} + + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } + + Parent::KernelType kernel = a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..2410d38512 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,1164 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
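(The kernel body that follows requantizes each int32 accumulator with a sqrdmulh/srshl pair before narrowing and clamping. A scalar model of that fixed-point sequence, written purely for illustration; it simplifies by omitting the sign-correction nudge the vector code applies with and/sshr/sqadd before the shift, and the saturation corner case of sqrdmulh at INT32_MIN * INT32_MIN:)

#include <algorithm>
#include <cstdint>

// Scalar model of the requantize epilogue (illustrative, not the generated code).
static std::uint8_t requantize(std::int32_t acc, std::int32_t mul, std::int32_t shift /* <= 0 */,
                               std::int32_t c_offset, std::int32_t minval, std::int32_t maxval)
{
    // sqrdmulh: rounding doubling multiply, keeping the high 32 bits.
    std::int64_t prod = (std::int64_t)acc * (std::int64_t)mul;
    std::int32_t high = (std::int32_t)((2 * prod + (INT64_C(1) << 31)) >> 32);
    // srshl by a negative amount acts as a rounding shift right.
    int s = -shift;
    std::int32_t shifted = (s > 0) ? (std::int32_t)(((std::int64_t)high + (INT64_C(1) << (s - 1))) >> s) : high;
    // Add the output offset, then clamp (the asm does this on 16-bit lanes with
    // sqadd/smax/smin) and narrow to u8 (uzp1).
    std::int32_t out = shifted + c_offset;
    return (std::uint8_t)std::min(std::max(out, minval), maxval);
}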
+ */ + +#include "arm_gemm.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const uint8_t *const *const inptrs, + const uint8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const void *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + uint8_t *const *const outptrs; + const uint8_t *inptrs[16]; + + Params( + long unsigned int n_channels, + const uint8_t *const *inptrs_raw, + const void *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[5]; + inptrs[1] = inptrs_raw[0]; + inptrs[2] = inptrs_raw[3]; + inptrs[3] = inptrs_raw[6]; + inptrs[4] = inptrs_raw[9]; + inptrs[5] = inptrs_raw[12]; + inptrs[6] = inptrs_raw[15]; + inptrs[7] = inptrs_raw[1]; + inptrs[8] = inptrs_raw[2]; + inptrs[9] = inptrs_raw[10]; + inptrs[10] = inptrs_raw[4]; + inptrs[11] = inptrs_raw[7]; + inptrs[12] = inptrs_raw[8]; + inptrs[13] = inptrs_raw[11]; + inptrs[14] = inptrs_raw[13]; + inptrs[15] = inptrs_raw[14]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x19, [%x[params], %[offsetof_Params_requant]]\n" + "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n" + "add x23, x19, %[offsetof_Requantize32_b_offset]\n" + "add x22, x19, %[offsetof_Requantize32_c_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x19, %[offsetof_Requantize32_minval]\n" + "add x19, x19, %[offsetof_Requantize32_maxval]\n" + "ldr x17, [%x[params], %[offsetof_Params_weights]]\n" + "ld1r { v15.16b }, [x23]\n" + "ld1r { v13.8h }, [x22]\n" + "lsr x16, x8, #0x3\n" + "mov x15, #0x0\n" + "ld1r { v11.8h }, [x20]\n" + "ld1r { v25.8h }, [x19]\n" + "mov x14, #0x0\n" + "add x13, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x12, [%x[params], %[offsetof_Params_requant_muls]]\n" + "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "ldp x10, x9, [x21, #0x0]\n" + "ldp x28, x27, [x21, #0x10]\n" + "cbz x16, 3f\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q12, [x19, #0x0]\n" + "subs x16, x16, #0x1\n" + "mov v14.16b, v12.16b\n" + "ldr q17, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v9.16b, v17.16b\n" + "mov v16.16b, v12.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v10.16b, v17.16b\n" + "mov v18.16b, v12.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v26.16b, v17.16b\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "ldp x23, x22, [x13, #0x0]\n" + "ldp x21, x20, [x13, #0x10]\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "ldr x19, [x13, #0x20]\n" +
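(The Params constructor above fixes the access order once at setup: it copies the caller's 16 input-row pointers into the embedded inptrs table in the order the assembly consumes them, e.g. inptrs[0] = inptrs_raw[5]. A generic sketch of that one-off permutation — a hypothetical helper, not library code:)

#include <array>
#include <cstddef>
#include <cstdint>

// Reorder row pointers once so the inner loop can read them with sequential
// offsets from a single base register, as the asm does via [x13, #0x...].
template <std::size_t N>
std::array<const std::uint8_t *, N> permute_ptrs(const std::uint8_t *const *raw,
                                                 const std::array<std::size_t, N> &order)
{
    std::array<const std::uint8_t *, N> out{};
    for (std::size_t i = 0; i < N; i++)
    {
        out[i] = raw[order[i]]; // e.g. order[0] == 5 mirrors inptrs[0] = inptrs_raw[5]
    }
    return out;
}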
"ldr d31, [x23, x15]\n" + "usubl v5.8h, v5.8b, v15.8b\n" + "usubl v6.8h, v6.8b, v15.8b\n" + "ldr d30, [x22, x15]\n" + "ldr d29, [x21, x15]\n" + "usubl v7.8h, v7.8b, v15.8b\n" + "usubl v8.8h, v8.8b, v15.8b\n" + "ldr d28, [x20, x15]\n" + "ldr d27, [x19, x15]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "ushll v30.8h, v30.8b, #0x0\n" + "ushll v29.8h, v29.8b, #0x0\n" + "ushll v28.8h, v28.8b, #0x0\n" + "ushll v27.8h, v27.8b, #0x0\n" + "beq 2f\n" + "1:" // Loop + "smlal v12.4s, v31.4h, v4.4h\n" + "smlal2 v17.4s, v31.8h, v4.8h\n" + "ldr x21, [x13, #0x28]\n" + "ldr x26, [x13, #0x38]\n" + "smlal v14.4s, v31.4h, v3.4h\n" + "smlal2 v9.4s, v31.8h, v3.8h\n" + "ldr x20, [x13, #0x30]\n" + "ldr x25, [x13, #0x40]\n" + "smlal v12.4s, v30.4h, v0.4h\n" + "smlal2 v17.4s, v30.8h, v0.8h\n" + "ldr x19, [x13, #0x48]\n" + "ldr d30, [x19, x15]\n" + "smlal v14.4s, v29.4h, v2.4h\n" + "smlal2 v9.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x15]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v16.4s, v31.4h, v1.4h\n" + "smlal2 v10.4s, v31.8h, v1.8h\n" + "ldr x24, [x13, #0x50]\n" + "ldr x23, [x13, #0x58]\n" + "smlal v18.4s, v31.4h, v0.4h\n" + "smlal2 v26.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x15]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v12.4s, v28.4h, v5.4h\n" + "smlal2 v17.4s, v28.8h, v5.8h\n" + "ushll v30.8h, v30.8b, #0x0\n" + "ldr x22, [x13, #0x60]\n" + "smlal v14.4s, v28.4h, v4.4h\n" + "smlal2 v9.4s, v28.8h, v4.8h\n" + "ldr x21, [x13, #0x68]\n" + "ldr x20, [x13, #0x70]\n" + "smlal v16.4s, v28.4h, v2.4h\n" + "smlal2 v10.4s, v28.8h, v2.8h\n" + "ldr x19, [x13, #0x78]\n" + "ldr q21, [x12, #0x0]\n" + "smlal v18.4s, v28.4h, v1.4h\n" + "smlal2 v26.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x15]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal v12.4s, v27.4h, v7.4h\n" + "smlal2 v17.4s, v27.8h, v7.8h\n" + "ldr q24, [x11, #0x0]\n" + "ldr q19, [x12, #0x10]\n" + "smlal v14.4s, v27.4h, v6.4h\n" + "smlal2 v9.4s, v27.8h, v6.8h\n" + "ldr q23, [x11, #0x10]\n" + "add x17, x17, #0x48\n" + "smlal v16.4s, v31.4h, v6.4h\n" + "smlal2 v10.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x15]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v18.4s, v27.4h, v3.4h\n" + "smlal2 v26.4s, v27.8h, v3.8h\n" + "subs x16, x16, #0x1\n" + "add x12, x12, #0x20\n" + "smlal v12.4s, v28.4h, v1.4h\n" + "smlal2 v17.4s, v28.8h, v1.8h\n" + "add x11, x11, #0x20\n" + "smlal v14.4s, v28.4h, v0.4h\n" + "smlal2 v9.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x15]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal v16.4s, v27.4h, v4.4h\n" + "smlal v18.4s, v29.4h, v8.4h\n" + "smlal2 v10.4s, v27.8h, v4.8h\n" + "smlal2 v26.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x15]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v12.4s, v31.4h, v2.4h\n" + "smlal2 v17.4s, v31.8h, v2.8h\n" + "smlal v14.4s, v31.4h, v1.4h\n" + "smlal2 v9.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x15]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v16.4s, v30.4h, v5.4h\n" + "smlal v18.4s, v30.4h, v4.4h\n" + "smlal v12.4s, v30.4h, v8.4h\n" + "smlal2 v17.4s, v30.8h, v8.8h\n" + "smlal v14.4s, v30.4h, v7.4h\n" + "smlal2 v9.4s, v30.8h, v7.8h\n" + "smlal2 v10.4s, v30.8h, v5.8h\n" + "smlal2 v26.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x15]\n" + "ushll v30.8h, v30.8b, #0x0\n" + "smlal v16.4s, v29.4h, v0.4h\n" + "smlal v18.4s, v28.4h, v2.4h\n" + "smlal v12.4s, v29.4h, v3.4h\n" + "smlal2 v17.4s, v29.8h, v3.8h\n" + "smlal2 v10.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, x15]\n" + "smlal2 v26.4s, v28.8h, v2.8h\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v16.4s, v31.4h, v3.4h\n" + "smlal v18.4s, v30.4h, v5.4h\n" + "smlal v14.4s, v28.4h, v5.4h\n" + "smlal2 v9.4s, v28.8h, 
v5.8h\n" + "ldr d28, [x19, x15]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal2 v10.4s, v31.8h, v3.8h\n" + "smlal2 v26.4s, v30.8h, v5.8h\n" + "add x15, x15, #0x8\n" + "smlal v16.4s, v29.4h, v7.4h\n" + "smlal v18.4s, v29.4h, v6.4h\n" + "smlal2 v10.4s, v29.8h, v7.8h\n" + "smlal2 v26.4s, v29.8h, v6.8h\n" + "smlal v12.4s, v31.4h, v6.4h\n" + "smlal v14.4s, v30.4h, v8.4h\n" + "sqrdmulh v12.4s, v12.4s, v21.4s\n" + "smlal v16.4s, v28.4h, v8.4h\n" + "smlal v18.4s, v28.4h, v7.4h\n" + "sqrdmulh v14.4s, v14.4s, v21.4s\n" + "smlal2 v17.4s, v31.8h, v6.8h\n" + "smlal2 v9.4s, v30.8h, v8.8h\n" + "sqrdmulh v16.4s, v16.4s, v21.4s\n" + "smlal2 v10.4s, v28.8h, v8.8h\n" + "smlal2 v26.4s, v28.8h, v7.8h\n" + "sqrdmulh v18.4s, v18.4s, v21.4s\n" + "and v29.16b, v12.16b, v24.16b\n" + "sqrdmulh v17.4s, v17.4s, v19.4s\n" + "and v22.16b, v14.16b, v24.16b\n" + "sqrdmulh v9.4s, v9.4s, v19.4s\n" + "and v21.16b, v16.16b, v24.16b\n" + "sqrdmulh v10.4s, v10.4s, v19.4s\n" + "and v20.16b, v18.16b, v24.16b\n" + "sqrdmulh v26.4s, v26.4s, v19.4s\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v19.16b, v17.16b, v23.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v30.16b, v9.16b, v23.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v3.16b, v10.16b, v23.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v28.16b, v26.16b, v23.16b\n" + "sqadd v12.4s, v12.4s, v29.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v22.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v21.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v20.4s\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "srshl v12.4s, v12.4s, v24.4s\n" + "sqadd v17.4s, v17.4s, v19.4s\n" + "srshl v14.4s, v14.4s, v24.4s\n" + "sqadd v9.4s, v9.4s, v30.4s\n" + "srshl v16.4s, v16.4s, v24.4s\n" + "sqadd v10.4s, v10.4s, v3.4s\n" + "srshl v18.4s, v18.4s, v24.4s\n" + "sqadd v26.4s, v26.4s, v28.4s\n" + "srshl v17.4s, v17.4s, v23.4s\n" + "sqxtn v12.4h, v12.4s\n" + "srshl v9.4s, v9.4s, v23.4s\n" + "sqxtn v14.4h, v14.4s\n" + "srshl v10.4s, v10.4s, v23.4s\n" + "sqxtn v16.4h, v16.4s\n" + "srshl v26.4s, v26.4s, v23.4s\n" + "sqxtn v18.4h, v18.4s\n" + "sqxtn2 v12.8h, v17.4s\n" + "sqxtn2 v14.8h, v9.4s\n" + "sqxtn2 v16.8h, v10.4s\n" + "sqxtn2 v18.8h, v26.4s\n" + "sqadd v12.8h, v12.8h, v13.8h\n" + "sqadd v14.8h, v14.8h, v13.8h\n" + "sqadd v16.8h, v16.8h, v13.8h\n" + "sqadd v18.8h, v18.8h, v13.8h\n" + "smax v12.8h, v12.8h, v11.8h\n" + "smax v14.8h, v14.8h, v11.8h\n" + "smax v16.8h, v16.8h, v11.8h\n" + "smax v18.8h, v18.8h, v11.8h\n" + "smin v12.8h, v12.8h, v25.8h\n" + "smin v14.8h, v14.8h, v25.8h\n" + "smin v16.8h, v16.8h, v25.8h\n" + "smin v18.8h, v18.8h, v25.8h\n" + "uzp1 v12.16b, v12.16b, v12.16b\n" + "uzp1 v14.16b, v14.16b, v14.16b\n" + "str d12, [x10, x14]\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "str d14, [x9, x14]\n" + "str d16, [x28, x14]\n" + "str d18, [x27, x14]\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q12, [x19, #0x0]\n" + "add x14, x14, #0x8\n" + "ldr q17, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v14.16b, v12.16b\n" + "mov v9.16b, v17.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v16.16b, v12.16b\n" + "mov v10.16b, v17.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v18.16b, v12.16b\n" + "mov v26.16b, v17.16b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "ldp x23, x22,
[x13, #0x0]\n" + "ldp x21, x20, [x13, #0x10]\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "ldr x19, [x13, #0x20]\n" + "ldr d31, [x23, x15]\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "usubl v5.8h, v5.8b, v15.8b\n" + "ldr d30, [x22, x15]\n" + "ldr d29, [x21, x15]\n" + "usubl v6.8h, v6.8b, v15.8b\n" + "usubl v7.8h, v7.8b, v15.8b\n" + "ldr d28, [x20, x15]\n" + "ldr d27, [x19, x15]\n" + "usubl v8.8h, v8.8b, v15.8b\n" + "ushll v31.8h, v31.8b, #0x0\n" + "ushll v30.8h, v30.8b, #0x0\n" + "ushll v29.8h, v29.8b, #0x0\n" + "ushll v28.8h, v28.8b, #0x0\n" + "ushll v27.8h, v27.8b, #0x0\n" + "bgt 1b\n" + "2:" // Tail + "smlal v12.4s, v31.4h, v4.4h\n" + "smlal2 v17.4s, v31.8h, v4.8h\n" + "ldr x21, [x13, #0x28]\n" + "ldr x26, [x13, #0x38]\n" + "smlal v14.4s, v31.4h, v3.4h\n" + "smlal2 v9.4s, v31.8h, v3.8h\n" + "ldr x20, [x13, #0x30]\n" + "ldr x25, [x13, #0x40]\n" + "smlal v12.4s, v30.4h, v0.4h\n" + "smlal2 v17.4s, v30.8h, v0.8h\n" + "ldr x19, [x13, #0x48]\n" + "ldr d30, [x19, x15]\n" + "smlal v14.4s, v29.4h, v2.4h\n" + "smlal2 v9.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x15]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v16.4s, v31.4h, v1.4h\n" + "smlal2 v10.4s, v31.8h, v1.8h\n" + "ldr x24, [x13, #0x50]\n" + "ldr x23, [x13, #0x58]\n" + "smlal v18.4s, v31.4h, v0.4h\n" + "smlal2 v26.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x15]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v12.4s, v28.4h, v5.4h\n" + "smlal2 v17.4s, v28.8h, v5.8h\n" + "ushll v30.8h, v30.8b, #0x0\n" + "ldr x22, [x13, #0x60]\n" + "smlal v14.4s, v28.4h, v4.4h\n" + "smlal2 v9.4s, v28.8h, v4.8h\n" + "ldr x21, [x13, #0x68]\n" + "ldr x20, [x13, #0x70]\n" + "smlal v16.4s, v28.4h, v2.4h\n" + "smlal2 v10.4s, v28.8h, v2.8h\n" + "ldr x19, [x13, #0x78]\n" + "ldr q21, [x12, #0x0]\n" + "smlal v18.4s, v28.4h, v1.4h\n" + "smlal2 v26.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x15]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal v12.4s, v27.4h, v7.4h\n" + "smlal2 v17.4s, v27.8h, v7.8h\n" + "ldr q24, [x11, #0x0]\n" + "ldr q19, [x12, #0x10]\n" + "smlal v14.4s, v27.4h, v6.4h\n" + "smlal2 v9.4s, v27.8h, v6.8h\n" + "ldr q23, [x11, #0x10]\n" + "tst x8, #0x7\n" + "smlal v16.4s, v31.4h, v6.4h\n" + "smlal2 v10.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x15]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v18.4s, v27.4h, v3.4h\n" + "smlal2 v26.4s, v27.8h, v3.8h\n" + "add x12, x12, #0x20\n" + "add x11, x11, #0x20\n" + "smlal v12.4s, v28.4h, v1.4h\n" + "smlal2 v17.4s, v28.8h, v1.8h\n" + "smlal v14.4s, v28.4h, v0.4h\n" + "smlal2 v9.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x15]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal v16.4s, v27.4h, v4.4h\n" + "smlal v18.4s, v29.4h, v8.4h\n" + "smlal2 v10.4s, v27.8h, v4.8h\n" + "smlal2 v26.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x15]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v12.4s, v31.4h, v2.4h\n" + "smlal2 v17.4s, v31.8h, v2.8h\n" + "smlal v14.4s, v31.4h, v1.4h\n" + "smlal2 v9.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x15]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v16.4s, v30.4h, v5.4h\n" + "smlal v18.4s, v30.4h, v4.4h\n" + "smlal v12.4s, v30.4h, v8.4h\n" + "smlal2 v17.4s, v30.8h, v8.8h\n" + "smlal v14.4s, v30.4h, v7.4h\n" + "smlal2 v9.4s, v30.8h, v7.8h\n" + "smlal2 v10.4s, v30.8h, v5.8h\n" + "smlal2 v26.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x15]\n" + "ushll v30.8h, v30.8b, #0x0\n" + "smlal v16.4s, v29.4h, v0.4h\n" + "smlal v18.4s, v28.4h, v2.4h\n" + "smlal v12.4s, v29.4h, v3.4h\n" + "smlal2 v17.4s, v29.8h, v3.8h\n" + "smlal2 v10.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, x15]\n" + "smlal2 v26.4s, v28.8h, v2.8h\n" + "ushll v29.8h, 
v29.8b, #0x0\n" + "smlal v16.4s, v31.4h, v3.4h\n" + "smlal v18.4s, v30.4h, v5.4h\n" + "smlal v14.4s, v28.4h, v5.4h\n" + "smlal2 v9.4s, v28.8h, v5.8h\n" + "ldr d28, [x19, x15]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal2 v10.4s, v31.8h, v3.8h\n" + "smlal2 v26.4s, v30.8h, v5.8h\n" + "add x15, x15, #0x8\n" + "smlal v16.4s, v29.4h, v7.4h\n" + "smlal v18.4s, v29.4h, v6.4h\n" + "smlal2 v10.4s, v29.8h, v7.8h\n" + "smlal2 v26.4s, v29.8h, v6.8h\n" + "smlal v12.4s, v31.4h, v6.4h\n" + "smlal v14.4s, v30.4h, v8.4h\n" + "sqrdmulh v12.4s, v12.4s, v21.4s\n" + "smlal v16.4s, v28.4h, v8.4h\n" + "smlal v18.4s, v28.4h, v7.4h\n" + "sqrdmulh v14.4s, v14.4s, v21.4s\n" + "smlal2 v17.4s, v31.8h, v6.8h\n" + "smlal2 v9.4s, v30.8h, v8.8h\n" + "sqrdmulh v16.4s, v16.4s, v21.4s\n" + "smlal2 v10.4s, v28.8h, v8.8h\n" + "smlal2 v26.4s, v28.8h, v7.8h\n" + "sqrdmulh v18.4s, v18.4s, v21.4s\n" + "and v29.16b, v12.16b, v24.16b\n" + "sqrdmulh v17.4s, v17.4s, v19.4s\n" + "and v22.16b, v14.16b, v24.16b\n" + "sqrdmulh v9.4s, v9.4s, v19.4s\n" + "and v21.16b, v16.16b, v24.16b\n" + "sqrdmulh v10.4s, v10.4s, v19.4s\n" + "and v20.16b, v18.16b, v24.16b\n" + "sqrdmulh v26.4s, v26.4s, v19.4s\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v19.16b, v17.16b, v23.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v30.16b, v9.16b, v23.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v3.16b, v10.16b, v23.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v28.16b, v26.16b, v23.16b\n" + "sqadd v12.4s, v12.4s, v29.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v22.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v21.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v20.4s\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "srshl v12.4s, v12.4s, v24.4s\n" + "sqadd v17.4s, v17.4s, v19.4s\n" + "srshl v14.4s, v14.4s, v24.4s\n" + "sqadd v9.4s, v9.4s, v30.4s\n" + "srshl v16.4s, v16.4s, v24.4s\n" + "sqadd v10.4s, v10.4s, v3.4s\n" + "srshl v18.4s, v18.4s, v24.4s\n" + "sqadd v26.4s, v26.4s, v28.4s\n" + "srshl v17.4s, v17.4s, v23.4s\n" + "sqxtn v12.4h, v12.4s\n" + "srshl v9.4s, v9.4s, v23.4s\n" + "sqxtn v14.4h, v14.4s\n" + "srshl v10.4s, v10.4s, v23.4s\n" + "sqxtn v16.4h, v16.4s\n" + "srshl v26.4s, v26.4s, v23.4s\n" + "sqxtn v18.4h, v18.4s\n" + "sqxtn2 v12.8h, v17.4s\n" + "sqxtn2 v14.8h, v9.4s\n" + "sqxtn2 v16.8h, v10.4s\n" + "sqxtn2 v18.8h, v26.4s\n" + "sqadd v12.8h, v12.8h, v13.8h\n" + "sqadd v14.8h, v14.8h, v13.8h\n" + "sqadd v16.8h, v16.8h, v13.8h\n" + "sqadd v18.8h, v18.8h, v13.8h\n" + "smax v12.8h, v12.8h, v11.8h\n" + "smax v14.8h, v14.8h, v11.8h\n" + "smax v16.8h, v16.8h, v11.8h\n" + "smax v18.8h, v18.8h, v11.8h\n" + "smin v12.8h, v12.8h, v25.8h\n" + "smin v14.8h, v14.8h, v25.8h\n" + "smin v16.8h, v16.8h, v25.8h\n" + "smin v18.8h, v18.8h, v25.8h\n" + "uzp1 v12.16b, v12.16b, v12.16b\n" + "uzp1 v14.16b, v14.16b, v14.16b\n" + "str d12, [x10, x14]\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "str d14, [x9, x14]\n" + "str d16, [x28, x14]\n" + "str d18, [x27, x14]\n" + "add x14, x14, #0x8\n" + "beq 64f\n" + "add x17, x17, #0x48\n" + "3:" // Oddments + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x8, #2, 5f\n" + "ld1 { v12.4s }, [x19], #0x10\n" + "tbz x8, #1, 4f\n" + "ld1 { v17.d }[0], [x19], #0x8\n" + "tbz x8, #0, 7f\n" + "ld1 { v17.s }[2], [x19]\n" + "b 7f\n" + "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz x8, #0, 7f\n" + "ld1 { v17.s }[0], [x19]\n" + "b 7f\n" + "5:" // Oddments: Load bias: Bit 2: Unset + "tbz x8, #1, 6f\n" + "ld1 { v12.d }[0], [x19], #0x8\n" + "tbz x8, #0, 7f\n" + "ld1 {
v12.s }[2], [x19]\n" + "b 7f\n" + "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 7f\n" + "ld1 { v12.s }[0], [x19]\n" + "7:" // Oddments: Load bias: Bit 2: End + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "mov v14.16b, v12.16b\n" + "mov v9.16b, v17.16b\n" + "ldr d2, [x17, #0x10]\n" + "ldr d3, [x17, #0x18]\n" + "mov v16.16b, v12.16b\n" + "mov v10.16b, v17.16b\n" + "ldr d4, [x17, #0x20]\n" + "ldr d5, [x17, #0x28]\n" + "mov v18.16b, v12.16b\n" + "mov v26.16b, v17.16b\n" + "ldr d6, [x17, #0x30]\n" + "ldr d7, [x17, #0x38]\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "ldr d8, [x17, #0x40]\n" + "ldp x23, x22, [x13, #0x0]\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "ldp x21, x20, [x13, #0x10]\n" + "ldr x19, [x13, #0x20]\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "usubl v5.8h, v5.8b, v15.8b\n" + "usubl v6.8h, v6.8b, v15.8b\n" + "usubl v7.8h, v7.8b, v15.8b\n" + "usubl v8.8h, v8.8b, v15.8b\n" + "add x23, x23, x15\n" + "add x22, x22, x15\n" + "add x21, x21, x15\n" + "add x20, x20, x15\n" + "add x19, x19, x15\n" + "tbz x8, #2, 9f\n" + "ld1 { v31.s }[0], [x23], #0x4\n" + "ld1 { v30.s }[0], [x22], #0x4\n" + "ld1 { v29.s }[0], [x21], #0x4\n" + "ld1 { v28.s }[0], [x20], #0x4\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x8, #1, 8f\n" + "ld1 { v31.h }[2], [x23], #0x2\n" + "ld1 { v30.h }[2], [x22], #0x2\n" + "ld1 { v29.h }[2], [x21], #0x2\n" + "ld1 { v28.h }[2], [x20], #0x2\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[6], [x23]\n" + "ld1 { v30.b }[6], [x22]\n" + "ld1 { v29.b }[6], [x21]\n" + "ld1 { v28.b }[6], [x20]\n" + "ld1 { v27.b }[6], [x19]\n" + "b 11f\n" + "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[4], [x23]\n" + "ld1 { v30.b }[4], [x22]\n" + "ld1 { v29.b }[4], [x21]\n" + "ld1 { v28.b }[4], [x20]\n" + "ld1 { v27.b }[4], [x19]\n" + "b 11f\n" + "9:" // Oddments: Initial loads: Bit 2: Unset + "tbz x8, #1, 10f\n" + "ld1 { v31.h }[0], [x23], #0x2\n" + "ld1 { v30.h }[0], [x22], #0x2\n" + "ld1 { v29.h }[0], [x21], #0x2\n" + "ld1 { v28.h }[0], [x20], #0x2\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[2], [x23]\n" + "ld1 { v30.b }[2], [x22]\n" + "ld1 { v29.b }[2], [x21]\n" + "ld1 { v28.b }[2], [x20]\n" + "ld1 { v27.b }[2], [x19]\n" + "b 11f\n" + "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[0], [x23]\n" + "ld1 { v30.b }[0], [x22]\n" + "ld1 { v29.b }[0], [x21]\n" + "ld1 { v28.b }[0], [x20]\n" + "ld1 { v27.b }[0], [x19]\n" + "11:" // Oddments: Initial loads: Bit 2: End + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v12.4s, v31.4h, v4.4h\n" + "smlal2 v17.4s, v31.8h, v4.8h\n" + "ldr x21, [x13, #0x28]\n" + "smlal v14.4s, v31.4h, v3.4h\n" + "smlal2 v9.4s, v31.8h, v3.8h\n" + "ushll v30.8h, v30.8b, #0x0\n" + "add x21, x21, x15\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v16.4s, v31.4h, v1.4h\n" + "smlal2 v10.4s, v31.8h, v1.8h\n" + "smlal v18.4s, v31.4h, v0.4h\n" + "smlal2 v26.4s, v31.8h, v0.8h\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal v12.4s, v30.4h, v0.4h\n" + "smlal2 v17.4s, v30.8h, v0.8h\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v14.4s, v29.4h, v2.4h\n" + "smlal2 v9.4s, v29.8h, v2.8h\n" + "smlal v12.4s, v28.4h, v5.4h\n" + "smlal2 v17.4s, v28.8h, v5.8h\n" + "smlal v14.4s, v28.4h, v4.4h\n" + "smlal2 v9.4s, v28.8h, v4.8h\n" + "smlal v16.4s, v28.4h, v2.4h\n" + "smlal2 v10.4s, v28.8h, v2.8h\n" + "smlal v18.4s, v28.4h, v1.4h\n" + "smlal2 v26.4s, v28.8h, v1.8h\n" + "tbz x8, #2, 
13f\n" + "ld1 { v31.s }[0], [x21], #0x4\n" + "tbz x8, #1, 12f\n" + "ld1 { v31.h }[2], [x21], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[6], [x21]\n" + "b 15f\n" + "12:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[4], [x21]\n" + "b 15f\n" + "13:" // Oddments: Load (3, 0): Bit 2: Unset + "tbz x8, #1, 14f\n" + "ld1 { v31.h }[0], [x21], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[2], [x21]\n" + "b 15f\n" + "14:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 15f\n" + "ld1 { v31.b }[0], [x21]\n" + "15:" // Oddments: Load (3, 0): Bit 2: End + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v16.4s, v31.4h, v6.4h\n" + "smlal2 v10.4s, v31.8h, v6.8h\n" + "ldr x20, [x13, #0x30]\n" + "smlal v12.4s, v27.4h, v7.4h\n" + "smlal2 v17.4s, v27.8h, v7.8h\n" + "add x20, x20, x15\n" + "smlal v14.4s, v27.4h, v6.4h\n" + "smlal2 v9.4s, v27.8h, v6.8h\n" + "smlal v16.4s, v27.4h, v4.4h\n" + "smlal2 v10.4s, v27.8h, v4.8h\n" + "smlal v18.4s, v27.4h, v3.4h\n" + "smlal2 v26.4s, v27.8h, v3.8h\n" + "tbz x8, #2, 17f\n" + "ld1 { v29.s }[0], [x20], #0x4\n" + "tbz x8, #1, 16f\n" + "ld1 { v29.h }[2], [x20], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[6], [x20]\n" + "b 19f\n" + "16:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[4], [x20]\n" + "b 19f\n" + "17:" // Oddments: Load (3, 3): Bit 2: Unset + "tbz x8, #1, 18f\n" + "ld1 { v29.h }[0], [x20], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[2], [x20]\n" + "b 19f\n" + "18:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 19f\n" + "ld1 { v29.b }[0], [x20]\n" + "19:" // Oddments: Load (3, 3): Bit 2: End + "ushll v29.8h, v29.8b, #0x0\n" + "ldr x26, [x13, #0x38]\n" + "smlal v18.4s, v29.4h, v8.4h\n" + "smlal2 v26.4s, v29.8h, v8.8h\n" + "add x26, x26, x15\n" + "tbz x8, #2, 21f\n" + "ld1 { v28.s }[0], [x26], #0x4\n" + "tbz x8, #1, 20f\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[6], [x26]\n" + "b 23f\n" + "20:" // Oddments: Load (0, 1): Bit 2: Bit 1: Unset + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[4], [x26]\n" + "b 23f\n" + "21:" // Oddments: Load (0, 1): Bit 2: Unset + "tbz x8, #1, 22f\n" + "ld1 { v28.h }[0], [x26], #0x2\n" + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[2], [x26]\n" + "b 23f\n" + "22:" // Oddments: Load (0, 1): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 23f\n" + "ld1 { v28.b }[0], [x26]\n" + "23:" // Oddments: Load (0, 1): Bit 2: End + "ushll v28.8h, v28.8b, #0x0\n" + "ldr x25, [x13, #0x40]\n" + "smlal v12.4s, v28.4h, v1.4h\n" + "smlal2 v17.4s, v28.8h, v1.8h\n" + "smlal v14.4s, v28.4h, v0.4h\n" + "smlal2 v9.4s, v28.8h, v0.8h\n" + "add x25, x25, x15\n" + "tbz x8, #2, 25f\n" + "ld1 { v31.s }[0], [x25], #0x4\n" + "tbz x8, #1, 24f\n" + "ld1 { v31.h }[2], [x25], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[6], [x25]\n" + "b 27f\n" + "24:" // Oddments: Load (0, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[4], [x25]\n" + "b 27f\n" + "25:" // Oddments: Load (0, 2): Bit 2: Unset + "tbz x8, #1, 26f\n" + "ld1 { v31.h }[0], [x25], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[2], [x25]\n" + "b 27f\n" + "26:" // Oddments: Load (0, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 27f\n" + "ld1 { v31.b }[0], [x25]\n" + "27:" // Oddments: Load (0, 2): Bit 2: End + "ushll v31.8h, v31.8b, #0x0\n" + "ldr x19, [x13, #0x48]\n" + "smlal v12.4s, v31.4h, v2.4h\n" + "smlal2 v17.4s, v31.8h, v2.8h\n" + "smlal v14.4s, v31.4h, v1.4h\n" + "smlal2 v9.4s, v31.8h, v1.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 29f\n" + "ld1 { v30.s }[0], 
[x19], #0x4\n" + "tbz x8, #1, 28f\n" + "ld1 { v30.h }[2], [x19], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[6], [x19]\n" + "b 31f\n" + "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[4], [x19]\n" + "b 31f\n" + "29:" // Oddments: Load (2, 2): Bit 2: Unset + "tbz x8, #1, 30f\n" + "ld1 { v30.h }[0], [x19], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[2], [x19]\n" + "b 31f\n" + "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 31f\n" + "ld1 { v30.b }[0], [x19]\n" + "31:" // Oddments: Load (2, 2): Bit 2: End + "ushll v30.8h, v30.8b, #0x0\n" + "ldr x24, [x13, #0x50]\n" + "smlal v12.4s, v30.4h, v8.4h\n" + "smlal2 v17.4s, v30.8h, v8.8h\n" + "smlal v14.4s, v30.4h, v7.4h\n" + "smlal2 v9.4s, v30.8h, v7.8h\n" + "add x24, x24, x15\n" + "smlal v16.4s, v30.4h, v5.4h\n" + "smlal2 v10.4s, v30.8h, v5.8h\n" + "smlal v18.4s, v30.4h, v4.4h\n" + "smlal2 v26.4s, v30.8h, v4.8h\n" + "tbz x8, #2, 33f\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "tbz x8, #1, 32f\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[6], [x24]\n" + "b 35f\n" + "32:" // Oddments: Load (1, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[4], [x24]\n" + "b 35f\n" + "33:" // Oddments: Load (1, 0): Bit 2: Unset + "tbz x8, #1, 34f\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[2], [x24]\n" + "b 35f\n" + "34:" // Oddments: Load (1, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[0], [x24]\n" + "35:" // Oddments: Load (1, 0): Bit 2: End + "ushll v29.8h, v29.8b, #0x0\n" + "ldr x23, [x13, #0x58]\n" + "smlal v12.4s, v29.4h, v3.4h\n" + "smlal2 v17.4s, v29.8h, v3.8h\n" + "smlal v16.4s, v29.4h, v0.4h\n" + "smlal2 v10.4s, v29.8h, v0.8h\n" + "add x23, x23, x15\n" + "tbz x8, #2, 37f\n" + "ld1 { v28.s }[0], [x23], #0x4\n" + "tbz x8, #1, 36f\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[6], [x23]\n" + "b 39f\n" + "36:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[4], [x23]\n" + "b 39f\n" + "37:" // Oddments: Load (1, 3): Bit 2: Unset + "tbz x8, #1, 38f\n" + "ld1 { v28.h }[0], [x23], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[2], [x23]\n" + "b 39f\n" + "38:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 39f\n" + "ld1 { v28.b }[0], [x23]\n" + "39:" // Oddments: Load (1, 3): Bit 2: End + "ushll v28.8h, v28.8b, #0x0\n" + "ldr x22, [x13, #0x60]\n" + "smlal v14.4s, v28.4h, v5.4h\n" + "smlal2 v9.4s, v28.8h, v5.8h\n" + "smlal v18.4s, v28.4h, v2.4h\n" + "smlal2 v26.4s, v28.8h, v2.8h\n" + "add x22, x22, x15\n" + "tbz x8, #2, 41f\n" + "ld1 { v31.s }[0], [x22], #0x4\n" + "tbz x8, #1, 40f\n" + "ld1 { v31.h }[2], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[6], [x22]\n" + "b 43f\n" + "40:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[4], [x22]\n" + "b 43f\n" + "41:" // Oddments: Load (2, 0): Bit 2: Unset + "tbz x8, #1, 42f\n" + "ld1 { v31.h }[0], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[2], [x22]\n" + "b 43f\n" + "42:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 43f\n" + "ld1 { v31.b }[0], [x22]\n" + "43:" // Oddments: Load (2, 0): Bit 2: End + "ushll v31.8h, v31.8b, #0x0\n" + "ldr x21, [x13, #0x68]\n" + "smlal v12.4s, v31.4h, v6.4h\n" + "smlal2 v17.4s, v31.8h, v6.8h\n" + "smlal v16.4s, v31.4h, v3.4h\n" + "smlal2 v10.4s, v31.8h, v3.8h\n" + "add x21, x21, x15\n" + "tbz x8, #2, 45f\n" + "ld1 { v30.s }[0], [x21], #0x4\n" + "tbz x8, 
#1, 44f\n" + "ld1 { v30.h }[2], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[6], [x21]\n" + "b 47f\n" + "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[4], [x21]\n" + "b 47f\n" + "45:" // Oddments: Load (2, 3): Bit 2: Unset + "tbz x8, #1, 46f\n" + "ld1 { v30.h }[0], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[2], [x21]\n" + "b 47f\n" + "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 47f\n" + "ld1 { v30.b }[0], [x21]\n" + "47:" // Oddments: Load (2, 3): Bit 2: End + "ushll v30.8h, v30.8b, #0x0\n" + "ldr x20, [x13, #0x70]\n" + "smlal v14.4s, v30.4h, v8.4h\n" + "smlal2 v9.4s, v30.8h, v8.8h\n" + "smlal v18.4s, v30.4h, v5.4h\n" + "smlal2 v26.4s, v30.8h, v5.8h\n" + "add x20, x20, x15\n" + "tbz x8, #2, 49f\n" + "ld1 { v29.s }[0], [x20], #0x4\n" + "tbz x8, #1, 48f\n" + "ld1 { v29.h }[2], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[6], [x20]\n" + "b 51f\n" + "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[4], [x20]\n" + "b 51f\n" + "49:" // Oddments: Load (3, 1): Bit 2: Unset + "tbz x8, #1, 50f\n" + "ld1 { v29.h }[0], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[2], [x20]\n" + "b 51f\n" + "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 51f\n" + "ld1 { v29.b }[0], [x20]\n" + "51:" // Oddments: Load (3, 1): Bit 2: End + "ushll v29.8h, v29.8b, #0x0\n" + "ldr x19, [x13, #0x78]\n" + "smlal v16.4s, v29.4h, v7.4h\n" + "smlal2 v10.4s, v29.8h, v7.8h\n" + "smlal v18.4s, v29.4h, v6.4h\n" + "smlal2 v26.4s, v29.8h, v6.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 53f\n" + "ld1 { v28.s }[0], [x19], #0x4\n" + "tbz x8, #1, 52f\n" + "ld1 { v28.h }[2], [x19], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[6], [x19]\n" + "b 55f\n" + "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[4], [x19]\n" + "b 55f\n" + "53:" // Oddments: Load (3, 2): Bit 2: Unset + "tbz x8, #1, 54f\n" + "ld1 { v28.h }[0], [x19], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[2], [x19]\n" + "b 55f\n" + "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 55f\n" + "ld1 { v28.b }[0], [x19]\n" + "55:" // Oddments: Load (3, 2): Bit 2: End + "ushll v28.8h, v28.8b, #0x0\n" + "smlal v16.4s, v28.4h, v8.4h\n" + "smlal2 v10.4s, v28.8h, v8.8h\n" + "smlal v18.4s, v28.4h, v7.4h\n" + "smlal2 v26.4s, v28.8h, v7.8h\n" + "tbz x8, #2, 57f\n" + "ld1 { v21.4s }, [x12], #0x10\n" + "ld1 { v24.4s }, [x11], #0x10\n" + "tbz x8, #1, 56f\n" + "ld1 { v19.d }[0], [x12], #0x8\n" + "ld1 { v23.d }[0], [x11], #0x8\n" + "tbz x8, #0, 59f\n" + "ld1 { v19.s }[2], [x12]\n" + "ld1 { v23.s }[2], [x11]\n" + "b 59f\n" + "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset + "tbz x8, #0, 59f\n" + "ld1 { v19.s }[0], [x12]\n" + "ld1 { v23.s }[0], [x11]\n" + "b 59f\n" + "57:" // Oddments: Load requant params: Bit 2: Unset + "tbz x8, #1, 58f\n" + "ld1 { v21.d }[0], [x12], #0x8\n" + "ld1 { v24.d }[0], [x11], #0x8\n" + "tbz x8, #0, 59f\n" + "ld1 { v21.s }[2], [x12]\n" + "ld1 { v24.s }[2], [x11]\n" + "b 59f\n" + "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 59f\n" + "ld1 { v21.s }[0], [x12]\n" + "ld1 { v24.s }[0], [x11]\n" + "59:" // Oddments: Load requant params: Bit 2: End + "sqdmulh v12.4s, v12.4s, v21.4s\n" + "sqdmulh v14.4s, v14.4s, v21.4s\n" + "add x10, x10, x14\n" + "add x9, x9, x14\n" + "sqdmulh v16.4s, v16.4s, v21.4s\n" + "sqdmulh v18.4s, v18.4s, v21.4s\n" + "add x28, x28, x14\n" + "add x27, x27, x14\n" + "and 
v29.16b, v12.16b, v24.16b\n" + "sqdmulh v17.4s, v17.4s, v19.4s\n" + "and v22.16b, v14.16b, v24.16b\n" + "sqdmulh v9.4s, v9.4s, v19.4s\n" + "and v21.16b, v16.16b, v24.16b\n" + "sqdmulh v10.4s, v10.4s, v19.4s\n" + "and v20.16b, v18.16b, v24.16b\n" + "sqdmulh v26.4s, v26.4s, v19.4s\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v19.16b, v17.16b, v23.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v30.16b, v9.16b, v23.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v3.16b, v10.16b, v23.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v28.16b, v26.16b, v23.16b\n" + "sqadd v12.4s, v12.4s, v29.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v22.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "sqadd v16.4s, v16.4s, v21.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v20.4s\n" + "sshr v28.4s, v28.4s, #0x1f\n" + "srshl v12.4s, v12.4s, v24.4s\n" + "sqadd v17.4s, v17.4s, v19.4s\n" + "srshl v14.4s, v14.4s, v24.4s\n" + "sqadd v9.4s, v9.4s, v30.4s\n" + "srshl v16.4s, v16.4s, v24.4s\n" + "sqadd v10.4s, v10.4s, v3.4s\n" + "srshl v18.4s, v18.4s, v24.4s\n" + "sqadd v26.4s, v26.4s, v28.4s\n" + "srshl v17.4s, v17.4s, v23.4s\n" + "sqxtn v12.4h, v12.4s\n" + "srshl v9.4s, v9.4s, v23.4s\n" + "sqxtn v14.4h, v14.4s\n" + "srshl v10.4s, v10.4s, v23.4s\n" + "sqxtn v16.4h, v16.4s\n" + "srshl v26.4s, v26.4s, v23.4s\n" + "sqxtn v18.4h, v18.4s\n" + "sqxtn2 v12.8h, v17.4s\n" + "sqxtn2 v14.8h, v9.4s\n" + "sqxtn2 v16.8h, v10.4s\n" + "sqxtn2 v18.8h, v26.4s\n" + "sqadd v12.8h, v12.8h, v13.8h\n" + "sqadd v14.8h, v14.8h, v13.8h\n" + "sqadd v16.8h, v16.8h, v13.8h\n" + "sqadd v18.8h, v18.8h, v13.8h\n" + "smax v12.8h, v12.8h, v11.8h\n" + "smax v14.8h, v14.8h, v11.8h\n" + "smax v16.8h, v16.8h, v11.8h\n" + "smax v18.8h, v18.8h, v11.8h\n" + "smin v12.8h, v12.8h, v25.8h\n" + "smin v14.8h, v14.8h, v25.8h\n" + "smin v16.8h, v16.8h, v25.8h\n" + "smin v18.8h, v18.8h, v25.8h\n" + "uzp1 v12.16b, v12.16b, v12.16b\n" + "uzp1 v14.16b, v14.16b, v14.16b\n" + "uzp1 v16.16b, v16.16b, v16.16b\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "tbz x8, #2, 61f\n" + "st1 { v12.s }[0], [x10], #0x4\n" + "st1 { v14.s }[0], [x9], #0x4\n" + "st1 { v16.s }[0], [x28], #0x4\n" + "st1 { v18.s }[0], [x27], #0x4\n" + "tbz x8, #1, 60f\n" + "st1 { v12.h }[2], [x10], #0x2\n" + "st1 { v14.h }[2], [x9], #0x2\n" + "st1 { v16.h }[2], [x28], #0x2\n" + "st1 { v18.h }[2], [x27], #0x2\n" + "tbz x8, #0, 63f\n" + "st1 { v12.b }[6], [x10], #0x1\n" + "st1 { v14.b }[6], [x9], #0x1\n" + "st1 { v16.b }[6], [x28], #0x1\n" + "st1 { v18.b }[6], [x27], #0x1\n" + "b 63f\n" + "60:" // Oddments: Bit 2: Bit 1: Unset + "tbz x8, #0, 63f\n" + "st1 { v12.b }[4], [x10], #0x1\n" + "st1 { v14.b }[4], [x9], #0x1\n" + "st1 { v16.b }[4], [x28], #0x1\n" + "st1 { v18.b }[4], [x27], #0x1\n" + "b 63f\n" + "61:" // Oddments: Bit 2: Unset + "tbz x8, #1, 62f\n" + "st1 { v12.h }[0], [x10], #0x2\n" + "st1 { v14.h }[0], [x9], #0x2\n" + "st1 { v16.h }[0], [x28], #0x2\n" + "st1 { v18.h }[0], [x27], #0x2\n" + "tbz x8, #0, 63f\n" + "st1 { v12.b }[2], [x10], #0x1\n" + "st1 { v14.b }[2], [x9], #0x1\n" + "st1 { v16.b }[2], [x28], #0x1\n" + "st1 { v18.b }[2], [x27], #0x1\n" + "b 63f\n" + "62:" // Oddments: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 63f\n" + "st1 { v12.b }[0], [x10], #0x1\n" + "st1 { v14.b }[0], [x9], #0x1\n" + "st1 { v16.b }[0], [x28], #0x1\n" + "st1 { v18.b }[0], [x27], #0x1\n" + "63:" // Oddments: Bit 2: End + "64:" // End + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, 
n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp new file mode 100644 index 0000000000..b479dbf7a4 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include "src/core/NEON/kernels/arm_gemm/utils.hpp" + +#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp" + +#include <cstdint> + +#pragma once + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); + +class a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t> +{ + using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>; + + public: + constexpr static unsigned int kernel_rows = 3; + constexpr static unsigned int kernel_cols = 3; + + constexpr static unsigned int stride_rows = 2; + constexpr static unsigned int stride_cols = 2; + + a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {} + + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } + + Parent::KernelType kernel = a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } +}; + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp new file mode 100644 index 0000000000..49f69c4204 --- /dev/null +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp @@ -0,0 +1,1395 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ + +#include "arm_gemm.hpp" + +#include <cstddef> +#include <cstdint> + +#if defined(__aarch64__) + +namespace arm_conv { +namespace depthwise { + +void a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( + const unsigned int n_channels, + const uint8_t *const *const inptrs, + const uint8_t *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *const outptrs +) +{ + struct Params + { + long unsigned int n_channels; + const void *weights; + const int32_t *bias; + const arm_gemm::Requantize32 *requant; + const int32_t *const requant_muls; + const int32_t *const requant_shifts; + uint8_t *const *const outptrs; + const uint8_t *inptrs[25]; + + Params( + long unsigned int n_channels, + const uint8_t *const *inptrs_raw, + const void *const weights, + const int32_t *const bias, + const arm_gemm::Requantize32 &qp, + const int32_t *const requant_muls, + const int32_t *const requant_shifts, + uint8_t *const *outptrs + ) : n_channels(n_channels), weights(weights), bias(bias), + requant(&qp), requant_muls(requant_muls), + requant_shifts(requant_shifts), outptrs(outptrs) + { + inptrs[0] = inptrs_raw[12]; + inptrs[1] = inptrs_raw[0]; + inptrs[2] = inptrs_raw[1]; + inptrs[3] = inptrs_raw[3]; + inptrs[4] = inptrs_raw[4]; + inptrs[5] = inptrs_raw[5]; + inptrs[6] = inptrs_raw[6]; + inptrs[7] = inptrs_raw[2]; + inptrs[8] = inptrs_raw[8]; + inptrs[9] = inptrs_raw[9]; + inptrs[10] = inptrs_raw[7]; + inptrs[11] = inptrs_raw[15]; + inptrs[12] = inptrs_raw[10]; + inptrs[13] = inptrs_raw[16]; + inptrs[14] = inptrs_raw[11]; + inptrs[15] = inptrs_raw[18]; + inptrs[16] = inptrs_raw[13]; + inptrs[17] = inptrs_raw[19]; + inptrs[18] = inptrs_raw[20]; + inptrs[19] = inptrs_raw[14]; + inptrs[20] = inptrs_raw[21]; + inptrs[21] = inptrs_raw[17]; + inptrs[22] = inptrs_raw[23]; + inptrs[23] = inptrs_raw[22]; + inptrs[24] = inptrs_raw[24]; + + } + }; + + const Params params(n_channels, inptrs, weights, bias, qp, + requant_muls, requant_shifts, outptrs); + + __asm__ __volatile__( + "ldr x19, [%x[params], %[offsetof_Params_requant]]\n" + "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n" + "add x23, x19, %[offsetof_Requantize32_b_offset]\n" + "add x22, x19, %[offsetof_Requantize32_c_offset]\n" + "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x20, x19, %[offsetof_Requantize32_minval]\n" + "add x19, x19, %[offsetof_Requantize32_maxval]\n" + "ldr x17, [%x[params], %[offsetof_Params_weights]]\n" + "ld1r { v16.16b }, [x23]\n" + "ld1r { v12.8h }, [x22]\n" + "lsr x16, x8, #0x3\n" + "mov x15, #0x0\n" + "ld1r { v14.8h }, [x20]\n" + "ld1r { v21.8h }, [x19]\n" + "mov x14, #0x0\n" + "add x13, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x12, [%x[params], %[offsetof_Params_requant_muls]]\n" + "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "ldp x10, x9, [x21, #0x0]\n" + "ldp x28, x27, [x21, #0x10]\n" + "cbz x16, 3f\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x19, #0x0]\n" + "subs x16, x16, #0x1\n" + "mov v13.16b, v15.16b\n" + "ldr q18, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v17.16b, v18.16b\n" + "mov v11.16b, v15.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v10.16b, v18.16b\n" + "mov v23.16b, v15.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v9.16b, v18.16b\n" + "usubl v0.8h, v0.8b, v16.8b\n" +
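// The "usubl" instructions here and below widen the nine uint8 filter taps (d0-d8) to int16 while subtracting the weight zero point (Requantize32::b_offset, broadcast into v16 by the "ld1r { v16.16b }" above); the input pixels are widened without an offset via "ushll". +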
"ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" + "usubl v1.8h, v1.8b, v16.8b\n" + "usubl v2.8h, v2.8b, v16.8b\n" + "ldp x26, x25, [x13, #0x0]\n" + "ldp x24, x23, [x13, #0x10]\n" + "usubl v3.8h, v3.8b, v16.8b\n" + "usubl v4.8h, v4.8b, v16.8b\n" + "ldp x22, x21, [x13, #0x20]\n" + "ldp x20, x19, [x13, #0x30]\n" + "usubl v5.8h, v5.8b, v16.8b\n" + "usubl v6.8h, v6.8b, v16.8b\n" + "ldr d31, [x26, x15]\n" + "ldr d30, [x25, x15]\n" + "usubl v7.8h, v7.8b, v16.8b\n" + "usubl v8.8h, v8.8b, v16.8b\n" + "ldr d29, [x24, x15]\n" + "ldr d28, [x23, x15]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "ushll v30.8h, v30.8b, #0x0\n" + "ldr d27, [x22, x15]\n" + "ldr d26, [x21, x15]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "ushll v28.8h, v28.8b, #0x0\n" + "ldr d25, [x20, x15]\n" + "ldr d24, [x19, x15]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "ushll v26.8h, v26.8b, #0x0\n" + "ushll v25.8h, v25.8b, #0x0\n" + "ushll v24.8h, v24.8b, #0x0\n" + "beq 2f\n" + "1:" // Loop + "smlal v15.4s, v31.4h, v8.4h\n" + "smlal2 v18.4s, v31.8h, v8.8h\n" + "ldr x24, [x13, #0x40]\n" + "ldr x23, [x13, #0x48]\n" + "smlal v13.4s, v31.4h, v6.4h\n" + "smlal2 v17.4s, v31.8h, v6.8h\n" + "ldr x21, [x13, #0x50]\n" + "ldr x19, [x13, #0x58]\n" + "smlal v15.4s, v30.4h, v0.4h\n" + "smlal2 v18.4s, v30.8h, v0.8h\n" + "ldr x22, [x13, #0x78]\n" + "ldr x20, [x13, #0x60]\n" + "smlal v13.4s, v28.4h, v1.4h\n" + "smlal2 v17.4s, v28.8h, v1.8h\n" + "ldr d28, [x23, x15]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "smlal2 v18.4s, v29.8h, v1.8h\n" + "ldr d29, [x24, x15]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v13.4s, v27.4h, v2.4h\n" + "smlal2 v17.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x15]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v15.4s, v26.4h, v3.4h\n" + "smlal2 v18.4s, v26.8h, v3.8h\n" + "ldr d26, [x19, x15]\n" + "ushll v26.8h, v26.8b, #0x0\n" + "smlal v13.4s, v24.4h, v0.4h\n" + "smlal2 v17.4s, v24.8h, v0.8h\n" + "ldr x21, [x13, #0x80]\n" + "ldr x19, [x13, #0x68]\n" + "smlal v15.4s, v25.4h, v4.4h\n" + "smlal2 v18.4s, v25.8h, v4.8h\n" + "ldr d25, [x20, x15]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal v13.4s, v29.4h, v4.4h\n" + "smlal2 v17.4s, v29.8h, v4.8h\n" + "ldr x20, [x13, #0x88]\n" + "ldr d29, [x19, x15]\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v18.4s, v24.8h, v2.8h\n" + "ldr x19, [x13, #0x70]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v13.4s, v28.4h, v5.4h\n" + "smlal2 v17.4s, v28.8h, v5.8h\n" + "ldr d28, [x21, x15]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal2 v10.4s, v31.8h, v2.8h\n" + "ldr x24, [x13, #0x98]\n" + "ldr d24, [x19, x15]\n" + "smlal v15.4s, v27.4h, v5.4h\n" + "smlal2 v18.4s, v27.8h, v5.8h\n" + "ushll v24.8h, v24.8b, #0x0\n" + "ldr x23, [x13, #0x90]\n" + "smlal v13.4s, v27.4h, v3.4h\n" + "smlal2 v17.4s, v27.8h, v3.8h\n" + "ldr d27, [x22, x15]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal v11.4s, v26.4h, v3.4h\n" + "ldr x22, [x13, #0xa8]\n" + "ldr x19, [x13, #0xa0]\n" + "smlal2 v10.4s, v26.8h, v3.8h\n" + "smlal2 v9.4s, v31.8h, v0.8h\n" + "ldr d26, [x20, x15]\n" + "ushll v26.8h, v26.8b, #0x0\n" + "smlal v23.4s, v27.4h, v4.4h\n" + "smlal v11.4s, v25.4h, v0.4h\n" + "ldr x21, [x13, #0xb0]\n" + "ldr x20, [x13, #0xb8]\n" + "smlal2 v10.4s, v25.8h, v0.8h\n" + "smlal2 v9.4s, v27.8h, v4.8h\n" + "ldr d27, [x19, x15]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v23.4s, v28.4h, v1.4h\n" + "smlal v15.4s, v25.4h, v6.4h\n" + "ldr x19, [x13, #0xc0]\n" + "ldr q22, [x12, #0x0]\n" + "smlal2 v18.4s, v25.8h, v6.8h\n" + "smlal v11.4s, v29.4h, v4.4h\n" + "ldr 
d25, [x23, x15]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal2 v10.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x15]\n" + "smlal2 v9.4s, v28.8h, v1.8h\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v23.4s, v26.4h, v5.4h\n" + "smlal v15.4s, v24.4h, v7.4h\n" + "ldr q31, [x11, #0x0]\n" + "ldr q19, [x12, #0x10]\n" + "smlal2 v18.4s, v24.8h, v7.8h\n" + "smlal v11.4s, v24.4h, v1.4h\n" + "sqdmulh v15.4s, v15.4s, v22.4s\n" + "ldr q30, [x11, #0x10]\n" + "smlal2 v10.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x15]\n" + "smlal2 v9.4s, v26.8h, v5.8h\n" + "ushll v24.8h, v24.8b, #0x0\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "ldr d26, [x21, x15]\n" + "smlal2 v9.4s, v29.8h, v2.8h\n" + "ushll v26.8h, v26.8b, #0x0\n" + "smlal v11.4s, v25.4h, v6.4h\n" + "smlal v23.4s, v24.4h, v3.4h\n" + "and v4.16b, v15.16b, v31.16b\n" + "add x17, x17, #0x48\n" + "smlal v13.4s, v28.4h, v7.4h\n" + "smlal2 v17.4s, v28.8h, v7.8h\n" + "sqdmulh v18.4s, v18.4s, v19.4s\n" + "subs x16, x16, #0x1\n" + "smlal2 v10.4s, v25.8h, v6.8h\n" + "ldr d25, [x20, x15]\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal v11.4s, v27.4h, v7.4h\n" + "smlal v23.4s, v26.4h, v7.4h\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "add x12, x12, #0x20\n" + "smlal v13.4s, v29.4h, v8.4h\n" + "smlal2 v17.4s, v29.8h, v8.8h\n" + "ldr d29, [x19, x15]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal2 v10.4s, v27.8h, v7.8h\n" + "smlal2 v9.4s, v26.8h, v7.8h\n" + "sqdmulh v13.4s, v13.4s, v22.4s\n" + "add x15, x15, #0x8\n" + "smlal v11.4s, v24.4h, v5.4h\n" + "smlal v23.4s, v25.4h, v6.4h\n" + "and v1.16b, v13.16b, v31.16b\n" + "add x11, x11, #0x20\n" + "smlal2 v10.4s, v24.8h, v5.8h\n" + "smlal2 v9.4s, v25.8h, v6.8h\n" + "sqdmulh v17.4s, v17.4s, v19.4s\n" + "smlal v11.4s, v25.4h, v8.4h\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "sqdmulh v11.4s, v11.4s, v22.4s\n" + "smlal2 v10.4s, v25.8h, v8.8h\n" + "smlal2 v9.4s, v29.8h, v8.8h\n" + "sqdmulh v23.4s, v23.4s, v22.4s\n" + "and v22.16b, v11.16b, v31.16b\n" + "sqdmulh v10.4s, v10.4s, v19.4s\n" + "and v20.16b, v23.16b, v31.16b\n" + "sqdmulh v9.4s, v9.4s, v19.4s\n" + "and v19.16b, v18.16b, v30.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "and v27.16b, v17.16b, v30.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v25.16b, v10.16b, v30.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v0.16b, v9.16b, v30.16b\n" + "sqadd v15.4s, v15.4s, v4.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v1.4s\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v22.4s\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v20.4s\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v31.4s\n" + "sqadd v18.4s, v18.4s, v19.4s\n" + "srshl v13.4s, v13.4s, v31.4s\n" + "sqadd v17.4s, v17.4s, v27.4s\n" + "srshl v11.4s, v11.4s, v31.4s\n" + "sqadd v10.4s, v10.4s, v25.4s\n" + "srshl v23.4s, v23.4s, v31.4s\n" + "sqadd v9.4s, v9.4s, v0.4s\n" + "srshl v18.4s, v18.4s, v30.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v17.4s, v17.4s, v30.4s\n" + "sqxtn v13.4h, v13.4s\n" + "srshl v10.4s, v10.4s, v30.4s\n" + "sqxtn v11.4h, v11.4s\n" + "srshl v9.4s, v9.4s, v30.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v15.8h, v18.4s\n" + "sqxtn2 v13.8h, v17.4s\n" + "sqxtn2 v11.8h, v10.4s\n" + "sqxtn2 v23.8h, v9.4s\n" + "sqadd v15.8h, v15.8h, v12.8h\n" + "sqadd v13.8h, v13.8h, v12.8h\n" + "sqadd v11.8h, v11.8h, v12.8h\n" + "sqadd v23.8h, v23.8h, v12.8h\n" + "smax v15.8h, v15.8h, v14.8h\n" + "smax v13.8h, v13.8h, v14.8h\n" + "smax v11.8h, v11.8h, v14.8h\n" + "smax v23.8h, v23.8h, v14.8h\n" + "smin v15.8h, v15.8h, v21.8h\n" + "smin v13.8h, v13.8h, v21.8h\n" + "smin 
v11.8h, v11.8h, v21.8h\n" + "smin v23.8h, v23.8h, v21.8h\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x10, x14]\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "str d13, [x9, x14]\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d11, [x28, x14]\n" + "str d23, [x27, x14]\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x19, #0x0]\n" + "add x14, x14, #0x8\n" + "ldr q18, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v13.16b, v15.16b\n" + "mov v17.16b, v18.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v11.16b, v15.16b\n" + "mov v10.16b, v18.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v23.16b, v15.16b\n" + "mov v9.16b, v18.16b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" + "usubl v0.8h, v0.8b, v16.8b\n" + "usubl v1.8h, v1.8b, v16.8b\n" + "ldp x26, x25, [x13, #0x0]\n" + "ldp x24, x23, [x13, #0x10]\n" + "usubl v2.8h, v2.8b, v16.8b\n" + "usubl v3.8h, v3.8b, v16.8b\n" + "ldp x22, x21, [x13, #0x20]\n" + "ldp x20, x19, [x13, #0x30]\n" + "usubl v4.8h, v4.8b, v16.8b\n" + "usubl v5.8h, v5.8b, v16.8b\n" + "ldr d31, [x26, x15]\n" + "ldr d30, [x25, x15]\n" + "usubl v6.8h, v6.8b, v16.8b\n" + "usubl v7.8h, v7.8b, v16.8b\n" + "ldr d29, [x24, x15]\n" + "ldr d28, [x23, x15]\n" + "usubl v8.8h, v8.8b, v16.8b\n" + "ushll v31.8h, v31.8b, #0x0\n" + "ldr d27, [x22, x15]\n" + "ldr d26, [x21, x15]\n" + "ushll v30.8h, v30.8b, #0x0\n" + "ushll v29.8h, v29.8b, #0x0\n" + "ldr d25, [x20, x15]\n" + "ldr d24, [x19, x15]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "ushll v27.8h, v27.8b, #0x0\n" + "ushll v26.8h, v26.8b, #0x0\n" + "ushll v25.8h, v25.8b, #0x0\n" + "ushll v24.8h, v24.8b, #0x0\n" + "bgt 1b\n" + "2:" // Tail + "smlal v15.4s, v31.4h, v8.4h\n" + "smlal2 v18.4s, v31.8h, v8.8h\n" + "ldr x24, [x13, #0x40]\n" + "ldr x23, [x13, #0x48]\n" + "smlal v13.4s, v31.4h, v6.4h\n" + "smlal2 v17.4s, v31.8h, v6.8h\n" + "ldr x21, [x13, #0x50]\n" + "ldr x19, [x13, #0x58]\n" + "smlal v15.4s, v30.4h, v0.4h\n" + "smlal2 v18.4s, v30.8h, v0.8h\n" + "ldr x22, [x13, #0x78]\n" + "ldr x20, [x13, #0x60]\n" + "smlal v13.4s, v28.4h, v1.4h\n" + "smlal2 v17.4s, v28.8h, v1.8h\n" + "ldr d28, [x23, x15]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "smlal2 v18.4s, v29.8h, v1.8h\n" + "ldr d29, [x24, x15]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v13.4s, v27.4h, v2.4h\n" + "smlal2 v17.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x15]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v15.4s, v26.4h, v3.4h\n" + "smlal2 v18.4s, v26.8h, v3.8h\n" + "ldr d26, [x19, x15]\n" + "ushll v26.8h, v26.8b, #0x0\n" + "smlal v13.4s, v24.4h, v0.4h\n" + "smlal2 v17.4s, v24.8h, v0.8h\n" + "ldr x21, [x13, #0x80]\n" + "ldr x19, [x13, #0x68]\n" + "smlal v15.4s, v25.4h, v4.4h\n" + "smlal2 v18.4s, v25.8h, v4.8h\n" + "ldr d25, [x20, x15]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal v13.4s, v29.4h, v4.4h\n" + "smlal2 v17.4s, v29.8h, v4.8h\n" + "ldr x20, [x13, #0x88]\n" + "ldr d29, [x19, x15]\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v18.4s, v24.8h, v2.8h\n" + "ldr x19, [x13, #0x70]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v13.4s, v28.4h, v5.4h\n" + "smlal2 v17.4s, v28.8h, v5.8h\n" + "ldr d28, [x21, x15]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal2 v10.4s, v31.8h, v2.8h\n" + "ldr x24, [x13, #0x98]\n" + "ldr d24, [x19, x15]\n" + "smlal v15.4s, v27.4h, v5.4h\n" + "smlal2 v18.4s, 
v27.8h, v5.8h\n" + "ushll v24.8h, v24.8b, #0x0\n" + "ldr x23, [x13, #0x90]\n" + "smlal v13.4s, v27.4h, v3.4h\n" + "smlal2 v17.4s, v27.8h, v3.8h\n" + "ldr d27, [x22, x15]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal v11.4s, v26.4h, v3.4h\n" + "ldr x22, [x13, #0xa8]\n" + "ldr x19, [x13, #0xa0]\n" + "smlal2 v10.4s, v26.8h, v3.8h\n" + "smlal2 v9.4s, v31.8h, v0.8h\n" + "ldr d26, [x20, x15]\n" + "ushll v26.8h, v26.8b, #0x0\n" + "smlal v23.4s, v27.4h, v4.4h\n" + "smlal v11.4s, v25.4h, v0.4h\n" + "ldr x21, [x13, #0xb0]\n" + "ldr x20, [x13, #0xb8]\n" + "smlal2 v10.4s, v25.8h, v0.8h\n" + "smlal2 v9.4s, v27.8h, v4.8h\n" + "ldr d27, [x19, x15]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v23.4s, v28.4h, v1.4h\n" + "smlal v15.4s, v25.4h, v6.4h\n" + "ldr x19, [x13, #0xc0]\n" + "ldr q22, [x12, #0x0]\n" + "smlal2 v18.4s, v25.8h, v6.8h\n" + "smlal v11.4s, v29.4h, v4.4h\n" + "ldr d25, [x23, x15]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal2 v10.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x15]\n" + "smlal2 v9.4s, v28.8h, v1.8h\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v23.4s, v26.4h, v5.4h\n" + "smlal v15.4s, v24.4h, v7.4h\n" + "ldr q31, [x11, #0x0]\n" + "ldr q19, [x12, #0x10]\n" + "smlal2 v18.4s, v24.8h, v7.8h\n" + "smlal v11.4s, v24.4h, v1.4h\n" + "sqdmulh v15.4s, v15.4s, v22.4s\n" + "ldr q30, [x11, #0x10]\n" + "smlal2 v10.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x15]\n" + "smlal2 v9.4s, v26.8h, v5.8h\n" + "ushll v24.8h, v24.8b, #0x0\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "ldr d26, [x21, x15]\n" + "smlal2 v9.4s, v29.8h, v2.8h\n" + "ushll v26.8h, v26.8b, #0x0\n" + "smlal v11.4s, v25.4h, v6.4h\n" + "smlal v23.4s, v24.4h, v3.4h\n" + "and v4.16b, v15.16b, v31.16b\n" + "tst x8, #0x7\n" + "smlal v13.4s, v28.4h, v7.4h\n" + "smlal2 v17.4s, v28.8h, v7.8h\n" + "sqdmulh v18.4s, v18.4s, v19.4s\n" + "add x12, x12, #0x20\n" + "smlal2 v10.4s, v25.8h, v6.8h\n" + "ldr d25, [x20, x15]\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal v11.4s, v27.4h, v7.4h\n" + "smlal v23.4s, v26.4h, v7.4h\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "add x11, x11, #0x20\n" + "smlal v13.4s, v29.4h, v8.4h\n" + "smlal2 v17.4s, v29.8h, v8.8h\n" + "ldr d29, [x19, x15]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal2 v10.4s, v27.8h, v7.8h\n" + "smlal2 v9.4s, v26.8h, v7.8h\n" + "sqdmulh v13.4s, v13.4s, v22.4s\n" + "add x15, x15, #0x8\n" + "smlal v11.4s, v24.4h, v5.4h\n" + "smlal v23.4s, v25.4h, v6.4h\n" + "and v1.16b, v13.16b, v31.16b\n" + "smlal2 v10.4s, v24.8h, v5.8h\n" + "smlal2 v9.4s, v25.8h, v6.8h\n" + "sqdmulh v17.4s, v17.4s, v19.4s\n" + "smlal v11.4s, v25.4h, v8.4h\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "sqdmulh v11.4s, v11.4s, v22.4s\n" + "smlal2 v10.4s, v25.8h, v8.8h\n" + "smlal2 v9.4s, v29.8h, v8.8h\n" + "sqdmulh v23.4s, v23.4s, v22.4s\n" + "and v22.16b, v11.16b, v31.16b\n" + "sqdmulh v10.4s, v10.4s, v19.4s\n" + "and v20.16b, v23.16b, v31.16b\n" + "sqdmulh v9.4s, v9.4s, v19.4s\n" + "and v19.16b, v18.16b, v30.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "and v27.16b, v17.16b, v30.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v25.16b, v10.16b, v30.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v0.16b, v9.16b, v30.16b\n" + "sqadd v15.4s, v15.4s, v4.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v1.4s\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v22.4s\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v20.4s\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v31.4s\n" + "sqadd v18.4s, v18.4s, v19.4s\n" + "srshl v13.4s, v13.4s, v31.4s\n" + "sqadd 
v17.4s, v17.4s, v27.4s\n" + "srshl v11.4s, v11.4s, v31.4s\n" + "sqadd v10.4s, v10.4s, v25.4s\n" + "srshl v23.4s, v23.4s, v31.4s\n" + "sqadd v9.4s, v9.4s, v0.4s\n" + "srshl v18.4s, v18.4s, v30.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v17.4s, v17.4s, v30.4s\n" + "sqxtn v13.4h, v13.4s\n" + "srshl v10.4s, v10.4s, v30.4s\n" + "sqxtn v11.4h, v11.4s\n" + "srshl v9.4s, v9.4s, v30.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v15.8h, v18.4s\n" + "sqxtn2 v13.8h, v17.4s\n" + "sqxtn2 v11.8h, v10.4s\n" + "sqxtn2 v23.8h, v9.4s\n" + "sqadd v15.8h, v15.8h, v12.8h\n" + "sqadd v13.8h, v13.8h, v12.8h\n" + "sqadd v11.8h, v11.8h, v12.8h\n" + "sqadd v23.8h, v23.8h, v12.8h\n" + "smax v15.8h, v15.8h, v14.8h\n" + "smax v13.8h, v13.8h, v14.8h\n" + "smax v11.8h, v11.8h, v14.8h\n" + "smax v23.8h, v23.8h, v14.8h\n" + "smin v15.8h, v15.8h, v21.8h\n" + "smin v13.8h, v13.8h, v21.8h\n" + "smin v11.8h, v11.8h, v21.8h\n" + "smin v23.8h, v23.8h, v21.8h\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x10, x14]\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "str d13, [x9, x14]\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d11, [x28, x14]\n" + "str d23, [x27, x14]\n" + "add x14, x14, #0x8\n" + "beq 88f\n" + "add x17, x17, #0x48\n" + "3:" // Oddments + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x8, #2, 5f\n" + "ld1 { v15.4s }, [x19], #0x10\n" + "tbz x8, #1, 4f\n" + "ld1 { v18.d }[0], [x19], #0x8\n" + "tbz x8, #0, 7f\n" + "ld1 { v18.s }[2], [x19]\n" + "b 7f\n" + "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz x8, #0, 7f\n" + "ld1 { v18.s }[0], [x19]\n" + "b 7f\n" + "5:" // Oddments: Load bias: Bit 2: Unset + "tbz x8, #1, 6f\n" + "ld1 { v15.d }[0], [x19], #0x8\n" + "tbz x8, #0, 7f\n" + "ld1 { v15.s }[2], [x19]\n" + "b 7f\n" + "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 7f\n" + "ld1 { v15.s }[0], [x19]\n" + "7:" // Oddments: Load bias: Bit 2: End + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "mov v13.16b, v15.16b\n" + "mov v17.16b, v18.16b\n" + "ldr d2, [x17, #0x10]\n" + "ldr d3, [x17, #0x18]\n" + "mov v11.16b, v15.16b\n" + "mov v10.16b, v18.16b\n" + "ldr d4, [x17, #0x20]\n" + "ldr d5, [x17, #0x28]\n" + "mov v23.16b, v15.16b\n" + "mov v9.16b, v18.16b\n" + "ldr d6, [x17, #0x30]\n" + "ldr d7, [x17, #0x38]\n" + "usubl v0.8h, v0.8b, v16.8b\n" + "usubl v1.8h, v1.8b, v16.8b\n" + "ldr d8, [x17, #0x40]\n" + "ldp x26, x25, [x13, #0x0]\n" + "usubl v2.8h, v2.8b, v16.8b\n" + "usubl v3.8h, v3.8b, v16.8b\n" + "ldp x24, x23, [x13, #0x10]\n" + "ldp x22, x21, [x13, #0x20]\n" + "usubl v4.8h, v4.8b, v16.8b\n" + "usubl v5.8h, v5.8b, v16.8b\n" + "ldp x20, x19, [x13, #0x30]\n" + "usubl v6.8h, v6.8b, v16.8b\n" + "usubl v7.8h, v7.8b, v16.8b\n" + "usubl v8.8h, v8.8b, v16.8b\n" + "add x26, x26, x15\n" + "add x25, x25, x15\n" + "add x24, x24, x15\n" + "add x23, x23, x15\n" + "add x22, x22, x15\n" + "add x21, x21, x15\n" + "add x20, x20, x15\n" + "add x19, x19, x15\n" + "tbz x8, #2, 9f\n" + "ld1 { v31.s }[0], [x26], #0x4\n" + "ld1 { v30.s }[0], [x25], #0x4\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "ld1 { v28.s }[0], [x23], #0x4\n" + "ld1 { v27.s }[0], [x22], #0x4\n" + "ld1 { v26.s }[0], [x21], #0x4\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "ld1 { v24.s }[0], [x19], #0x4\n" + "tbz x8, #1, 8f\n" + "ld1 { v31.h }[2], [x26], #0x2\n" + "ld1 { v30.h }[2], [x25], #0x2\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "ld1 { v24.h }[2], [x19], 
#0x2\n" + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[6], [x26]\n" + "ld1 { v30.b }[6], [x25]\n" + "ld1 { v29.b }[6], [x24]\n" + "ld1 { v28.b }[6], [x23]\n" + "ld1 { v27.b }[6], [x22]\n" + "ld1 { v26.b }[6], [x21]\n" + "ld1 { v25.b }[6], [x20]\n" + "ld1 { v24.b }[6], [x19]\n" + "b 11f\n" + "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[4], [x26]\n" + "ld1 { v30.b }[4], [x25]\n" + "ld1 { v29.b }[4], [x24]\n" + "ld1 { v28.b }[4], [x23]\n" + "ld1 { v27.b }[4], [x22]\n" + "ld1 { v26.b }[4], [x21]\n" + "ld1 { v25.b }[4], [x20]\n" + "ld1 { v24.b }[4], [x19]\n" + "b 11f\n" + "9:" // Oddments: Initial loads: Bit 2: Unset + "tbz x8, #1, 10f\n" + "ld1 { v31.h }[0], [x26], #0x2\n" + "ld1 { v30.h }[0], [x25], #0x2\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "ld1 { v28.h }[0], [x23], #0x2\n" + "ld1 { v27.h }[0], [x22], #0x2\n" + "ld1 { v26.h }[0], [x21], #0x2\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "ld1 { v24.h }[0], [x19], #0x2\n" + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[2], [x26]\n" + "ld1 { v30.b }[2], [x25]\n" + "ld1 { v29.b }[2], [x24]\n" + "ld1 { v28.b }[2], [x23]\n" + "ld1 { v27.b }[2], [x22]\n" + "ld1 { v26.b }[2], [x21]\n" + "ld1 { v25.b }[2], [x20]\n" + "ld1 { v24.b }[2], [x19]\n" + "b 11f\n" + "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 11f\n" + "ld1 { v31.b }[0], [x26]\n" + "ld1 { v30.b }[0], [x25]\n" + "ld1 { v29.b }[0], [x24]\n" + "ld1 { v28.b }[0], [x23]\n" + "ld1 { v27.b }[0], [x22]\n" + "ld1 { v26.b }[0], [x21]\n" + "ld1 { v25.b }[0], [x20]\n" + "ld1 { v24.b }[0], [x19]\n" + "11:" // Oddments: Initial loads: Bit 2: End + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v15.4s, v31.4h, v8.4h\n" + "smlal2 v18.4s, v31.8h, v8.8h\n" + "ldr x24, [x13, #0x40]\n" + "ushll v30.8h, v30.8b, #0x0\n" + "smlal v15.4s, v30.4h, v0.4h\n" + "smlal2 v18.4s, v30.8h, v0.8h\n" + "add x24, x24, x15\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v13.4s, v31.4h, v6.4h\n" + "smlal2 v17.4s, v31.8h, v6.8h\n" + "smlal v15.4s, v29.4h, v1.4h\n" + "smlal2 v18.4s, v29.8h, v1.8h\n" + "ushll v28.8h, v28.8b, #0x0\n" + "ushll v26.8h, v26.8b, #0x0\n" + "smlal v13.4s, v28.4h, v1.4h\n" + "smlal2 v17.4s, v28.8h, v1.8h\n" + "smlal v15.4s, v26.4h, v3.4h\n" + "smlal2 v18.4s, v26.8h, v3.8h\n" + "ushll v27.8h, v27.8b, #0x0\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal v13.4s, v27.4h, v2.4h\n" + "smlal2 v17.4s, v27.8h, v2.8h\n" + "smlal v15.4s, v25.4h, v4.4h\n" + "smlal2 v18.4s, v25.8h, v4.8h\n" + "ushll v24.8h, v24.8b, #0x0\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal2 v10.4s, v31.8h, v2.8h\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal2 v9.4s, v31.8h, v0.8h\n" + "smlal v15.4s, v24.4h, v2.4h\n" + "smlal2 v18.4s, v24.8h, v2.8h\n" + "smlal v13.4s, v24.4h, v0.4h\n" + "smlal2 v17.4s, v24.8h, v0.8h\n" + "tbz x8, #2, 13f\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "tbz x8, #1, 12f\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[6], [x24]\n" + "b 15f\n" + "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[4], [x24]\n" + "b 15f\n" + "13:" // Oddments: Load (1, 3): Bit 2: Unset + "tbz x8, #1, 14f\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[2], [x24]\n" + "b 15f\n" + "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[0], [x24]\n" + "15:" // Oddments: Load (1, 3): Bit 2: End + "ushll v29.8h, v29.8b, #0x0\n" + "ldr x23, [x13, #0x48]\n" + "smlal v13.4s, v29.4h, v4.4h\n" + "smlal2 v17.4s, v29.8h, v4.8h\n" + "add x23, x23, x15\n" + "tbz x8, #2, 
17f\n" + "ld1 { v28.s }[0], [x23], #0x4\n" + "tbz x8, #1, 16f\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[6], [x23]\n" + "b 19f\n" + "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[4], [x23]\n" + "b 19f\n" + "17:" // Oddments: Load (1, 4): Bit 2: Unset + "tbz x8, #1, 18f\n" + "ld1 { v28.h }[0], [x23], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[2], [x23]\n" + "b 19f\n" + "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[0], [x23]\n" + "19:" // Oddments: Load (1, 4): Bit 2: End + "ushll v28.8h, v28.8b, #0x0\n" + "ldr x21, [x13, #0x50]\n" + "smlal v13.4s, v28.4h, v5.4h\n" + "smlal2 v17.4s, v28.8h, v5.8h\n" + "add x21, x21, x15\n" + "tbz x8, #2, 21f\n" + "ld1 { v27.s }[0], [x21], #0x4\n" + "tbz x8, #1, 20f\n" + "ld1 { v27.h }[2], [x21], #0x2\n" + "tbz x8, #0, 23f\n" + "ld1 { v27.b }[6], [x21]\n" + "b 23f\n" + "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 23f\n" + "ld1 { v27.b }[4], [x21]\n" + "b 23f\n" + "21:" // Oddments: Load (1, 2): Bit 2: Unset + "tbz x8, #1, 22f\n" + "ld1 { v27.h }[0], [x21], #0x2\n" + "tbz x8, #0, 23f\n" + "ld1 { v27.b }[2], [x21]\n" + "b 23f\n" + "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 23f\n" + "ld1 { v27.b }[0], [x21]\n" + "23:" // Oddments: Load (1, 2): Bit 2: End + "ushll v27.8h, v27.8b, #0x0\n" + "ldr x19, [x13, #0x58]\n" + "smlal v15.4s, v27.4h, v5.4h\n" + "smlal2 v18.4s, v27.8h, v5.8h\n" + "smlal v13.4s, v27.4h, v3.4h\n" + "smlal2 v17.4s, v27.8h, v3.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 25f\n" + "ld1 { v26.s }[0], [x19], #0x4\n" + "tbz x8, #1, 24f\n" + "ld1 { v26.h }[2], [x19], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v26.b }[6], [x19]\n" + "b 27f\n" + "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 27f\n" + "ld1 { v26.b }[4], [x19]\n" + "b 27f\n" + "25:" // Oddments: Load (3, 0): Bit 2: Unset + "tbz x8, #1, 26f\n" + "ld1 { v26.h }[0], [x19], #0x2\n" + "tbz x8, #0, 27f\n" + "ld1 { v26.b }[2], [x19]\n" + "b 27f\n" + "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 27f\n" + "ld1 { v26.b }[0], [x19]\n" + "27:" // Oddments: Load (3, 0): Bit 2: End + "ushll v26.8h, v26.8b, #0x0\n" + "ldr x20, [x13, #0x60]\n" + "smlal v11.4s, v26.4h, v3.4h\n" + "smlal2 v10.4s, v26.8h, v3.8h\n" + "add x20, x20, x15\n" + "tbz x8, #2, 29f\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "tbz x8, #1, 28f\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[6], [x20]\n" + "b 31f\n" + "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[4], [x20]\n" + "b 31f\n" + "29:" // Oddments: Load (2, 0): Bit 2: Unset + "tbz x8, #1, 30f\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[2], [x20]\n" + "b 31f\n" + "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[0], [x20]\n" + "31:" // Oddments: Load (2, 0): Bit 2: End + "ushll v25.8h, v25.8b, #0x0\n" + "ldr x19, [x13, #0x68]\n" + "smlal v15.4s, v25.4h, v6.4h\n" + "smlal2 v18.4s, v25.8h, v6.8h\n" + "smlal v11.4s, v25.4h, v0.4h\n" + "smlal2 v10.4s, v25.8h, v0.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 33f\n" + "ld1 { v29.s }[0], [x19], #0x4\n" + "tbz x8, #1, 32f\n" + "ld1 { v29.h }[2], [x19], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[6], [x19]\n" + "b 35f\n" + "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[4], [x19]\n" + "b 35f\n" + "33:" // 
Oddments: Load (3, 1): Bit 2: Unset + "tbz x8, #1, 34f\n" + "ld1 { v29.h }[0], [x19], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[2], [x19]\n" + "b 35f\n" + "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[0], [x19]\n" + "35:" // Oddments: Load (3, 1): Bit 2: End + "ushll v29.8h, v29.8b, #0x0\n" + "ldr x19, [x13, #0x70]\n" + "smlal v11.4s, v29.4h, v4.4h\n" + "smlal2 v10.4s, v29.8h, v4.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 37f\n" + "ld1 { v24.s }[0], [x19], #0x4\n" + "tbz x8, #1, 36f\n" + "ld1 { v24.h }[2], [x19], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[6], [x19]\n" + "b 39f\n" + "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[4], [x19]\n" + "b 39f\n" + "37:" // Oddments: Load (2, 1): Bit 2: Unset + "tbz x8, #1, 38f\n" + "ld1 { v24.h }[0], [x19], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[2], [x19]\n" + "b 39f\n" + "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[0], [x19]\n" + "39:" // Oddments: Load (2, 1): Bit 2: End + "ushll v24.8h, v24.8b, #0x0\n" + "ldr x22, [x13, #0x78]\n" + "smlal v15.4s, v24.4h, v7.4h\n" + "smlal2 v18.4s, v24.8h, v7.8h\n" + "smlal v11.4s, v24.4h, v1.4h\n" + "smlal2 v10.4s, v24.8h, v1.8h\n" + "add x22, x22, x15\n" + "tbz x8, #2, 41f\n" + "ld1 { v27.s }[0], [x22], #0x4\n" + "tbz x8, #1, 40f\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[6], [x22]\n" + "b 43f\n" + "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[4], [x22]\n" + "b 43f\n" + "41:" // Oddments: Load (3, 3): Bit 2: Unset + "tbz x8, #1, 42f\n" + "ld1 { v27.h }[0], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[2], [x22]\n" + "b 43f\n" + "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[0], [x22]\n" + "43:" // Oddments: Load (3, 3): Bit 2: End + "ushll v27.8h, v27.8b, #0x0\n" + "ldr x21, [x13, #0x80]\n" + "smlal v23.4s, v27.4h, v4.4h\n" + "smlal2 v9.4s, v27.8h, v4.8h\n" + "add x21, x21, x15\n" + "tbz x8, #2, 45f\n" + "ld1 { v28.s }[0], [x21], #0x4\n" + "tbz x8, #1, 44f\n" + "ld1 { v28.h }[2], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[6], [x21]\n" + "b 47f\n" + "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[4], [x21]\n" + "b 47f\n" + "45:" // Oddments: Load (2, 3): Bit 2: Unset + "tbz x8, #1, 46f\n" + "ld1 { v28.h }[0], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[2], [x21]\n" + "b 47f\n" + "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[0], [x21]\n" + "47:" // Oddments: Load (2, 3): Bit 2: End + "ushll v28.8h, v28.8b, #0x0\n" + "ldr x20, [x13, #0x88]\n" + "smlal v13.4s, v28.4h, v7.4h\n" + "smlal2 v17.4s, v28.8h, v7.8h\n" + "smlal v23.4s, v28.4h, v1.4h\n" + "smlal2 v9.4s, v28.8h, v1.8h\n" + "add x20, x20, x15\n" + "tbz x8, #2, 49f\n" + "ld1 { v26.s }[0], [x20], #0x4\n" + "tbz x8, #1, 48f\n" + "ld1 { v26.h }[2], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[6], [x20]\n" + "b 51f\n" + "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[4], [x20]\n" + "b 51f\n" + "49:" // Oddments: Load (3, 4): Bit 2: Unset + "tbz x8, #1, 50f\n" + "ld1 { v26.h }[0], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[2], [x20]\n" + "b 51f\n" + "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[0], [x20]\n" + "51:" // Oddments: Load (3, 
4): Bit 2: End + "ushll v26.8h, v26.8b, #0x0\n" + "ldr x23, [x13, #0x90]\n" + "smlal v23.4s, v26.4h, v5.4h\n" + "smlal2 v9.4s, v26.8h, v5.8h\n" + "add x23, x23, x15\n" + "tbz x8, #2, 53f\n" + "ld1 { v25.s }[0], [x23], #0x4\n" + "tbz x8, #1, 52f\n" + "ld1 { v25.h }[2], [x23], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[6], [x23]\n" + "b 55f\n" + "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[4], [x23]\n" + "b 55f\n" + "53:" // Oddments: Load (4, 0): Bit 2: Unset + "tbz x8, #1, 54f\n" + "ld1 { v25.h }[0], [x23], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[2], [x23]\n" + "b 55f\n" + "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[0], [x23]\n" + "55:" // Oddments: Load (4, 0): Bit 2: End + "ushll v25.8h, v25.8b, #0x0\n" + "ldr x24, [x13, #0x98]\n" + "smlal v11.4s, v25.4h, v6.4h\n" + "smlal2 v10.4s, v25.8h, v6.8h\n" + "add x24, x24, x15\n" + "tbz x8, #2, 57f\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "tbz x8, #1, 56f\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "tbz x8, #0, 59f\n" + "ld1 { v29.b }[6], [x24]\n" + "b 59f\n" + "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset + "tbz x8, #0, 59f\n" + "ld1 { v29.b }[4], [x24]\n" + "b 59f\n" + "57:" // Oddments: Load (2, 4): Bit 2: Unset + "tbz x8, #1, 58f\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "tbz x8, #0, 59f\n" + "ld1 { v29.b }[2], [x24]\n" + "b 59f\n" + "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 59f\n" + "ld1 { v29.b }[0], [x24]\n" + "59:" // Oddments: Load (2, 4): Bit 2: End + "ushll v29.8h, v29.8b, #0x0\n" + "ldr x19, [x13, #0xa0]\n" + "smlal v13.4s, v29.4h, v8.4h\n" + "smlal2 v17.4s, v29.8h, v8.8h\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "smlal2 v9.4s, v29.8h, v2.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 61f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x8, #1, 60f\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[6], [x19]\n" + "b 63f\n" + "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[4], [x19]\n" + "b 63f\n" + "61:" // Oddments: Load (4, 1): Bit 2: Unset + "tbz x8, #1, 62f\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[2], [x19]\n" + "b 63f\n" + "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[0], [x19]\n" + "63:" // Oddments: Load (4, 1): Bit 2: End + "ushll v27.8h, v27.8b, #0x0\n" + "ldr x22, [x13, #0xa8]\n" + "smlal v11.4s, v27.4h, v7.4h\n" + "smlal2 v10.4s, v27.8h, v7.8h\n" + "add x22, x22, x15\n" + "tbz x8, #2, 65f\n" + "ld1 { v24.s }[0], [x22], #0x4\n" + "tbz x8, #1, 64f\n" + "ld1 { v24.h }[2], [x22], #0x2\n" + "tbz x8, #0, 67f\n" + "ld1 { v24.b }[6], [x22]\n" + "b 67f\n" + "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 67f\n" + "ld1 { v24.b }[4], [x22]\n" + "b 67f\n" + "65:" // Oddments: Load (3, 2): Bit 2: Unset + "tbz x8, #1, 66f\n" + "ld1 { v24.h }[0], [x22], #0x2\n" + "tbz x8, #0, 67f\n" + "ld1 { v24.b }[2], [x22]\n" + "b 67f\n" + "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 67f\n" + "ld1 { v24.b }[0], [x22]\n" + "67:" // Oddments: Load (3, 2): Bit 2: End + "ushll v24.8h, v24.8b, #0x0\n" + "ldr x21, [x13, #0xb0]\n" + "smlal v11.4s, v24.4h, v5.4h\n" + "smlal2 v10.4s, v24.8h, v5.8h\n" + "smlal v23.4s, v24.4h, v3.4h\n" + "smlal2 v9.4s, v24.8h, v3.8h\n" + "add x21, x21, x15\n" + "tbz x8, #2, 69f\n" + "ld1 { v26.s }[0], [x21], #0x4\n" + "tbz x8, #1, 68f\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "tbz x8, #0, 
71f\n" + "ld1 { v26.b }[6], [x21]\n" + "b 71f\n" + "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset + "tbz x8, #0, 71f\n" + "ld1 { v26.b }[4], [x21]\n" + "b 71f\n" + "69:" // Oddments: Load (4, 3): Bit 2: Unset + "tbz x8, #1, 70f\n" + "ld1 { v26.h }[0], [x21], #0x2\n" + "tbz x8, #0, 71f\n" + "ld1 { v26.b }[2], [x21]\n" + "b 71f\n" + "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 71f\n" + "ld1 { v26.b }[0], [x21]\n" + "71:" // Oddments: Load (4, 3): Bit 2: End + "ushll v26.8h, v26.8b, #0x0\n" + "ldr x20, [x13, #0xb8]\n" + "smlal v23.4s, v26.4h, v7.4h\n" + "smlal2 v9.4s, v26.8h, v7.8h\n" + "add x20, x20, x15\n" + "tbz x8, #2, 73f\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "tbz x8, #1, 72f\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "tbz x8, #0, 75f\n" + "ld1 { v25.b }[6], [x20]\n" + "b 75f\n" + "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset + "tbz x8, #0, 75f\n" + "ld1 { v25.b }[4], [x20]\n" + "b 75f\n" + "73:" // Oddments: Load (4, 2): Bit 2: Unset + "tbz x8, #1, 74f\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "tbz x8, #0, 75f\n" + "ld1 { v25.b }[2], [x20]\n" + "b 75f\n" + "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 75f\n" + "ld1 { v25.b }[0], [x20]\n" + "75:" // Oddments: Load (4, 2): Bit 2: End + "ushll v25.8h, v25.8b, #0x0\n" + "ldr x19, [x13, #0xc0]\n" + "smlal v11.4s, v25.4h, v8.4h\n" + "smlal2 v10.4s, v25.8h, v8.8h\n" + "smlal v23.4s, v25.4h, v6.4h\n" + "smlal2 v9.4s, v25.8h, v6.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 77f\n" + "ld1 { v29.s }[0], [x19], #0x4\n" + "tbz x8, #1, 76f\n" + "ld1 { v29.h }[2], [x19], #0x2\n" + "tbz x8, #0, 79f\n" + "ld1 { v29.b }[6], [x19]\n" + "b 79f\n" + "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset + "tbz x8, #0, 79f\n" + "ld1 { v29.b }[4], [x19]\n" + "b 79f\n" + "77:" // Oddments: Load (4, 4): Bit 2: Unset + "tbz x8, #1, 78f\n" + "ld1 { v29.h }[0], [x19], #0x2\n" + "tbz x8, #0, 79f\n" + "ld1 { v29.b }[2], [x19]\n" + "b 79f\n" + "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 79f\n" + "ld1 { v29.b }[0], [x19]\n" + "79:" // Oddments: Load (4, 4): Bit 2: End + "ushll v29.8h, v29.8b, #0x0\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "smlal2 v9.4s, v29.8h, v8.8h\n" + "tbz x8, #2, 81f\n" + "ld1 { v22.4s }, [x12], #0x10\n" + "ld1 { v31.4s }, [x11], #0x10\n" + "tbz x8, #1, 80f\n" + "ld1 { v19.d }[0], [x12], #0x8\n" + "ld1 { v30.d }[0], [x11], #0x8\n" + "tbz x8, #0, 83f\n" + "ld1 { v19.s }[2], [x12]\n" + "ld1 { v30.s }[2], [x11]\n" + "b 83f\n" + "80:" // Oddments: Load requant params: Bit 2: Bit 1: Unset + "tbz x8, #0, 83f\n" + "ld1 { v19.s }[0], [x12]\n" + "ld1 { v30.s }[0], [x11]\n" + "b 83f\n" + "81:" // Oddments: Load requant params: Bit 2: Unset + "tbz x8, #1, 82f\n" + "ld1 { v22.d }[0], [x12], #0x8\n" + "ld1 { v31.d }[0], [x11], #0x8\n" + "tbz x8, #0, 83f\n" + "ld1 { v22.s }[2], [x12]\n" + "ld1 { v31.s }[2], [x11]\n" + "b 83f\n" + "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 83f\n" + "ld1 { v22.s }[0], [x12]\n" + "ld1 { v31.s }[0], [x11]\n" + "83:" // Oddments: Load requant params: Bit 2: End + "sqdmulh v15.4s, v15.4s, v22.4s\n" + "sqdmulh v13.4s, v13.4s, v22.4s\n" + "add x10, x10, x14\n" + "add x9, x9, x14\n" + "sqdmulh v11.4s, v11.4s, v22.4s\n" + "sqdmulh v23.4s, v23.4s, v22.4s\n" + "add x28, x28, x14\n" + "add x27, x27, x14\n" + "and v4.16b, v15.16b, v31.16b\n" + "sqdmulh v18.4s, v18.4s, v19.4s\n" + "and v1.16b, v13.16b, v31.16b\n" + "sqdmulh v17.4s, v17.4s, v19.4s\n" + "and v22.16b, v11.16b, v31.16b\n" + "sqdmulh v10.4s, v10.4s, 
v19.4s\n" + "and v20.16b, v23.16b, v31.16b\n" + "sqdmulh v9.4s, v9.4s, v19.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v19.16b, v18.16b, v30.16b\n" + "sshr v1.4s, v1.4s, #0x1f\n" + "and v27.16b, v17.16b, v30.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v25.16b, v10.16b, v30.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v0.16b, v9.16b, v30.16b\n" + "sqadd v15.4s, v15.4s, v4.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v13.4s, v13.4s, v1.4s\n" + "sshr v27.4s, v27.4s, #0x1f\n" + "sqadd v11.4s, v11.4s, v22.4s\n" + "sshr v25.4s, v25.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v20.4s\n" + "sshr v0.4s, v0.4s, #0x1f\n" + "srshl v15.4s, v15.4s, v31.4s\n" + "sqadd v18.4s, v18.4s, v19.4s\n" + "srshl v13.4s, v13.4s, v31.4s\n" + "sqadd v17.4s, v17.4s, v27.4s\n" + "srshl v11.4s, v11.4s, v31.4s\n" + "sqadd v10.4s, v10.4s, v25.4s\n" + "srshl v23.4s, v23.4s, v31.4s\n" + "sqadd v9.4s, v9.4s, v0.4s\n" + "srshl v18.4s, v18.4s, v30.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v17.4s, v17.4s, v30.4s\n" + "sqxtn v13.4h, v13.4s\n" + "srshl v10.4s, v10.4s, v30.4s\n" + "sqxtn v11.4h, v11.4s\n" + "srshl v9.4s, v9.4s, v30.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v15.8h, v18.4s\n" + "sqxtn2 v13.8h, v17.4s\n" + "sqxtn2 v11.8h, v10.4s\n" + "sqxtn2 v23.8h, v9.4s\n" + "sqadd v15.8h, v15.8h, v12.8h\n" + "sqadd v13.8h, v13.8h, v12.8h\n" + "sqadd v11.8h, v11.8h, v12.8h\n" + "sqadd v23.8h, v23.8h, v12.8h\n" + "smax v15.8h, v15.8h, v14.8h\n" + "smax v13.8h, v13.8h, v14.8h\n" + "smax v11.8h, v11.8h, v14.8h\n" + "smax v23.8h, v23.8h, v14.8h\n" + "smin v15.8h, v15.8h, v21.8h\n" + "smin v13.8h, v13.8h, v21.8h\n" + "smin v11.8h, v11.8h, v21.8h\n" + "smin v23.8h, v23.8h, v21.8h\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "tbz x8, #2, 85f\n" + "st1 { v15.s }[0], [x10], #0x4\n" + "st1 { v13.s }[0], [x9], #0x4\n" + "st1 { v11.s }[0], [x28], #0x4\n" + "st1 { v23.s }[0], [x27], #0x4\n" + "tbz x8, #1, 84f\n" + "st1 { v15.h }[2], [x10], #0x2\n" + "st1 { v13.h }[2], [x9], #0x2\n" + "st1 { v11.h }[2], [x28], #0x2\n" + "st1 { v23.h }[2], [x27], #0x2\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[6], [x10], #0x1\n" + "st1 { v13.b }[6], [x9], #0x1\n" + "st1 { v11.b }[6], [x28], #0x1\n" + "st1 { v23.b }[6], [x27], #0x1\n" + "b 87f\n" + "84:" // Oddments: Bit 2: Bit 1: Unset + "tbz x8, #0, 87f\n" + "st1 { v15.b }[4], [x10], #0x1\n" + "st1 { v13.b }[4], [x9], #0x1\n" + "st1 { v11.b }[4], [x28], #0x1\n" + "st1 { v23.b }[4], [x27], #0x1\n" + "b 87f\n" + "85:" // Oddments: Bit 2: Unset + "tbz x8, #1, 86f\n" + "st1 { v15.h }[0], [x10], #0x2\n" + "st1 { v13.h }[0], [x9], #0x2\n" + "st1 { v11.h }[0], [x28], #0x2\n" + "st1 { v23.h }[0], [x27], #0x2\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[2], [x10], #0x1\n" + "st1 { v13.b }[2], [x9], #0x1\n" + "st1 { v11.b }[2], [x28], #0x1\n" + "st1 { v23.b }[2], [x27], #0x1\n" + "b 87f\n" + "86:" // Oddments: Bit 2: Unset: Bit 1: Unset + "tbz x8, #0, 87f\n" + "st1 { v15.b }[0], [x10], #0x1\n" + "st1 { v13.b }[0], [x9], #0x1\n" + "st1 { v11.b }[0], [x28], #0x1\n" + "st1 { v23.b }[0], [x27], #0x1\n" + "87:" // Oddments: Bit 2: End + "88:" // End + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), 
[offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params)
+    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+  );
+}
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
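
Two patterns recur throughout the assembly above and in the kernels that follow. First, the "Oddments" ladders handle the trailing n_channels % 8 lanes by testing one bit of the remainder at a time (tbz #2, then #1, then #0) and loading or storing four, two, then one element. A scalar model of that tail logic, with illustrative names only:

#include <cstdint>
#include <cstring>

// Mirror of the tbz #2 / tbz #1 / tbz #0 load ladders: consume the
// remaining channel count (n % 8) one bit at a time.
static void load_oddments(uint8_t dst[8], const uint8_t *src, unsigned int remaining)
{
    unsigned int pos = 0;
    if (remaining & 4) { std::memcpy(dst + pos, src + pos, 4); pos += 4; }  // ld1 { v.s }[...]
    if (remaining & 2) { std::memcpy(dst + pos, src + pos, 2); pos += 2; }  // ld1 { v.h }[...]
    if (remaining & 1) { dst[pos] = src[pos]; }                             // ld1 { v.b }[...]
}

Second, the requantisation epilogue (sqdmulh, the and/sshr/sqadd rounding fix-up, srshl, then offset, clamp and narrow) applies the per-channel requant_muls and requant_shifts together with the Requantize32 c_offset, minval and maxval. A minimal scalar sketch of what it computes per lane, assuming shift is the non-positive per-channel value consumed by srshl; saturation corner cases are omitted, and the vector code narrows to 16 bits before adding the offset, which is equivalent for in-range values:

#include <algorithm>
#include <cstdint>

static uint8_t requantize_lane(int32_t acc, int32_t mul, int32_t shift,
                               int32_t c_offset, int32_t minval, int32_t maxval)
{
    // sqdmulh: high half of the doubling product
    // (saturation at INT32_MIN * INT32_MIN omitted in this sketch).
    int32_t v = static_cast<int32_t>((static_cast<int64_t>(acc) * mul * 2) >> 32);

    // and + sshr #31 + sqadd: nudge negatives down by one so the rounding
    // shift below rounds ties away from zero instead of towards +infinity.
    if (shift != 0 && v < 0) v--;

    // srshl by a negative amount: rounding arithmetic shift right.
    const int n = -shift;
    if (n > 0) v = (v + (1 << (n - 1))) >> n;

    // Add the splatted c_offset, clamp (smax/smin), narrow (uzp1).
    v += c_offset;
    return static_cast<uint8_t>(std::min(std::max(v, minval), maxval));
}
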
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
new file mode 100644
index 0000000000..482d1af80c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+#include "src/core/NEON/kernels/arm_conv/depthwise/interleaves/list.hpp"
+
+#include <cstdint>
+
+#pragma once
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *);
+
+class a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>
+{
+  using Parent = DepthwiseDepthfirstStrategy<uint8_t, uint8_t, uint8_t, int32_t>;
+
+  public:
+  constexpr static unsigned int kernel_rows = 5;
+  constexpr static unsigned int kernel_cols = 5;
+
+  constexpr static unsigned int stride_rows = 1;
+  constexpr static unsigned int stride_cols = 1;
+
+  a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; }
+
+  Parent::KernelType kernel = a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+  unsigned int get_accumulator_depth_vl(void) const override { return 2; }
+};
+
+} // namespace depthwise
+} // namespace arm_conv
+
+#endif // defined(__aarch64__)
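
The geometry declared by this strategy class fixes the shape of the input patch that the implementation file below must gather: a 2x2 output tile computed by a 5x5 kernel at stride 1 reads a 6x6 window of the input, which is why the Params structure in generic.cpp carries exactly 36 input pointers. A small constexpr sketch of that relationship (the helper name is ours, not the library's):

// One dimension of the input patch needed for an output tile.
constexpr unsigned int input_tile_dim(unsigned int output_dim,
                                      unsigned int kernel_dim,
                                      unsigned int stride)
{
    return (output_dim - 1) * stride + kernel_dim;
}

static_assert(input_tile_dim(2, 5, 1) == 6, "a 5x5/s1 kernel over a 2x2 tile reads 6 rows/cols");
static_assert(input_tile_dim(2, 5, 1) * input_tile_dim(2, 5, 1) == 36,
              "one entry of Params::inptrs per element of the 6x6 patch");

The constructor's permutation of the first fourteen raw pointers appears to pre-order them to match the sequence in which the assembly preamble consumes them from the inptrs block.
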
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
new file mode 100644
index 0000000000..7f1fd1d568
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp
@@ -0,0 +1,2185 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_gemm.hpp"
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(__aarch64__)
+
+namespace arm_conv {
+namespace depthwise {
+
+void a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
+  const unsigned int n_channels,
+  const uint8_t *const *const inptrs,
+  const uint8_t *const weights,
+  const int32_t *const bias,
+  const arm_gemm::Requantize32 &qp,
+  const int32_t *const requant_muls,
+  const int32_t *const requant_shifts,
+  uint8_t *const *const outptrs
+)
+{
+  struct Params
+  {
+    long unsigned int n_channels;
+    const void *weights;
+    const int32_t *bias;
+    const arm_gemm::Requantize32 *requant;
+    const int32_t *const requant_muls;
+    const int32_t *const requant_shifts;
+    uint8_t *const *const outptrs;
+    const uint8_t *inptrs[36];
+
+    Params(
+      long unsigned int n_channels,
+      const uint8_t *const *inptrs_raw,
+      const void *const weights,
+      const int32_t *const bias,
+      const arm_gemm::Requantize32 &qp,
+      const int32_t *const requant_muls,
+      const int32_t *const requant_shifts,
+      uint8_t *const *outptrs
+    ) : n_channels(n_channels), weights(weights), bias(bias),
+        requant(&qp), requant_muls(requant_muls),
+        requant_shifts(requant_shifts), outptrs(outptrs)
+    {
+      inptrs[0] = inptrs_raw[0];
+      inptrs[1] = inptrs_raw[1];
+      inptrs[2] = inptrs_raw[6];
+      inptrs[3] = inptrs_raw[7];
+      inptrs[4] = inptrs_raw[2];
+      inptrs[5] = inptrs_raw[8];
+      inptrs[6] = inptrs_raw[3];
+      inptrs[7] = inptrs_raw[4];
+      inptrs[8] = inptrs_raw[11];
+      inptrs[9] = inptrs_raw[12];
+      inptrs[10] = inptrs_raw[9];
+      inptrs[11] = inptrs_raw[10];
+      inptrs[12] = inptrs_raw[5];
+      inptrs[13] = inptrs_raw[13];
+      inptrs[14] = inptrs_raw[14];
+      inptrs[15] = inptrs_raw[15];
+      inptrs[16] = inptrs_raw[16];
+      inptrs[17] = inptrs_raw[17];
+      inptrs[18] = inptrs_raw[18];
+      inptrs[19] = inptrs_raw[19];
+      inptrs[20] = inptrs_raw[20];
+      inptrs[21] = inptrs_raw[21];
+      inptrs[22] = inptrs_raw[22];
+      inptrs[23] = inptrs_raw[23];
+      inptrs[24] = inptrs_raw[24];
+      inptrs[25] = inptrs_raw[25];
+      inptrs[26] = inptrs_raw[26];
+      inptrs[27] = inptrs_raw[27];
+      inptrs[28] = inptrs_raw[28];
+      inptrs[29] = inptrs_raw[29];
+      inptrs[30] = inptrs_raw[30];
+      inptrs[31] = inptrs_raw[31];
+      inptrs[32] = inptrs_raw[32];
+      inptrs[33] = inptrs_raw[33];
+      inptrs[34] = inptrs_raw[34];
+      inptrs[35] = inptrs_raw[35];
+
+    }
+  };
+
+  const Params params(n_channels, inptrs, weights, bias, qp,
+                      requant_muls, requant_shifts, outptrs);
+
+  __asm__ __volatile__(
+    "ldr x16, [%x[params], %[offsetof_Params_requant]]\n"
+    "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n"
+    "add x9, x16, %[offsetof_Requantize32_b_offset]\n"
+    "add x19, x16, %[offsetof_Requantize32_c_offset]\n"
+    "ldr x10, [%x[params], %[offsetof_Params_outptrs]]\n"
+    "add x24, x16, %[offsetof_Requantize32_minval]\n"
+    "add x2, x16, %[offsetof_Requantize32_maxval]\n"
+    "ldr x8, [%x[params], %[offsetof_Params_weights]]\n"
+    "ld1r { v15.16b }, [x9]\n"
+    "ld1r { v16.8h }, [x19]\n"
+    "lsr x3, x4, #0x3\n"
+    "mov x1, #0x0\n"
+    "ld1r { v12.8h }, [x24]\n"
+    "ld1r { v13.8h }, [x2]\n"
+    "mov x2, #0x0\n"
+    "add x0, %x[params], %[offsetof_Params_inptrs]\n"
+    "ldr x5, [%x[params], %[offsetof_Params_requant_muls]]\n"
+    "ldr x6, [%x[params], %[offsetof_Params_requant_shifts]]\n"
+    "ldp x21, x15, [x10, #0x0]\n"
+    "ldp x17, x16, [x10, #0x10]\n"
+    "cbz x3, 3f\n"
+    "ldr x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr q11, [x19, #0x0]\n"
+    "subs x3, x3, #0x1\n"
+    "mov v14.16b, v11.16b\n"
+    "ldr q21, [x19, #0x10]\n"
+    "add x19, x19, #0x20\n"
+    "str x19, [%x[params], %[offsetof_Params_bias]]\n"
+    "ldr d0, [x8, #0x0]\n"
"ldr d1, [x8, #0x8]\n" + "ldr d2, [x8, #0x10]\n" + "mov v10.16b, v21.16b\n" + "mov v9.16b, v11.16b\n" + "ldr d3, [x8, #0x18]\n" + "ldr d4, [x8, #0x20]\n" + "mov v8.16b, v21.16b\n" + "mov v7.16b, v11.16b\n" + "ldp x28, x27, [x0, #0x0]\n" + "ldp x10, x26, [x0, #0x10]\n" + "mov v6.16b, v21.16b\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "ldp x24, x23, [x0, #0x20]\n" + "ldp x22, x25, [x0, #0x30]\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "ldp x20, x19, [x0, #0x40]\n" + "ldr d31, [x28, x1]\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "ldr d30, [x27, x1]\n" + "ldr d29, [x10, x1]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "ushll v30.8h, v30.8b, #0x0\n" + "ldr d28, [x26, x1]\n" + "ldr d27, [x24, x1]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "ushll v28.8h, v28.8b, #0x0\n" + "ldr d23, [x23, x1]\n" + "ldr d25, [x22, x1]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "ushll v23.8h, v23.8b, #0x0\n" + "ldr d24, [x25, x1]\n" + "ldr d26, [x20, x1]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "ushll v24.8h, v24.8b, #0x0\n" + "ldr d22, [x19, x1]\n" + "ushll v26.8h, v26.8b, #0x0\n" + "ushll v22.8h, v22.8b, #0x0\n" + "beq 2f\n" + "1:" // Loop + "smlal v11.4s, v31.4h, v0.4h\n" + "smlal2 v21.4s, v31.8h, v0.8h\n" + "ldr x19, [x0, #0x50]\n" + "ldr d31, [x19, x1]\n" + "smlal v14.4s, v30.4h, v0.4h\n" + "smlal v9.4s, v29.4h, v0.4h\n" + "ldr x20, [x0, #0x58]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v7.4s, v28.4h, v0.4h\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "ldr x19, [x0, #0x60]\n" + "ldr x24, [x0, #0x68]\n" + "smlal2 v8.4s, v29.8h, v0.8h\n" + "smlal v11.4s, v30.4h, v1.4h\n" + "ldr x23, [x0, #0x70]\n" + "ldr x26, [x0, #0x78]\n" + "smlal2 v21.4s, v30.8h, v1.8h\n" + "smlal2 v6.4s, v28.8h, v0.8h\n" + "ldr d30, [x20, x1]\n" + "ushll v30.8h, v30.8b, #0x0\n" + "smlal v14.4s, v27.4h, v1.4h\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "ldr d0, [x8, #0x28]\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "smlal v7.4s, v23.4h, v1.4h\n" + "smlal2 v10.4s, v27.8h, v1.8h\n" + "ldr x7, [x0, #0x80]\n" + "ldr x22, [x0, #0x88]\n" + "smlal2 v8.4s, v28.8h, v1.8h\n" + "smlal v11.4s, v27.4h, v2.4h\n" + "ldr x20, [x0, #0x90]\n" + "ldr x14, [x0, #0x98]\n" + "smlal2 v21.4s, v27.8h, v2.8h\n" + "smlal2 v6.4s, v23.8h, v1.8h\n" + "ldr d27, [x19, x1]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v14.4s, v25.4h, v2.4h\n" + "smlal v9.4s, v23.4h, v2.4h\n" + "ldr d1, [x8, #0x30]\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "smlal v7.4s, v31.4h, v2.4h\n" + "smlal2 v10.4s, v25.8h, v2.8h\n" + "ldr x19, [x0, #0xa0]\n" + "ldr x13, [x0, #0xa8]\n" + "smlal2 v8.4s, v23.8h, v2.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "ldr x12, [x0, #0xb0]\n" + "ldr x11, [x0, #0xb8]\n" + "smlal2 v21.4s, v25.8h, v3.8h\n" + "smlal2 v6.4s, v31.8h, v2.8h\n" + "ldr d25, [x24, x1]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal v14.4s, v24.4h, v3.4h\n" + "smlal v9.4s, v31.4h, v3.4h\n" + "ldr d2, [x8, #0x38]\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "smlal v7.4s, v30.4h, v3.4h\n" + "smlal2 v10.4s, v24.8h, v3.8h\n" + "ldr x10, [x0, #0xc0]\n" + "ldr x9, [x0, #0xc8]\n" + "smlal2 v8.4s, v31.8h, v3.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "ldr x28, [x0, #0xd0]\n" + "ldr x27, [x0, #0xd8]\n" + "smlal2 v21.4s, v24.8h, v4.8h\n" + "smlal2 v6.4s, v30.8h, v3.8h\n" + "ldr d24, [x23, x1]\n" + "ushll v24.8h, v24.8b, #0x0\n" + "smlal v14.4s, v27.4h, v4.4h\n" + "smlal v9.4s, v30.4h, v4.4h\n" + "ldr d3, [x8, #0x40]\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "smlal v7.4s, v26.4h, v4.4h\n" + "smlal2 v10.4s, v27.8h, v4.8h\n" + "ldr d27, [x26, x1]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal2 v8.4s, v30.8h, 
v4.8h\n" + "smlal v11.4s, v29.4h, v0.4h\n" + "ldr x26, [x0, #0xe0]\n" + "ldr x25, [x0, #0xe8]\n" + "smlal2 v21.4s, v29.8h, v0.8h\n" + "smlal2 v6.4s, v26.8h, v4.8h\n" + "ldr d4, [x8, #0x48]\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "smlal v14.4s, v28.4h, v0.4h\n" + "smlal v9.4s, v22.4h, v0.4h\n" + "ldr x24, [x0, #0xf0]\n" + "ldr q17, [x5, #0x0]\n" + "smlal v7.4s, v25.4h, v0.4h\n" + "smlal2 v10.4s, v28.8h, v0.8h\n" + "ldr q5, [x6, #0x0]\n" + "ldr q18, [x5, #0x10]\n" + "smlal2 v8.4s, v22.8h, v0.8h\n" + "smlal v11.4s, v28.4h, v1.4h\n" + "ldr q29, [x6, #0x10]\n" + "subs x3, x3, #0x1\n" + "smlal2 v21.4s, v28.8h, v1.8h\n" + "smlal2 v6.4s, v25.8h, v0.8h\n" + "ldr d28, [x22, x1]\n" + "ldr d0, [x8, #0x50]\n" + "smlal v14.4s, v23.4h, v1.4h\n" + "smlal v9.4s, v25.4h, v1.4h\n" + "ushll v28.8h, v28.8b, #0x0\n" + "ldr x23, [x0, #0xf8]\n" + "smlal v7.4s, v24.4h, v1.4h\n" + "smlal2 v10.4s, v23.8h, v1.8h\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "add x5, x5, #0x20\n" + "smlal2 v8.4s, v25.8h, v1.8h\n" + "smlal v11.4s, v23.4h, v2.4h\n" + "add x6, x6, #0x20\n" + "smlal2 v21.4s, v23.8h, v2.8h\n" + "ldr d23, [x7, x1]\n" + "smlal2 v6.4s, v24.8h, v1.8h\n" + "ushll v23.8h, v23.8b, #0x0\n" + "smlal v14.4s, v31.4h, v2.4h\n" + "smlal v9.4s, v24.4h, v2.4h\n" + "ldr d1, [x8, #0x58]\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "smlal v7.4s, v27.4h, v2.4h\n" + "smlal2 v10.4s, v31.8h, v2.8h\n" + "ldr x22, [x0, #0x100]\n" + "smlal2 v8.4s, v24.8h, v2.8h\n" + "smlal v11.4s, v31.4h, v3.4h\n" + "smlal2 v21.4s, v31.8h, v3.8h\n" + "smlal2 v6.4s, v27.8h, v2.8h\n" + "ldr d31, [x20, x1]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v14.4s, v30.4h, v3.4h\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "ldr d2, [x8, #0x60]\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "smlal v7.4s, v23.4h, v3.4h\n" + "smlal2 v10.4s, v30.8h, v3.8h\n" + "ldr x7, [x0, #0x108]\n" + "smlal2 v8.4s, v27.8h, v3.8h\n" + "smlal v11.4s, v30.4h, v4.4h\n" + "smlal2 v21.4s, v30.8h, v4.8h\n" + "ldr d30, [x14, x1]\n" + "smlal2 v6.4s, v23.8h, v3.8h\n" + "ushll v30.8h, v30.8b, #0x0\n" + "smlal v14.4s, v26.4h, v4.4h\n" + "smlal v9.4s, v23.4h, v4.4h\n" + "ldr d3, [x8, #0x68]\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "smlal v7.4s, v28.4h, v4.4h\n" + "smlal2 v10.4s, v26.8h, v4.8h\n" + "ldr d26, [x19, x1]\n" + "ushll v26.8h, v26.8b, #0x0\n" + "smlal2 v8.4s, v23.8h, v4.8h\n" + "smlal v11.4s, v22.4h, v0.4h\n" + "ldr x20, [x0, #0x110]\n" + "ldr x19, [x0, #0x118]\n" + "smlal2 v21.4s, v22.8h, v0.8h\n" + "smlal2 v6.4s, v28.8h, v4.8h\n" + "ldr d4, [x8, #0x70]\n" + "ldr d22, [x11, x1]\n" + "smlal v14.4s, v25.4h, v0.4h\n" + "smlal v9.4s, v31.4h, v0.4h\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "smlal v7.4s, v30.4h, v0.4h\n" + "smlal2 v10.4s, v25.8h, v0.8h\n" + "ushll v22.8h, v22.8b, #0x0\n" + "smlal2 v8.4s, v31.8h, v0.8h\n" + "smlal v11.4s, v25.4h, v1.4h\n" + "smlal2 v21.4s, v25.8h, v1.8h\n" + "ldr d25, [x13, x1]\n" + "smlal2 v6.4s, v30.8h, v0.8h\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal v14.4s, v24.4h, v1.4h\n" + "smlal v9.4s, v30.4h, v1.4h\n" + "ldr d0, [x8, #0x78]\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "smlal v7.4s, v26.4h, v1.4h\n" + "smlal2 v10.4s, v24.8h, v1.8h\n" + "smlal2 v8.4s, v30.8h, v1.8h\n" + "smlal v11.4s, v24.4h, v2.4h\n" + "smlal2 v21.4s, v24.8h, v2.8h\n" + "ldr d24, [x12, x1]\n" + "smlal2 v6.4s, v26.8h, v1.8h\n" + "ushll v24.8h, v24.8b, #0x0\n" + "smlal v14.4s, v27.4h, v2.4h\n" + "smlal v9.4s, v26.4h, v2.4h\n" + "ldr d1, [x8, #0x80]\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "smlal v7.4s, v25.4h, v2.4h\n" + "smlal2 v10.4s, v27.8h, v2.8h\n" + "smlal2 v8.4s, v26.8h, v2.8h\n" + "smlal v11.4s, v27.4h, v3.4h\n" + 
"smlal2 v21.4s, v27.8h, v3.8h\n" + "smlal2 v6.4s, v25.8h, v2.8h\n" + "ldr d27, [x10, x1]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v14.4s, v23.4h, v3.4h\n" + "smlal v9.4s, v25.4h, v3.4h\n" + "ldr d2, [x8, #0x88]\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "smlal v7.4s, v24.4h, v3.4h\n" + "smlal2 v10.4s, v23.8h, v3.8h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "smlal v11.4s, v23.4h, v4.4h\n" + "smlal2 v21.4s, v23.8h, v4.8h\n" + "ldr d23, [x9, x1]\n" + "smlal2 v6.4s, v24.8h, v3.8h\n" + "ushll v23.8h, v23.8b, #0x0\n" + "smlal v14.4s, v28.4h, v4.4h\n" + "smlal v9.4s, v24.4h, v4.4h\n" + "ldr d3, [x8, #0x90]\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "smlal v7.4s, v22.4h, v4.4h\n" + "smlal2 v10.4s, v28.8h, v4.8h\n" + "ldr d28, [x26, x1]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "smlal v11.4s, v31.4h, v0.4h\n" + "smlal2 v21.4s, v31.8h, v0.8h\n" + "ldr d31, [x28, x1]\n" + "smlal2 v6.4s, v22.8h, v4.8h\n" + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v14.4s, v30.4h, v0.4h\n" + "smlal v9.4s, v27.4h, v0.4h\n" + "ldr d4, [x8, #0x98]\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "smlal v7.4s, v23.4h, v0.4h\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "smlal2 v8.4s, v27.8h, v0.8h\n" + "smlal v11.4s, v30.4h, v1.4h\n" + "smlal2 v21.4s, v30.8h, v1.8h\n" + "ldr d30, [x27, x1]\n" + "smlal2 v6.4s, v23.8h, v0.8h\n" + "ushll v30.8h, v30.8b, #0x0\n" + "smlal v14.4s, v26.4h, v1.4h\n" + "smlal v9.4s, v23.4h, v1.4h\n" + "ldr d0, [x8, #0xa0]\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "smlal v7.4s, v31.4h, v1.4h\n" + "smlal2 v10.4s, v26.8h, v1.8h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "smlal v11.4s, v26.4h, v2.4h\n" + "smlal2 v21.4s, v26.8h, v2.8h\n" + "smlal2 v6.4s, v31.8h, v1.8h\n" + "ldr d26, [x25, x1]\n" + "ushll v26.8h, v26.8b, #0x0\n" + "smlal v14.4s, v25.4h, v2.4h\n" + "smlal v9.4s, v31.4h, v2.4h\n" + "ldr d1, [x8, #0xa8]\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "smlal v7.4s, v30.4h, v2.4h\n" + "smlal2 v10.4s, v25.8h, v2.8h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v21.4s, v25.8h, v3.8h\n" + "smlal2 v6.4s, v30.8h, v2.8h\n" + "ldr d25, [x24, x1]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal v14.4s, v24.4h, v3.4h\n" + "smlal v9.4s, v30.4h, v3.4h\n" + "ldr d2, [x8, #0xb0]\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "smlal v7.4s, v28.4h, v3.4h\n" + "smlal2 v10.4s, v24.8h, v3.8h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v21.4s, v24.8h, v4.8h\n" + "ldr d24, [x23, x1]\n" + "smlal2 v6.4s, v28.8h, v3.8h\n" + "ushll v24.8h, v24.8b, #0x0\n" + "smlal v14.4s, v22.4h, v4.4h\n" + "smlal v9.4s, v28.4h, v4.4h\n" + "ldr d3, [x8, #0xb8]\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "smlal v7.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "smlal v11.4s, v27.4h, v0.4h\n" + "smlal2 v21.4s, v27.8h, v0.8h\n" + "ldr d27, [x22, x1]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal2 v10.4s, v22.8h, v4.8h\n" + "smlal2 v6.4s, v26.8h, v4.8h\n" + "ldr d4, [x8, #0xc0]\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "smlal v14.4s, v23.4h, v0.4h\n" + "smlal v9.4s, v25.4h, v0.4h\n" + "add x8, x8, #0xc8\n" + "smlal v7.4s, v24.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "ldr d25, [x7, x1]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal2 v10.4s, v23.8h, v0.8h\n" + "smlal2 v6.4s, v24.8h, v0.8h\n" + "smlal v11.4s, v23.4h, v1.4h\n" + "smlal v14.4s, v31.4h, v1.4h\n" + "smlal v9.4s, v24.4h, v1.4h\n" + "smlal v7.4s, v27.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "ldr d24, [x20, x1]\n" + "smlal2 v21.4s, v23.8h, v1.8h\n" + "ushll v24.8h, v24.8b, #0x0\n" + "smlal2 v10.4s, v31.8h, v1.8h\n" + "smlal2 
v6.4s, v27.8h, v1.8h\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal v14.4s, v30.4h, v2.4h\n" + "smlal v9.4s, v27.4h, v2.4h\n" + "smlal v7.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "ldr d27, [x19, x1]\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal2 v10.4s, v30.8h, v2.8h\n" + "smlal2 v6.4s, v25.8h, v2.8h\n" + "add x1, x1, #0x8\n" + "smlal v11.4s, v30.4h, v3.4h\n" + "smlal v14.4s, v28.4h, v3.4h\n" + "smlal v9.4s, v25.4h, v3.4h\n" + "smlal v7.4s, v24.4h, v3.4h\n" + "smlal2 v21.4s, v30.8h, v3.8h\n" + "smlal2 v10.4s, v28.8h, v3.8h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "smlal2 v6.4s, v24.8h, v3.8h\n" + "smlal v11.4s, v28.4h, v4.4h\n" + "smlal v14.4s, v26.4h, v4.4h\n" + "sqdmulh v11.4s, v11.4s, v17.4s\n" + "smlal v9.4s, v24.4h, v4.4h\n" + "smlal v7.4s, v27.4h, v4.4h\n" + "sqdmulh v14.4s, v14.4s, v17.4s\n" + "smlal2 v21.4s, v28.8h, v4.8h\n" + "smlal2 v10.4s, v26.8h, v4.8h\n" + "sqdmulh v9.4s, v9.4s, v17.4s\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "smlal2 v6.4s, v27.8h, v4.8h\n" + "sqdmulh v7.4s, v7.4s, v17.4s\n" + "and v23.16b, v11.16b, v5.16b\n" + "sqdmulh v21.4s, v21.4s, v18.4s\n" + "and v22.16b, v14.16b, v5.16b\n" + "sqdmulh v10.4s, v10.4s, v18.4s\n" + "and v17.16b, v9.16b, v5.16b\n" + "sqdmulh v8.4s, v8.4s, v18.4s\n" + "and v20.16b, v7.16b, v5.16b\n" + "sqdmulh v6.4s, v6.4s, v18.4s\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "and v19.16b, v21.16b, v29.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v18.16b, v10.16b, v29.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v26.16b, v8.16b, v29.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v4.16b, v6.16b, v29.16b\n" + "sqadd v11.4s, v11.4s, v23.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v22.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v17.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sqadd v7.4s, v7.4s, v20.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "srshl v11.4s, v11.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v19.4s\n" + "srshl v14.4s, v14.4s, v5.4s\n" + "sqadd v10.4s, v10.4s, v18.4s\n" + "srshl v9.4s, v9.4s, v5.4s\n" + "sqadd v8.4s, v8.4s, v26.4s\n" + "srshl v7.4s, v7.4s, v5.4s\n" + "sqadd v6.4s, v6.4s, v4.4s\n" + "srshl v21.4s, v21.4s, v29.4s\n" + "sqxtn v11.4h, v11.4s\n" + "srshl v10.4s, v10.4s, v29.4s\n" + "sqxtn v14.4h, v14.4s\n" + "srshl v8.4s, v8.4s, v29.4s\n" + "sqxtn v9.4h, v9.4s\n" + "srshl v6.4s, v6.4s, v29.4s\n" + "sqxtn v7.4h, v7.4s\n" + "sqxtn2 v11.8h, v21.4s\n" + "sqxtn2 v14.8h, v10.4s\n" + "sqxtn2 v9.8h, v8.4s\n" + "sqxtn2 v7.8h, v6.4s\n" + "sqadd v11.8h, v11.8h, v16.8h\n" + "sqadd v14.8h, v14.8h, v16.8h\n" + "sqadd v9.8h, v9.8h, v16.8h\n" + "sqadd v7.8h, v7.8h, v16.8h\n" + "smax v11.8h, v11.8h, v12.8h\n" + "smax v14.8h, v14.8h, v12.8h\n" + "smax v9.8h, v9.8h, v12.8h\n" + "smax v7.8h, v7.8h, v12.8h\n" + "smin v11.8h, v11.8h, v13.8h\n" + "smin v14.8h, v14.8h, v13.8h\n" + "smin v9.8h, v9.8h, v13.8h\n" + "smin v7.8h, v7.8h, v13.8h\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "uzp1 v14.16b, v14.16b, v14.16b\n" + "str d11, [x21, x2]\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "uzp1 v7.16b, v7.16b, v7.16b\n" + "str d14, [x15, x2]\n" + "str d9, [x17, x2]\n" + "str d7, [x16, x2]\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q11, [x19, #0x0]\n" + "add x2, x2, #0x8\n" + "ldr q21, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr d0, [x8, #0x0]\n" + "ldr d1, [x8, #0x8]\n" + "ldr d2, [x8, #0x10]\n" + "mov v14.16b, v11.16b\n" + "mov v10.16b, v21.16b\n" + "ldr d3, [x8, #0x18]\n" + "ldr d4, [x8, #0x20]\n" + "mov v9.16b, 
v11.16b\n" + "mov v8.16b, v21.16b\n" + "ldp x28, x27, [x0, #0x0]\n" + "ldp x10, x26, [x0, #0x10]\n" + "mov v7.16b, v11.16b\n" + "mov v6.16b, v21.16b\n" + "ldp x24, x23, [x0, #0x20]\n" + "ldp x22, x25, [x0, #0x30]\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "ldp x20, x19, [x0, #0x40]\n" + "ldr d31, [x28, x1]\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "ldr d30, [x27, x1]\n" + "ldr d29, [x10, x1]\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "ushll v31.8h, v31.8b, #0x0\n" + "ldr d28, [x26, x1]\n" + "ldr d27, [x24, x1]\n" + "ushll v30.8h, v30.8b, #0x0\n" + "ushll v29.8h, v29.8b, #0x0\n" + "ldr d23, [x23, x1]\n" + "ldr d25, [x22, x1]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "ushll v27.8h, v27.8b, #0x0\n" + "ldr d24, [x25, x1]\n" + "ldr d26, [x20, x1]\n" + "ushll v23.8h, v23.8b, #0x0\n" + "ushll v25.8h, v25.8b, #0x0\n" + "ldr d22, [x19, x1]\n" + "ushll v24.8h, v24.8b, #0x0\n" + "ushll v26.8h, v26.8b, #0x0\n" + "ushll v22.8h, v22.8b, #0x0\n" + "bgt 1b\n" + "2:" // Tail + "smlal v11.4s, v31.4h, v0.4h\n" + "smlal2 v21.4s, v31.8h, v0.8h\n" + "ldr x19, [x0, #0x50]\n" + "ldr d31, [x19, x1]\n" + "smlal v14.4s, v30.4h, v0.4h\n" + "smlal v9.4s, v29.4h, v0.4h\n" + "ldr x20, [x0, #0x58]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v7.4s, v28.4h, v0.4h\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "ldr x19, [x0, #0x60]\n" + "ldr x24, [x0, #0x68]\n" + "smlal2 v8.4s, v29.8h, v0.8h\n" + "smlal v11.4s, v30.4h, v1.4h\n" + "ldr x23, [x0, #0x70]\n" + "ldr x26, [x0, #0x78]\n" + "smlal2 v21.4s, v30.8h, v1.8h\n" + "smlal2 v6.4s, v28.8h, v0.8h\n" + "ldr d30, [x20, x1]\n" + "ushll v30.8h, v30.8b, #0x0\n" + "smlal v14.4s, v27.4h, v1.4h\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "ldr d0, [x8, #0x28]\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "smlal v7.4s, v23.4h, v1.4h\n" + "smlal2 v10.4s, v27.8h, v1.8h\n" + "ldr x7, [x0, #0x80]\n" + "ldr x22, [x0, #0x88]\n" + "smlal2 v8.4s, v28.8h, v1.8h\n" + "smlal v11.4s, v27.4h, v2.4h\n" + "ldr x20, [x0, #0x90]\n" + "ldr x14, [x0, #0x98]\n" + "smlal2 v21.4s, v27.8h, v2.8h\n" + "smlal2 v6.4s, v23.8h, v1.8h\n" + "ldr d27, [x19, x1]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v14.4s, v25.4h, v2.4h\n" + "smlal v9.4s, v23.4h, v2.4h\n" + "ldr d1, [x8, #0x30]\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "smlal v7.4s, v31.4h, v2.4h\n" + "smlal2 v10.4s, v25.8h, v2.8h\n" + "ldr x19, [x0, #0xa0]\n" + "ldr x13, [x0, #0xa8]\n" + "smlal2 v8.4s, v23.8h, v2.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "ldr x12, [x0, #0xb0]\n" + "ldr x11, [x0, #0xb8]\n" + "smlal2 v21.4s, v25.8h, v3.8h\n" + "smlal2 v6.4s, v31.8h, v2.8h\n" + "ldr d25, [x24, x1]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal v14.4s, v24.4h, v3.4h\n" + "smlal v9.4s, v31.4h, v3.4h\n" + "ldr d2, [x8, #0x38]\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "smlal v7.4s, v30.4h, v3.4h\n" + "smlal2 v10.4s, v24.8h, v3.8h\n" + "ldr x10, [x0, #0xc0]\n" + "ldr x9, [x0, #0xc8]\n" + "smlal2 v8.4s, v31.8h, v3.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "ldr x28, [x0, #0xd0]\n" + "ldr x27, [x0, #0xd8]\n" + "smlal2 v21.4s, v24.8h, v4.8h\n" + "smlal2 v6.4s, v30.8h, v3.8h\n" + "ldr d24, [x23, x1]\n" + "ushll v24.8h, v24.8b, #0x0\n" + "smlal v14.4s, v27.4h, v4.4h\n" + "smlal v9.4s, v30.4h, v4.4h\n" + "ldr d3, [x8, #0x40]\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "smlal v7.4s, v26.4h, v4.4h\n" + "smlal2 v10.4s, v27.8h, v4.8h\n" + "ldr d27, [x26, x1]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal2 v8.4s, v30.8h, v4.8h\n" + "smlal v11.4s, v29.4h, v0.4h\n" + "ldr x26, [x0, #0xe0]\n" + "ldr x25, [x0, #0xe8]\n" + "smlal2 v21.4s, v29.8h, v0.8h\n" + "smlal2 v6.4s, 
v26.8h, v4.8h\n" + "ldr d4, [x8, #0x48]\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "smlal v14.4s, v28.4h, v0.4h\n" + "smlal v9.4s, v22.4h, v0.4h\n" + "ldr x24, [x0, #0xf0]\n" + "ldr x23, [x0, #0xf8]\n" + "smlal v7.4s, v25.4h, v0.4h\n" + "smlal2 v10.4s, v28.8h, v0.8h\n" + "ldr q17, [x5, #0x0]\n" + "ldr q5, [x6, #0x0]\n" + "smlal2 v8.4s, v22.8h, v0.8h\n" + "smlal v11.4s, v28.4h, v1.4h\n" + "ldr q18, [x5, #0x10]\n" + "ldr q29, [x6, #0x10]\n" + "smlal2 v21.4s, v28.8h, v1.8h\n" + "smlal2 v6.4s, v25.8h, v0.8h\n" + "ldr d28, [x22, x1]\n" + "ldr d0, [x8, #0x50]\n" + "smlal v14.4s, v23.4h, v1.4h\n" + "smlal v9.4s, v25.4h, v1.4h\n" + "ushll v28.8h, v28.8b, #0x0\n" + "ldr x22, [x0, #0x100]\n" + "smlal v7.4s, v24.4h, v1.4h\n" + "smlal2 v10.4s, v23.8h, v1.8h\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "tst x4, #0x7\n" + "smlal2 v8.4s, v25.8h, v1.8h\n" + "smlal v11.4s, v23.4h, v2.4h\n" + "add x5, x5, #0x20\n" + "add x6, x6, #0x20\n" + "smlal2 v21.4s, v23.8h, v2.8h\n" + "ldr d23, [x7, x1]\n" + "smlal2 v6.4s, v24.8h, v1.8h\n" + "ushll v23.8h, v23.8b, #0x0\n" + "smlal v14.4s, v31.4h, v2.4h\n" + "smlal v9.4s, v24.4h, v2.4h\n" + "ldr d1, [x8, #0x58]\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "smlal v7.4s, v27.4h, v2.4h\n" + "smlal2 v10.4s, v31.8h, v2.8h\n" + "ldr x7, [x0, #0x108]\n" + "smlal2 v8.4s, v24.8h, v2.8h\n" + "smlal v11.4s, v31.4h, v3.4h\n" + "smlal2 v21.4s, v31.8h, v3.8h\n" + "smlal2 v6.4s, v27.8h, v2.8h\n" + "ldr d31, [x20, x1]\n" + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v14.4s, v30.4h, v3.4h\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "ldr d2, [x8, #0x60]\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "smlal v7.4s, v23.4h, v3.4h\n" + "smlal2 v10.4s, v30.8h, v3.8h\n" + "ldr x20, [x0, #0x110]\n" + "smlal2 v8.4s, v27.8h, v3.8h\n" + "smlal v11.4s, v30.4h, v4.4h\n" + "smlal2 v21.4s, v30.8h, v4.8h\n" + "ldr d30, [x14, x1]\n" + "smlal2 v6.4s, v23.8h, v3.8h\n" + "ushll v30.8h, v30.8b, #0x0\n" + "smlal v14.4s, v26.4h, v4.4h\n" + "smlal v9.4s, v23.4h, v4.4h\n" + "ldr d3, [x8, #0x68]\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "smlal v7.4s, v28.4h, v4.4h\n" + "smlal2 v10.4s, v26.8h, v4.8h\n" + "ldr d26, [x19, x1]\n" + "ushll v26.8h, v26.8b, #0x0\n" + "smlal2 v8.4s, v23.8h, v4.8h\n" + "smlal v11.4s, v22.4h, v0.4h\n" + "ldr x19, [x0, #0x118]\n" + "smlal2 v21.4s, v22.8h, v0.8h\n" + "smlal2 v6.4s, v28.8h, v4.8h\n" + "ldr d4, [x8, #0x70]\n" + "ldr d22, [x11, x1]\n" + "smlal v14.4s, v25.4h, v0.4h\n" + "smlal v9.4s, v31.4h, v0.4h\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "smlal v7.4s, v30.4h, v0.4h\n" + "smlal2 v10.4s, v25.8h, v0.8h\n" + "ushll v22.8h, v22.8b, #0x0\n" + "smlal2 v8.4s, v31.8h, v0.8h\n" + "smlal v11.4s, v25.4h, v1.4h\n" + "smlal2 v21.4s, v25.8h, v1.8h\n" + "ldr d25, [x13, x1]\n" + "smlal2 v6.4s, v30.8h, v0.8h\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal v14.4s, v24.4h, v1.4h\n" + "smlal v9.4s, v30.4h, v1.4h\n" + "ldr d0, [x8, #0x78]\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "smlal v7.4s, v26.4h, v1.4h\n" + "smlal2 v10.4s, v24.8h, v1.8h\n" + "smlal2 v8.4s, v30.8h, v1.8h\n" + "smlal v11.4s, v24.4h, v2.4h\n" + "smlal2 v21.4s, v24.8h, v2.8h\n" + "ldr d24, [x12, x1]\n" + "smlal2 v6.4s, v26.8h, v1.8h\n" + "ushll v24.8h, v24.8b, #0x0\n" + "smlal v14.4s, v27.4h, v2.4h\n" + "smlal v9.4s, v26.4h, v2.4h\n" + "ldr d1, [x8, #0x80]\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "smlal v7.4s, v25.4h, v2.4h\n" + "smlal2 v10.4s, v27.8h, v2.8h\n" + "smlal2 v8.4s, v26.8h, v2.8h\n" + "smlal v11.4s, v27.4h, v3.4h\n" + "smlal2 v21.4s, v27.8h, v3.8h\n" + "smlal2 v6.4s, v25.8h, v2.8h\n" + "ldr d27, [x10, x1]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v14.4s, v23.4h, 
v3.4h\n" + "smlal v9.4s, v25.4h, v3.4h\n" + "ldr d2, [x8, #0x88]\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "smlal v7.4s, v24.4h, v3.4h\n" + "smlal2 v10.4s, v23.8h, v3.8h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "smlal v11.4s, v23.4h, v4.4h\n" + "smlal2 v21.4s, v23.8h, v4.8h\n" + "ldr d23, [x9, x1]\n" + "smlal2 v6.4s, v24.8h, v3.8h\n" + "ushll v23.8h, v23.8b, #0x0\n" + "smlal v14.4s, v28.4h, v4.4h\n" + "smlal v9.4s, v24.4h, v4.4h\n" + "ldr d3, [x8, #0x90]\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "smlal v7.4s, v22.4h, v4.4h\n" + "smlal2 v10.4s, v28.8h, v4.8h\n" + "ldr d28, [x26, x1]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "smlal v11.4s, v31.4h, v0.4h\n" + "smlal2 v21.4s, v31.8h, v0.8h\n" + "ldr d31, [x28, x1]\n" + "smlal2 v6.4s, v22.8h, v4.8h\n" + "ushll v31.8h, v31.8b, #0x0\n" + "smlal v14.4s, v30.4h, v0.4h\n" + "smlal v9.4s, v27.4h, v0.4h\n" + "ldr d4, [x8, #0x98]\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "smlal v7.4s, v23.4h, v0.4h\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "smlal2 v8.4s, v27.8h, v0.8h\n" + "smlal v11.4s, v30.4h, v1.4h\n" + "smlal2 v21.4s, v30.8h, v1.8h\n" + "ldr d30, [x27, x1]\n" + "smlal2 v6.4s, v23.8h, v0.8h\n" + "ushll v30.8h, v30.8b, #0x0\n" + "smlal v14.4s, v26.4h, v1.4h\n" + "smlal v9.4s, v23.4h, v1.4h\n" + "ldr d0, [x8, #0xa0]\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "smlal v7.4s, v31.4h, v1.4h\n" + "smlal2 v10.4s, v26.8h, v1.8h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "smlal v11.4s, v26.4h, v2.4h\n" + "smlal2 v21.4s, v26.8h, v2.8h\n" + "smlal2 v6.4s, v31.8h, v1.8h\n" + "ldr d26, [x25, x1]\n" + "ushll v26.8h, v26.8b, #0x0\n" + "smlal v14.4s, v25.4h, v2.4h\n" + "smlal v9.4s, v31.4h, v2.4h\n" + "ldr d1, [x8, #0xa8]\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "smlal v7.4s, v30.4h, v2.4h\n" + "smlal2 v10.4s, v25.8h, v2.8h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v21.4s, v25.8h, v3.8h\n" + "smlal2 v6.4s, v30.8h, v2.8h\n" + "ldr d25, [x24, x1]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal v14.4s, v24.4h, v3.4h\n" + "smlal v9.4s, v30.4h, v3.4h\n" + "ldr d2, [x8, #0xb0]\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "smlal v7.4s, v28.4h, v3.4h\n" + "smlal2 v10.4s, v24.8h, v3.8h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v21.4s, v24.8h, v4.8h\n" + "ldr d24, [x23, x1]\n" + "smlal2 v6.4s, v28.8h, v3.8h\n" + "ushll v24.8h, v24.8b, #0x0\n" + "smlal v14.4s, v22.4h, v4.4h\n" + "smlal v9.4s, v28.4h, v4.4h\n" + "ldr d3, [x8, #0xb8]\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "smlal v7.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "smlal v11.4s, v27.4h, v0.4h\n" + "smlal2 v21.4s, v27.8h, v0.8h\n" + "ldr d27, [x22, x1]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal2 v10.4s, v22.8h, v4.8h\n" + "smlal2 v6.4s, v26.8h, v4.8h\n" + "ldr d4, [x8, #0xc0]\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "smlal v14.4s, v23.4h, v0.4h\n" + "smlal v9.4s, v25.4h, v0.4h\n" + "smlal v7.4s, v24.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "ldr d25, [x7, x1]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal2 v10.4s, v23.8h, v0.8h\n" + "smlal2 v6.4s, v24.8h, v0.8h\n" + "smlal v11.4s, v23.4h, v1.4h\n" + "smlal v14.4s, v31.4h, v1.4h\n" + "smlal v9.4s, v24.4h, v1.4h\n" + "smlal v7.4s, v27.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "ldr d24, [x20, x1]\n" + "smlal2 v21.4s, v23.8h, v1.8h\n" + "ushll v24.8h, v24.8b, #0x0\n" + "smlal2 v10.4s, v31.8h, v1.8h\n" + "smlal2 v6.4s, v27.8h, v1.8h\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal v14.4s, v30.4h, v2.4h\n" + "smlal v9.4s, v27.4h, v2.4h\n" + "smlal v7.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, 
v27.8h, v2.8h\n" + "ldr d27, [x19, x1]\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal2 v10.4s, v30.8h, v2.8h\n" + "smlal2 v6.4s, v25.8h, v2.8h\n" + "add x1, x1, #0x8\n" + "smlal v11.4s, v30.4h, v3.4h\n" + "smlal v14.4s, v28.4h, v3.4h\n" + "smlal v9.4s, v25.4h, v3.4h\n" + "smlal v7.4s, v24.4h, v3.4h\n" + "smlal2 v21.4s, v30.8h, v3.8h\n" + "smlal2 v10.4s, v28.8h, v3.8h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "smlal2 v6.4s, v24.8h, v3.8h\n" + "smlal v11.4s, v28.4h, v4.4h\n" + "smlal v14.4s, v26.4h, v4.4h\n" + "sqdmulh v11.4s, v11.4s, v17.4s\n" + "smlal v9.4s, v24.4h, v4.4h\n" + "smlal v7.4s, v27.4h, v4.4h\n" + "sqdmulh v14.4s, v14.4s, v17.4s\n" + "smlal2 v21.4s, v28.8h, v4.8h\n" + "smlal2 v10.4s, v26.8h, v4.8h\n" + "sqdmulh v9.4s, v9.4s, v17.4s\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "smlal2 v6.4s, v27.8h, v4.8h\n" + "sqdmulh v7.4s, v7.4s, v17.4s\n" + "and v23.16b, v11.16b, v5.16b\n" + "sqdmulh v21.4s, v21.4s, v18.4s\n" + "and v22.16b, v14.16b, v5.16b\n" + "sqdmulh v10.4s, v10.4s, v18.4s\n" + "and v17.16b, v9.16b, v5.16b\n" + "sqdmulh v8.4s, v8.4s, v18.4s\n" + "and v20.16b, v7.16b, v5.16b\n" + "sqdmulh v6.4s, v6.4s, v18.4s\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "and v19.16b, v21.16b, v29.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v18.16b, v10.16b, v29.16b\n" + "sshr v17.4s, v17.4s, #0x1f\n" + "and v26.16b, v8.16b, v29.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v4.16b, v6.16b, v29.16b\n" + "sqadd v11.4s, v11.4s, v23.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v22.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v17.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sqadd v7.4s, v7.4s, v20.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "srshl v11.4s, v11.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v19.4s\n" + "srshl v14.4s, v14.4s, v5.4s\n" + "sqadd v10.4s, v10.4s, v18.4s\n" + "srshl v9.4s, v9.4s, v5.4s\n" + "sqadd v8.4s, v8.4s, v26.4s\n" + "srshl v7.4s, v7.4s, v5.4s\n" + "sqadd v6.4s, v6.4s, v4.4s\n" + "srshl v21.4s, v21.4s, v29.4s\n" + "sqxtn v11.4h, v11.4s\n" + "srshl v10.4s, v10.4s, v29.4s\n" + "sqxtn v14.4h, v14.4s\n" + "srshl v8.4s, v8.4s, v29.4s\n" + "sqxtn v9.4h, v9.4s\n" + "srshl v6.4s, v6.4s, v29.4s\n" + "sqxtn v7.4h, v7.4s\n" + "sqxtn2 v11.8h, v21.4s\n" + "sqxtn2 v14.8h, v10.4s\n" + "sqxtn2 v9.8h, v8.4s\n" + "sqxtn2 v7.8h, v6.4s\n" + "sqadd v11.8h, v11.8h, v16.8h\n" + "sqadd v14.8h, v14.8h, v16.8h\n" + "sqadd v9.8h, v9.8h, v16.8h\n" + "sqadd v7.8h, v7.8h, v16.8h\n" + "smax v11.8h, v11.8h, v12.8h\n" + "smax v14.8h, v14.8h, v12.8h\n" + "smax v9.8h, v9.8h, v12.8h\n" + "smax v7.8h, v7.8h, v12.8h\n" + "smin v11.8h, v11.8h, v13.8h\n" + "smin v14.8h, v14.8h, v13.8h\n" + "smin v9.8h, v9.8h, v13.8h\n" + "smin v7.8h, v7.8h, v13.8h\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "uzp1 v14.16b, v14.16b, v14.16b\n" + "str d11, [x21, x2]\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "uzp1 v7.16b, v7.16b, v7.16b\n" + "str d14, [x15, x2]\n" + "str d9, [x17, x2]\n" + "str d7, [x16, x2]\n" + "add x2, x2, #0x8\n" + "beq 124f\n" + "add x8, x8, #0xc8\n" + "3:" // Oddments + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x4, #2, 5f\n" + "ld1 { v11.4s }, [x19], #0x10\n" + "tbz x4, #1, 4f\n" + "ld1 { v21.d }[0], [x19], #0x8\n" + "tbz x4, #0, 7f\n" + "ld1 { v21.s }[2], [x19]\n" + "b 7f\n" + "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset + "tbz x4, #0, 7f\n" + "ld1 { v21.s }[0], [x19]\n" + "b 7f\n" + "5:" // Oddments: Load bias: Bit 2: Unset + "tbz x4, #1, 6f\n" + "ld1 { v11.d }[0], [x19], #0x8\n" + "tbz x4, #0, 7f\n" + "ld1 { v11.s }[2], [x19]\n" + "b 7f\n" + 
"6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 7f\n" + "ld1 { v11.s }[0], [x19]\n" + "7:" // Oddments: Load bias: Bit 2: End + "ldr d0, [x8, #0x0]\n" + "ldr d1, [x8, #0x8]\n" + "mov v14.16b, v11.16b\n" + "mov v10.16b, v21.16b\n" + "ldr d2, [x8, #0x10]\n" + "ldr d3, [x8, #0x18]\n" + "mov v9.16b, v11.16b\n" + "mov v8.16b, v21.16b\n" + "ldr d4, [x8, #0x20]\n" + "ldp x28, x27, [x0, #0x0]\n" + "mov v7.16b, v11.16b\n" + "mov v6.16b, v21.16b\n" + "ldp x10, x26, [x0, #0x10]\n" + "ldp x24, x23, [x0, #0x20]\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "ldp x22, x25, [x0, #0x30]\n" + "ldp x20, x19, [x0, #0x40]\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "add x28, x28, x1\n" + "add x27, x27, x1\n" + "add x10, x10, x1\n" + "add x26, x26, x1\n" + "add x24, x24, x1\n" + "add x23, x23, x1\n" + "add x22, x22, x1\n" + "add x25, x25, x1\n" + "add x20, x20, x1\n" + "add x19, x19, x1\n" + "tbz x4, #2, 9f\n" + "ld1 { v31.s }[0], [x28], #0x4\n" + "ld1 { v30.s }[0], [x27], #0x4\n" + "ld1 { v29.s }[0], [x10], #0x4\n" + "ld1 { v28.s }[0], [x26], #0x4\n" + "ld1 { v27.s }[0], [x24], #0x4\n" + "ld1 { v23.s }[0], [x23], #0x4\n" + "ld1 { v25.s }[0], [x22], #0x4\n" + "ld1 { v24.s }[0], [x25], #0x4\n" + "ld1 { v26.s }[0], [x20], #0x4\n" + "ld1 { v22.s }[0], [x19], #0x4\n" + "tbz x4, #1, 8f\n" + "ld1 { v31.h }[2], [x28], #0x2\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v29.h }[2], [x10], #0x2\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "ld1 { v27.h }[2], [x24], #0x2\n" + "ld1 { v23.h }[2], [x23], #0x2\n" + "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v24.h }[2], [x25], #0x2\n" + "ld1 { v26.h }[2], [x20], #0x2\n" + "ld1 { v22.h }[2], [x19], #0x2\n" + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[6], [x28]\n" + "ld1 { v30.b }[6], [x27]\n" + "ld1 { v29.b }[6], [x10]\n" + "ld1 { v28.b }[6], [x26]\n" + "ld1 { v27.b }[6], [x24]\n" + "ld1 { v23.b }[6], [x23]\n" + "ld1 { v25.b }[6], [x22]\n" + "ld1 { v24.b }[6], [x25]\n" + "ld1 { v26.b }[6], [x20]\n" + "ld1 { v22.b }[6], [x19]\n" + "b 11f\n" + "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[4], [x28]\n" + "ld1 { v30.b }[4], [x27]\n" + "ld1 { v29.b }[4], [x10]\n" + "ld1 { v28.b }[4], [x26]\n" + "ld1 { v27.b }[4], [x24]\n" + "ld1 { v23.b }[4], [x23]\n" + "ld1 { v25.b }[4], [x22]\n" + "ld1 { v24.b }[4], [x25]\n" + "ld1 { v26.b }[4], [x20]\n" + "ld1 { v22.b }[4], [x19]\n" + "b 11f\n" + "9:" // Oddments: Initial loads: Bit 2: Unset + "tbz x4, #1, 10f\n" + "ld1 { v31.h }[0], [x28], #0x2\n" + "ld1 { v30.h }[0], [x27], #0x2\n" + "ld1 { v29.h }[0], [x10], #0x2\n" + "ld1 { v28.h }[0], [x26], #0x2\n" + "ld1 { v27.h }[0], [x24], #0x2\n" + "ld1 { v23.h }[0], [x23], #0x2\n" + "ld1 { v25.h }[0], [x22], #0x2\n" + "ld1 { v24.h }[0], [x25], #0x2\n" + "ld1 { v26.h }[0], [x20], #0x2\n" + "ld1 { v22.h }[0], [x19], #0x2\n" + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[2], [x28]\n" + "ld1 { v30.b }[2], [x27]\n" + "ld1 { v29.b }[2], [x10]\n" + "ld1 { v28.b }[2], [x26]\n" + "ld1 { v27.b }[2], [x24]\n" + "ld1 { v23.b }[2], [x23]\n" + "ld1 { v25.b }[2], [x22]\n" + "ld1 { v24.b }[2], [x25]\n" + "ld1 { v26.b }[2], [x20]\n" + "ld1 { v22.b }[2], [x19]\n" + "b 11f\n" + "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 11f\n" + "ld1 { v31.b }[0], [x28]\n" + "ld1 { v30.b }[0], [x27]\n" + "ld1 { v29.b }[0], [x10]\n" + "ld1 { v28.b }[0], [x26]\n" + "ld1 { v27.b }[0], [x24]\n" + "ld1 { v23.b }[0], [x23]\n" + "ld1 { v25.b }[0], [x22]\n" + "ld1 { v24.b }[0], 
[x25]\n" + "ld1 { v26.b }[0], [x20]\n" + "ld1 { v22.b }[0], [x19]\n" + "11:" // Oddments: Initial loads: Bit 2: End + "ushll v31.8h, v31.8b, #0x0\n" + "ushll v30.8h, v30.8b, #0x0\n" + "smlal v11.4s, v31.4h, v0.4h\n" + "ldr x19, [x0, #0x50]\n" + "ushll v29.8h, v29.8b, #0x0\n" + "smlal2 v21.4s, v31.8h, v0.8h\n" + "smlal v14.4s, v30.4h, v0.4h\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "smlal v9.4s, v29.4h, v0.4h\n" + "ushll v28.8h, v28.8b, #0x0\n" + "add x19, x19, x1\n" + "smlal2 v8.4s, v29.8h, v0.8h\n" + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v7.4s, v28.4h, v0.4h\n" + "smlal2 v6.4s, v28.8h, v0.8h\n" + "smlal v11.4s, v30.4h, v1.4h\n" + "ushll v23.8h, v23.8b, #0x0\n" + "smlal2 v21.4s, v30.8h, v1.8h\n" + "smlal v14.4s, v27.4h, v1.4h\n" + "ushll v25.8h, v25.8b, #0x0\n" + "smlal2 v10.4s, v27.8h, v1.8h\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "ushll v24.8h, v24.8b, #0x0\n" + "smlal2 v8.4s, v28.8h, v1.8h\n" + "ushll v26.8h, v26.8b, #0x0\n" + "smlal v7.4s, v23.4h, v1.4h\n" + "ushll v22.8h, v22.8b, #0x0\n" + "smlal2 v6.4s, v23.8h, v1.8h\n" + "smlal v11.4s, v27.4h, v2.4h\n" + "smlal2 v21.4s, v27.8h, v2.8h\n" + "smlal v14.4s, v25.4h, v2.4h\n" + "smlal2 v10.4s, v25.8h, v2.8h\n" + "smlal v9.4s, v23.4h, v2.4h\n" + "smlal2 v8.4s, v23.8h, v2.8h\n" + "tbz x4, #2, 13f\n" + "ld1 { v31.s }[0], [x19], #0x4\n" + "tbz x4, #1, 12f\n" + "ld1 { v31.h }[2], [x19], #0x2\n" + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[6], [x19]\n" + "b 15f\n" + "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[4], [x19]\n" + "b 15f\n" + "13:" // Oddments: Load (1, 3): Bit 2: Unset + "tbz x4, #1, 14f\n" + "ld1 { v31.h }[0], [x19], #0x2\n" + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[2], [x19]\n" + "b 15f\n" + "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 15f\n" + "ld1 { v31.b }[0], [x19]\n" + "15:" // Oddments: Load (1, 3): Bit 2: End + "ushll v31.8h, v31.8b, #0x0\n" + "ldr x20, [x0, #0x58]\n" + "smlal v7.4s, v31.4h, v2.4h\n" + "smlal2 v6.4s, v31.8h, v2.8h\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v21.4s, v25.8h, v3.8h\n" + "add x20, x20, x1\n" + "smlal v14.4s, v24.4h, v3.4h\n" + "smlal2 v10.4s, v24.8h, v3.8h\n" + "smlal v9.4s, v31.4h, v3.4h\n" + "smlal2 v8.4s, v31.8h, v3.8h\n" + "tbz x4, #2, 17f\n" + "ld1 { v30.s }[0], [x20], #0x4\n" + "tbz x4, #1, 16f\n" + "ld1 { v30.h }[2], [x20], #0x2\n" + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[6], [x20]\n" + "b 19f\n" + "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[4], [x20]\n" + "b 19f\n" + "17:" // Oddments: Load (1, 4): Bit 2: Unset + "tbz x4, #1, 18f\n" + "ld1 { v30.h }[0], [x20], #0x2\n" + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[2], [x20]\n" + "b 19f\n" + "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 19f\n" + "ld1 { v30.b }[0], [x20]\n" + "19:" // Oddments: Load (1, 4): Bit 2: End + "ushll v30.8h, v30.8b, #0x0\n" + "ldr x19, [x0, #0x60]\n" + "smlal v7.4s, v30.4h, v3.4h\n" + "smlal2 v6.4s, v30.8h, v3.8h\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v21.4s, v24.8h, v4.8h\n" + "add x19, x19, x1\n" + "tbz x4, #2, 21f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x4, #1, 20f\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[6], [x19]\n" + "b 23f\n" + "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[4], [x19]\n" + "b 23f\n" + "21:" // Oddments: Load (0, 5): Bit 2: Unset + "tbz x4, #1, 22f\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[2], [x19]\n" + "b 23f\n" + "22:" // 
Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 23f\n" + "ld1 { v27.b }[0], [x19]\n" + "23:" // Oddments: Load (0, 5): Bit 2: End + "ushll v27.8h, v27.8b, #0x0\n" + "ldr d0, [x8, #0x28]\n" + "smlal v14.4s, v27.4h, v4.4h\n" + "smlal2 v10.4s, v27.8h, v4.8h\n" + "smlal v9.4s, v30.4h, v4.4h\n" + "smlal2 v8.4s, v30.8h, v4.8h\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "ldr x24, [x0, #0x68]\n" + "smlal v7.4s, v26.4h, v4.4h\n" + "smlal2 v6.4s, v26.8h, v4.8h\n" + "add x24, x24, x1\n" + "smlal v11.4s, v29.4h, v0.4h\n" + "smlal2 v21.4s, v29.8h, v0.8h\n" + "smlal v14.4s, v28.4h, v0.4h\n" + "smlal2 v10.4s, v28.8h, v0.8h\n" + "smlal v9.4s, v22.4h, v0.4h\n" + "smlal2 v8.4s, v22.8h, v0.8h\n" + "tbz x4, #2, 25f\n" + "ld1 { v25.s }[0], [x24], #0x4\n" + "tbz x4, #1, 24f\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[6], [x24]\n" + "b 27f\n" + "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[4], [x24]\n" + "b 27f\n" + "25:" // Oddments: Load (2, 1): Bit 2: Unset + "tbz x4, #1, 26f\n" + "ld1 { v25.h }[0], [x24], #0x2\n" + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[2], [x24]\n" + "b 27f\n" + "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 27f\n" + "ld1 { v25.b }[0], [x24]\n" + "27:" // Oddments: Load (2, 1): Bit 2: End + "ldr d1, [x8, #0x30]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "ldr x23, [x0, #0x70]\n" + "smlal v7.4s, v25.4h, v0.4h\n" + "smlal2 v6.4s, v25.8h, v0.8h\n" + "add x23, x23, x1\n" + "smlal v11.4s, v28.4h, v1.4h\n" + "smlal2 v21.4s, v28.8h, v1.8h\n" + "smlal v14.4s, v23.4h, v1.4h\n" + "smlal2 v10.4s, v23.8h, v1.8h\n" + "smlal v9.4s, v25.4h, v1.4h\n" + "smlal2 v8.4s, v25.8h, v1.8h\n" + "tbz x4, #2, 29f\n" + "ld1 { v24.s }[0], [x23], #0x4\n" + "tbz x4, #1, 28f\n" + "ld1 { v24.h }[2], [x23], #0x2\n" + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[6], [x23]\n" + "b 31f\n" + "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[4], [x23]\n" + "b 31f\n" + "29:" // Oddments: Load (2, 2): Bit 2: Unset + "tbz x4, #1, 30f\n" + "ld1 { v24.h }[0], [x23], #0x2\n" + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[2], [x23]\n" + "b 31f\n" + "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 31f\n" + "ld1 { v24.b }[0], [x23]\n" + "31:" // Oddments: Load (2, 2): Bit 2: End + "ldr d2, [x8, #0x38]\n" + "ushll v24.8h, v24.8b, #0x0\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "ldr x26, [x0, #0x78]\n" + "smlal v7.4s, v24.4h, v1.4h\n" + "smlal2 v6.4s, v24.8h, v1.8h\n" + "add x26, x26, x1\n" + "smlal v11.4s, v23.4h, v2.4h\n" + "smlal2 v21.4s, v23.8h, v2.8h\n" + "smlal v14.4s, v31.4h, v2.4h\n" + "smlal2 v10.4s, v31.8h, v2.8h\n" + "smlal v9.4s, v24.4h, v2.4h\n" + "smlal2 v8.4s, v24.8h, v2.8h\n" + "tbz x4, #2, 33f\n" + "ld1 { v27.s }[0], [x26], #0x4\n" + "tbz x4, #1, 32f\n" + "ld1 { v27.h }[2], [x26], #0x2\n" + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[6], [x26]\n" + "b 35f\n" + "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[4], [x26]\n" + "b 35f\n" + "33:" // Oddments: Load (2, 3): Bit 2: Unset + "tbz x4, #1, 34f\n" + "ld1 { v27.h }[0], [x26], #0x2\n" + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[2], [x26]\n" + "b 35f\n" + "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 35f\n" + "ld1 { v27.b }[0], [x26]\n" + "35:" // Oddments: Load (2, 3): Bit 2: End + "ldr d3, [x8, #0x40]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "ldr x7, [x0, #0x80]\n" + "smlal v7.4s, v27.4h, v2.4h\n" + 
"smlal2 v6.4s, v27.8h, v2.8h\n" + "add x7, x7, x1\n" + "smlal v11.4s, v31.4h, v3.4h\n" + "smlal2 v21.4s, v31.8h, v3.8h\n" + "smlal v14.4s, v30.4h, v3.4h\n" + "smlal2 v10.4s, v30.8h, v3.8h\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v8.4s, v27.8h, v3.8h\n" + "tbz x4, #2, 37f\n" + "ld1 { v23.s }[0], [x7], #0x4\n" + "tbz x4, #1, 36f\n" + "ld1 { v23.h }[2], [x7], #0x2\n" + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[6], [x7]\n" + "b 39f\n" + "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[4], [x7]\n" + "b 39f\n" + "37:" // Oddments: Load (2, 4): Bit 2: Unset + "tbz x4, #1, 38f\n" + "ld1 { v23.h }[0], [x7], #0x2\n" + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[2], [x7]\n" + "b 39f\n" + "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 39f\n" + "ld1 { v23.b }[0], [x7]\n" + "39:" // Oddments: Load (2, 4): Bit 2: End + "ldr d4, [x8, #0x48]\n" + "ushll v23.8h, v23.8b, #0x0\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "ldr x22, [x0, #0x88]\n" + "smlal v7.4s, v23.4h, v3.4h\n" + "smlal2 v6.4s, v23.8h, v3.8h\n" + "add x22, x22, x1\n" + "smlal v11.4s, v30.4h, v4.4h\n" + "smlal2 v21.4s, v30.8h, v4.8h\n" + "smlal v14.4s, v26.4h, v4.4h\n" + "smlal2 v10.4s, v26.8h, v4.8h\n" + "smlal v9.4s, v23.4h, v4.4h\n" + "smlal2 v8.4s, v23.8h, v4.8h\n" + "tbz x4, #2, 41f\n" + "ld1 { v28.s }[0], [x22], #0x4\n" + "tbz x4, #1, 40f\n" + "ld1 { v28.h }[2], [x22], #0x2\n" + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[6], [x22]\n" + "b 43f\n" + "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[4], [x22]\n" + "b 43f\n" + "41:" // Oddments: Load (2, 5): Bit 2: Unset + "tbz x4, #1, 42f\n" + "ld1 { v28.h }[0], [x22], #0x2\n" + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[2], [x22]\n" + "b 43f\n" + "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 43f\n" + "ld1 { v28.b }[0], [x22]\n" + "43:" // Oddments: Load (2, 5): Bit 2: End + "ldr d0, [x8, #0x50]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "ldr x20, [x0, #0x90]\n" + "smlal v7.4s, v28.4h, v4.4h\n" + "smlal2 v6.4s, v28.8h, v4.8h\n" + "add x20, x20, x1\n" + "smlal v11.4s, v22.4h, v0.4h\n" + "smlal2 v21.4s, v22.8h, v0.8h\n" + "smlal v14.4s, v25.4h, v0.4h\n" + "smlal2 v10.4s, v25.8h, v0.8h\n" + "tbz x4, #2, 45f\n" + "ld1 { v31.s }[0], [x20], #0x4\n" + "tbz x4, #1, 44f\n" + "ld1 { v31.h }[2], [x20], #0x2\n" + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[6], [x20]\n" + "b 47f\n" + "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[4], [x20]\n" + "b 47f\n" + "45:" // Oddments: Load (3, 0): Bit 2: Unset + "tbz x4, #1, 46f\n" + "ld1 { v31.h }[0], [x20], #0x2\n" + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[2], [x20]\n" + "b 47f\n" + "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 47f\n" + "ld1 { v31.b }[0], [x20]\n" + "47:" // Oddments: Load (3, 0): Bit 2: End + "ushll v31.8h, v31.8b, #0x0\n" + "ldr x14, [x0, #0x98]\n" + "smlal v9.4s, v31.4h, v0.4h\n" + "smlal2 v8.4s, v31.8h, v0.8h\n" + "add x14, x14, x1\n" + "tbz x4, #2, 49f\n" + "ld1 { v30.s }[0], [x14], #0x4\n" + "tbz x4, #1, 48f\n" + "ld1 { v30.h }[2], [x14], #0x2\n" + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[6], [x14]\n" + "b 51f\n" + "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[4], [x14]\n" + "b 51f\n" + "49:" // Oddments: Load (3, 1): Bit 2: Unset + "tbz x4, #1, 50f\n" + "ld1 { v30.h }[0], [x14], #0x2\n" + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[2], [x14]\n" + "b 51f\n" + "50:" // Oddments: Load (3, 1): Bit 2: Unset: 
Bit 1: Unset + "tbz x4, #0, 51f\n" + "ld1 { v30.b }[0], [x14]\n" + "51:" // Oddments: Load (3, 1): Bit 2: End + "ldr d1, [x8, #0x58]\n" + "ushll v30.8h, v30.8b, #0x0\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "ldr x19, [x0, #0xa0]\n" + "smlal v7.4s, v30.4h, v0.4h\n" + "smlal2 v6.4s, v30.8h, v0.8h\n" + "add x19, x19, x1\n" + "smlal v11.4s, v25.4h, v1.4h\n" + "smlal2 v21.4s, v25.8h, v1.8h\n" + "smlal v14.4s, v24.4h, v1.4h\n" + "smlal2 v10.4s, v24.8h, v1.8h\n" + "smlal v9.4s, v30.4h, v1.4h\n" + "smlal2 v8.4s, v30.8h, v1.8h\n" + "tbz x4, #2, 53f\n" + "ld1 { v26.s }[0], [x19], #0x4\n" + "tbz x4, #1, 52f\n" + "ld1 { v26.h }[2], [x19], #0x2\n" + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[6], [x19]\n" + "b 55f\n" + "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[4], [x19]\n" + "b 55f\n" + "53:" // Oddments: Load (3, 2): Bit 2: Unset + "tbz x4, #1, 54f\n" + "ld1 { v26.h }[0], [x19], #0x2\n" + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[2], [x19]\n" + "b 55f\n" + "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 55f\n" + "ld1 { v26.b }[0], [x19]\n" + "55:" // Oddments: Load (3, 2): Bit 2: End + "ldr d2, [x8, #0x60]\n" + "ushll v26.8h, v26.8b, #0x0\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "ldr x13, [x0, #0xa8]\n" + "smlal v7.4s, v26.4h, v1.4h\n" + "smlal2 v6.4s, v26.8h, v1.8h\n" + "add x13, x13, x1\n" + "smlal v11.4s, v24.4h, v2.4h\n" + "smlal2 v21.4s, v24.8h, v2.8h\n" + "smlal v14.4s, v27.4h, v2.4h\n" + "smlal2 v10.4s, v27.8h, v2.8h\n" + "smlal v9.4s, v26.4h, v2.4h\n" + "smlal2 v8.4s, v26.8h, v2.8h\n" + "tbz x4, #2, 57f\n" + "ld1 { v25.s }[0], [x13], #0x4\n" + "tbz x4, #1, 56f\n" + "ld1 { v25.h }[2], [x13], #0x2\n" + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[6], [x13]\n" + "b 59f\n" + "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[4], [x13]\n" + "b 59f\n" + "57:" // Oddments: Load (3, 3): Bit 2: Unset + "tbz x4, #1, 58f\n" + "ld1 { v25.h }[0], [x13], #0x2\n" + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[2], [x13]\n" + "b 59f\n" + "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 59f\n" + "ld1 { v25.b }[0], [x13]\n" + "59:" // Oddments: Load (3, 3): Bit 2: End + "ldr d3, [x8, #0x68]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "ldr x12, [x0, #0xb0]\n" + "smlal v7.4s, v25.4h, v2.4h\n" + "smlal2 v6.4s, v25.8h, v2.8h\n" + "add x12, x12, x1\n" + "smlal v11.4s, v27.4h, v3.4h\n" + "smlal2 v21.4s, v27.8h, v3.8h\n" + "smlal v14.4s, v23.4h, v3.4h\n" + "smlal2 v10.4s, v23.8h, v3.8h\n" + "smlal v9.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "tbz x4, #2, 61f\n" + "ld1 { v24.s }[0], [x12], #0x4\n" + "tbz x4, #1, 60f\n" + "ld1 { v24.h }[2], [x12], #0x2\n" + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[6], [x12]\n" + "b 63f\n" + "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[4], [x12]\n" + "b 63f\n" + "61:" // Oddments: Load (3, 4): Bit 2: Unset + "tbz x4, #1, 62f\n" + "ld1 { v24.h }[0], [x12], #0x2\n" + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[2], [x12]\n" + "b 63f\n" + "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 63f\n" + "ld1 { v24.b }[0], [x12]\n" + "63:" // Oddments: Load (3, 4): Bit 2: End + "ldr d4, [x8, #0x70]\n" + "ushll v24.8h, v24.8b, #0x0\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "ldr x11, [x0, #0xb8]\n" + "smlal v7.4s, v24.4h, v3.4h\n" + "smlal2 v6.4s, v24.8h, v3.8h\n" + "add x11, x11, x1\n" + "smlal v11.4s, v23.4h, v4.4h\n" + "smlal2 v21.4s, v23.8h, v4.8h\n" + "smlal v14.4s, v28.4h, v4.4h\n" + "smlal2 
v10.4s, v28.8h, v4.8h\n" + "smlal v9.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "tbz x4, #2, 65f\n" + "ld1 { v22.s }[0], [x11], #0x4\n" + "tbz x4, #1, 64f\n" + "ld1 { v22.h }[2], [x11], #0x2\n" + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[6], [x11]\n" + "b 67f\n" + "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[4], [x11]\n" + "b 67f\n" + "65:" // Oddments: Load (3, 5): Bit 2: Unset + "tbz x4, #1, 66f\n" + "ld1 { v22.h }[0], [x11], #0x2\n" + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[2], [x11]\n" + "b 67f\n" + "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 67f\n" + "ld1 { v22.b }[0], [x11]\n" + "67:" // Oddments: Load (3, 5): Bit 2: End + "ldr d0, [x8, #0x78]\n" + "ushll v22.8h, v22.8b, #0x0\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "ldr x10, [x0, #0xc0]\n" + "smlal v7.4s, v22.4h, v4.4h\n" + "smlal2 v6.4s, v22.8h, v4.8h\n" + "add x10, x10, x1\n" + "smlal v11.4s, v31.4h, v0.4h\n" + "smlal2 v21.4s, v31.8h, v0.8h\n" + "smlal v14.4s, v30.4h, v0.4h\n" + "smlal2 v10.4s, v30.8h, v0.8h\n" + "tbz x4, #2, 69f\n" + "ld1 { v27.s }[0], [x10], #0x4\n" + "tbz x4, #1, 68f\n" + "ld1 { v27.h }[2], [x10], #0x2\n" + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[6], [x10]\n" + "b 71f\n" + "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[4], [x10]\n" + "b 71f\n" + "69:" // Oddments: Load (4, 0): Bit 2: Unset + "tbz x4, #1, 70f\n" + "ld1 { v27.h }[0], [x10], #0x2\n" + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[2], [x10]\n" + "b 71f\n" + "70:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 71f\n" + "ld1 { v27.b }[0], [x10]\n" + "71:" // Oddments: Load (4, 0): Bit 2: End + "ushll v27.8h, v27.8b, #0x0\n" + "ldr x9, [x0, #0xc8]\n" + "smlal v9.4s, v27.4h, v0.4h\n" + "smlal2 v8.4s, v27.8h, v0.8h\n" + "add x9, x9, x1\n" + "tbz x4, #2, 73f\n" + "ld1 { v23.s }[0], [x9], #0x4\n" + "tbz x4, #1, 72f\n" + "ld1 { v23.h }[2], [x9], #0x2\n" + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[6], [x9]\n" + "b 75f\n" + "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[4], [x9]\n" + "b 75f\n" + "73:" // Oddments: Load (4, 1): Bit 2: Unset + "tbz x4, #1, 74f\n" + "ld1 { v23.h }[0], [x9], #0x2\n" + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[2], [x9]\n" + "b 75f\n" + "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 75f\n" + "ld1 { v23.b }[0], [x9]\n" + "75:" // Oddments: Load (4, 1): Bit 2: End + "ldr d1, [x8, #0x80]\n" + "ushll v23.8h, v23.8b, #0x0\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "ldr x28, [x0, #0xd0]\n" + "smlal v7.4s, v23.4h, v0.4h\n" + "smlal2 v6.4s, v23.8h, v0.8h\n" + "add x28, x28, x1\n" + "smlal v11.4s, v30.4h, v1.4h\n" + "smlal2 v21.4s, v30.8h, v1.8h\n" + "smlal v14.4s, v26.4h, v1.4h\n" + "smlal2 v10.4s, v26.8h, v1.8h\n" + "smlal v9.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "tbz x4, #2, 77f\n" + "ld1 { v31.s }[0], [x28], #0x4\n" + "tbz x4, #1, 76f\n" + "ld1 { v31.h }[2], [x28], #0x2\n" + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[6], [x28]\n" + "b 79f\n" + "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[4], [x28]\n" + "b 79f\n" + "77:" // Oddments: Load (4, 2): Bit 2: Unset + "tbz x4, #1, 78f\n" + "ld1 { v31.h }[0], [x28], #0x2\n" + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[2], [x28]\n" + "b 79f\n" + "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 79f\n" + "ld1 { v31.b }[0], [x28]\n" + "79:" // Oddments: Load (4, 2): Bit 2: End + "ldr d2, [x8, #0x88]\n" + "ushll v31.8h, v31.8b, #0x0\n" + 
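Each loaded tile then goes through the same widen-and-accumulate pattern: ushll widens the unsigned input bytes to 16 bits unchanged (any input offset is evidently handled outside this sequence), usubl widens the weights while subtracting the zero point held in v15, and smlal/smlal2 accumulate the 16-bit products of the low and high halves into 32-bit accumulators. A scalar model of one accumulator, under those assumptions:

    #include <cstdint>

    // Scalar model of the ushll/usubl + smlal/smlal2 pattern for one
    // output channel; `taps` is the number of (input, weight) pairs that
    // contribute to this output pixel.
    static int32_t accumulate(const uint8_t *in, const uint8_t *w,
                              int32_t bias, int32_t b_offset, int taps)
    {
        int32_t acc = bias;
        for (int t = 0; t < taps; t++)
        {
            const int16_t x = (int16_t)in[t];                     // ushll v, v, #0x0
            const int16_t k = (int16_t)w[t] - (int16_t)b_offset;  // usubl v, v, v15
            acc += (int32_t)x * (int32_t)k;                       // smlal / smlal2
        }
        return acc;
    }
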
"usubl v2.8h, v2.8b, v15.8b\n" + "ldr x27, [x0, #0xd8]\n" + "smlal v7.4s, v31.4h, v1.4h\n" + "smlal2 v6.4s, v31.8h, v1.8h\n" + "add x27, x27, x1\n" + "smlal v11.4s, v26.4h, v2.4h\n" + "smlal2 v21.4s, v26.8h, v2.8h\n" + "smlal v14.4s, v25.4h, v2.4h\n" + "smlal2 v10.4s, v25.8h, v2.8h\n" + "smlal v9.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "tbz x4, #2, 81f\n" + "ld1 { v30.s }[0], [x27], #0x4\n" + "tbz x4, #1, 80f\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[6], [x27]\n" + "b 83f\n" + "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[4], [x27]\n" + "b 83f\n" + "81:" // Oddments: Load (4, 3): Bit 2: Unset + "tbz x4, #1, 82f\n" + "ld1 { v30.h }[0], [x27], #0x2\n" + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[2], [x27]\n" + "b 83f\n" + "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 83f\n" + "ld1 { v30.b }[0], [x27]\n" + "83:" // Oddments: Load (4, 3): Bit 2: End + "ldr d3, [x8, #0x90]\n" + "ushll v30.8h, v30.8b, #0x0\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "ldr x26, [x0, #0xe0]\n" + "smlal v7.4s, v30.4h, v2.4h\n" + "smlal2 v6.4s, v30.8h, v2.8h\n" + "add x26, x26, x1\n" + "smlal v11.4s, v25.4h, v3.4h\n" + "smlal2 v21.4s, v25.8h, v3.8h\n" + "smlal v14.4s, v24.4h, v3.4h\n" + "smlal2 v10.4s, v24.8h, v3.8h\n" + "smlal v9.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "tbz x4, #2, 85f\n" + "ld1 { v28.s }[0], [x26], #0x4\n" + "tbz x4, #1, 84f\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[6], [x26]\n" + "b 87f\n" + "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[4], [x26]\n" + "b 87f\n" + "85:" // Oddments: Load (4, 4): Bit 2: Unset + "tbz x4, #1, 86f\n" + "ld1 { v28.h }[0], [x26], #0x2\n" + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[2], [x26]\n" + "b 87f\n" + "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 87f\n" + "ld1 { v28.b }[0], [x26]\n" + "87:" // Oddments: Load (4, 4): Bit 2: End + "ldr d4, [x8, #0x98]\n" + "ushll v28.8h, v28.8b, #0x0\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "ldr x25, [x0, #0xe8]\n" + "smlal v7.4s, v28.4h, v3.4h\n" + "smlal2 v6.4s, v28.8h, v3.8h\n" + "add x25, x25, x1\n" + "smlal v11.4s, v24.4h, v4.4h\n" + "smlal2 v21.4s, v24.8h, v4.8h\n" + "smlal v14.4s, v22.4h, v4.4h\n" + "smlal2 v10.4s, v22.8h, v4.8h\n" + "smlal v9.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "tbz x4, #2, 89f\n" + "ld1 { v26.s }[0], [x25], #0x4\n" + "tbz x4, #1, 88f\n" + "ld1 { v26.h }[2], [x25], #0x2\n" + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[6], [x25]\n" + "b 91f\n" + "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[4], [x25]\n" + "b 91f\n" + "89:" // Oddments: Load (4, 5): Bit 2: Unset + "tbz x4, #1, 90f\n" + "ld1 { v26.h }[0], [x25], #0x2\n" + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[2], [x25]\n" + "b 91f\n" + "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 91f\n" + "ld1 { v26.b }[0], [x25]\n" + "91:" // Oddments: Load (4, 5): Bit 2: End + "ldr d0, [x8, #0xa0]\n" + "ushll v26.8h, v26.8b, #0x0\n" + "usubl v0.8h, v0.8b, v15.8b\n" + "ldr x24, [x0, #0xf0]\n" + "smlal v7.4s, v26.4h, v4.4h\n" + "smlal2 v6.4s, v26.8h, v4.8h\n" + "add x24, x24, x1\n" + "smlal v11.4s, v27.4h, v0.4h\n" + "smlal2 v21.4s, v27.8h, v0.8h\n" + "smlal v14.4s, v23.4h, v0.4h\n" + "smlal2 v10.4s, v23.8h, v0.8h\n" + "tbz x4, #2, 93f\n" + "ld1 { v25.s }[0], [x24], #0x4\n" + "tbz x4, #1, 92f\n" + "ld1 { v25.h }[2], [x24], #0x2\n" + "tbz x4, #0, 95f\n" + "ld1 { 
v25.b }[6], [x24]\n" + "b 95f\n" + "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[4], [x24]\n" + "b 95f\n" + "93:" // Oddments: Load (5, 0): Bit 2: Unset + "tbz x4, #1, 94f\n" + "ld1 { v25.h }[0], [x24], #0x2\n" + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[2], [x24]\n" + "b 95f\n" + "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 95f\n" + "ld1 { v25.b }[0], [x24]\n" + "95:" // Oddments: Load (5, 0): Bit 2: End + "ushll v25.8h, v25.8b, #0x0\n" + "ldr x23, [x0, #0xf8]\n" + "smlal v9.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "add x23, x23, x1\n" + "tbz x4, #2, 97f\n" + "ld1 { v24.s }[0], [x23], #0x4\n" + "tbz x4, #1, 96f\n" + "ld1 { v24.h }[2], [x23], #0x2\n" + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[6], [x23]\n" + "b 99f\n" + "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[4], [x23]\n" + "b 99f\n" + "97:" // Oddments: Load (5, 1): Bit 2: Unset + "tbz x4, #1, 98f\n" + "ld1 { v24.h }[0], [x23], #0x2\n" + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[2], [x23]\n" + "b 99f\n" + "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 99f\n" + "ld1 { v24.b }[0], [x23]\n" + "99:" // Oddments: Load (5, 1): Bit 2: End + "ldr d1, [x8, #0xa8]\n" + "ushll v24.8h, v24.8b, #0x0\n" + "usubl v1.8h, v1.8b, v15.8b\n" + "ldr x22, [x0, #0x100]\n" + "smlal v7.4s, v24.4h, v0.4h\n" + "smlal2 v6.4s, v24.8h, v0.8h\n" + "add x22, x22, x1\n" + "smlal v11.4s, v23.4h, v1.4h\n" + "smlal2 v21.4s, v23.8h, v1.8h\n" + "smlal v14.4s, v31.4h, v1.4h\n" + "smlal2 v10.4s, v31.8h, v1.8h\n" + "smlal v9.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "tbz x4, #2, 101f\n" + "ld1 { v27.s }[0], [x22], #0x4\n" + "tbz x4, #1, 100f\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[6], [x22]\n" + "b 103f\n" + "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[4], [x22]\n" + "b 103f\n" + "101:" // Oddments: Load (5, 2): Bit 2: Unset + "tbz x4, #1, 102f\n" + "ld1 { v27.h }[0], [x22], #0x2\n" + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[2], [x22]\n" + "b 103f\n" + "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 103f\n" + "ld1 { v27.b }[0], [x22]\n" + "103:" // Oddments: Load (5, 2): Bit 2: End + "ldr d2, [x8, #0xb0]\n" + "ushll v27.8h, v27.8b, #0x0\n" + "usubl v2.8h, v2.8b, v15.8b\n" + "ldr x7, [x0, #0x108]\n" + "smlal v7.4s, v27.4h, v1.4h\n" + "smlal2 v6.4s, v27.8h, v1.8h\n" + "add x7, x7, x1\n" + "smlal v11.4s, v31.4h, v2.4h\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "smlal v14.4s, v30.4h, v2.4h\n" + "smlal2 v10.4s, v30.8h, v2.8h\n" + "smlal v9.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "tbz x4, #2, 105f\n" + "ld1 { v25.s }[0], [x7], #0x4\n" + "tbz x4, #1, 104f\n" + "ld1 { v25.h }[2], [x7], #0x2\n" + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[6], [x7]\n" + "b 107f\n" + "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[4], [x7]\n" + "b 107f\n" + "105:" // Oddments: Load (5, 3): Bit 2: Unset + "tbz x4, #1, 106f\n" + "ld1 { v25.h }[0], [x7], #0x2\n" + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[2], [x7]\n" + "b 107f\n" + "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 107f\n" + "ld1 { v25.b }[0], [x7]\n" + "107:" // Oddments: Load (5, 3): Bit 2: End + "ldr d3, [x8, #0xb8]\n" + "ushll v25.8h, v25.8b, #0x0\n" + "usubl v3.8h, v3.8b, v15.8b\n" + "ldr x20, [x0, #0x110]\n" + "smlal v7.4s, v25.4h, v2.4h\n" + "smlal2 v6.4s, v25.8h, v2.8h\n" + "add x20, x20, 
x1\n" + "smlal v11.4s, v30.4h, v3.4h\n" + "smlal2 v21.4s, v30.8h, v3.8h\n" + "smlal v14.4s, v28.4h, v3.4h\n" + "smlal2 v10.4s, v28.8h, v3.8h\n" + "smlal v9.4s, v25.4h, v3.4h\n" + "smlal2 v8.4s, v25.8h, v3.8h\n" + "tbz x4, #2, 109f\n" + "ld1 { v24.s }[0], [x20], #0x4\n" + "tbz x4, #1, 108f\n" + "ld1 { v24.h }[2], [x20], #0x2\n" + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[6], [x20]\n" + "b 111f\n" + "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[4], [x20]\n" + "b 111f\n" + "109:" // Oddments: Load (5, 4): Bit 2: Unset + "tbz x4, #1, 110f\n" + "ld1 { v24.h }[0], [x20], #0x2\n" + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[2], [x20]\n" + "b 111f\n" + "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 111f\n" + "ld1 { v24.b }[0], [x20]\n" + "111:" // Oddments: Load (5, 4): Bit 2: End + "ldr d4, [x8, #0xc0]\n" + "ushll v24.8h, v24.8b, #0x0\n" + "usubl v4.8h, v4.8b, v15.8b\n" + "ldr x19, [x0, #0x118]\n" + "smlal v7.4s, v24.4h, v3.4h\n" + "smlal2 v6.4s, v24.8h, v3.8h\n" + "add x19, x19, x1\n" + "smlal v11.4s, v28.4h, v4.4h\n" + "smlal2 v21.4s, v28.8h, v4.8h\n" + "smlal v14.4s, v26.4h, v4.4h\n" + "smlal2 v10.4s, v26.8h, v4.8h\n" + "smlal v9.4s, v24.4h, v4.4h\n" + "smlal2 v8.4s, v24.8h, v4.8h\n" + "tbz x4, #2, 113f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x4, #1, 112f\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[6], [x19]\n" + "b 115f\n" + "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[4], [x19]\n" + "b 115f\n" + "113:" // Oddments: Load (5, 5): Bit 2: Unset + "tbz x4, #1, 114f\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[2], [x19]\n" + "b 115f\n" + "114:" // Oddments: Load (5, 5): Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 115f\n" + "ld1 { v27.b }[0], [x19]\n" + "115:" // Oddments: Load (5, 5): Bit 2: End + "ushll v27.8h, v27.8b, #0x0\n" + "smlal v7.4s, v27.4h, v4.4h\n" + "smlal2 v6.4s, v27.8h, v4.8h\n" + "tbz x4, #2, 117f\n" + "ld1 { v17.4s }, [x5], #0x10\n" + "ld1 { v5.4s }, [x6], #0x10\n" + "tbz x4, #1, 116f\n" + "ld1 { v18.d }[0], [x5], #0x8\n" + "ld1 { v29.d }[0], [x6], #0x8\n" + "tbz x4, #0, 119f\n" + "ld1 { v18.s }[2], [x5]\n" + "ld1 { v29.s }[2], [x6]\n" + "b 119f\n" + "116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset + "tbz x4, #0, 119f\n" + "ld1 { v18.s }[0], [x5]\n" + "ld1 { v29.s }[0], [x6]\n" + "b 119f\n" + "117:" // Oddments: Load requant params: Bit 2: Unset + "tbz x4, #1, 118f\n" + "ld1 { v17.d }[0], [x5], #0x8\n" + "ld1 { v5.d }[0], [x6], #0x8\n" + "tbz x4, #0, 119f\n" + "ld1 { v17.s }[2], [x5]\n" + "ld1 { v5.s }[2], [x6]\n" + "b 119f\n" + "118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 119f\n" + "ld1 { v17.s }[0], [x5]\n" + "ld1 { v5.s }[0], [x6]\n" + "119:" // Oddments: Load requant params: Bit 2: End + "sqdmulh v11.4s, v11.4s, v17.4s\n" + "sqdmulh v14.4s, v14.4s, v17.4s\n" + "add x21, x21, x2\n" + "add x15, x15, x2\n" + "sqdmulh v9.4s, v9.4s, v17.4s\n" + "sqdmulh v7.4s, v7.4s, v17.4s\n" + "add x17, x17, x2\n" + "add x16, x16, x2\n" + "and v23.16b, v11.16b, v5.16b\n" + "sqdmulh v21.4s, v21.4s, v18.4s\n" + "and v22.16b, v14.16b, v5.16b\n" + "sqdmulh v10.4s, v10.4s, v18.4s\n" + "and v17.16b, v9.16b, v5.16b\n" + "sqdmulh v8.4s, v8.4s, v18.4s\n" + "and v20.16b, v7.16b, v5.16b\n" + "sqdmulh v6.4s, v6.4s, v18.4s\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "and v19.16b, v21.16b, v29.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v18.16b, v10.16b, v29.16b\n" + "sshr v17.4s, 
v17.4s, #0x1f\n" + "and v26.16b, v8.16b, v29.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v4.16b, v6.16b, v29.16b\n" + "sqadd v11.4s, v11.4s, v23.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v14.4s, v14.4s, v22.4s\n" + "sshr v18.4s, v18.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v17.4s\n" + "sshr v26.4s, v26.4s, #0x1f\n" + "sqadd v7.4s, v7.4s, v20.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "srshl v11.4s, v11.4s, v5.4s\n" + "sqadd v21.4s, v21.4s, v19.4s\n" + "srshl v14.4s, v14.4s, v5.4s\n" + "sqadd v10.4s, v10.4s, v18.4s\n" + "srshl v9.4s, v9.4s, v5.4s\n" + "sqadd v8.4s, v8.4s, v26.4s\n" + "srshl v7.4s, v7.4s, v5.4s\n" + "sqadd v6.4s, v6.4s, v4.4s\n" + "srshl v21.4s, v21.4s, v29.4s\n" + "sqxtn v11.4h, v11.4s\n" + "srshl v10.4s, v10.4s, v29.4s\n" + "sqxtn v14.4h, v14.4s\n" + "srshl v8.4s, v8.4s, v29.4s\n" + "sqxtn v9.4h, v9.4s\n" + "srshl v6.4s, v6.4s, v29.4s\n" + "sqxtn v7.4h, v7.4s\n" + "sqxtn2 v11.8h, v21.4s\n" + "sqxtn2 v14.8h, v10.4s\n" + "sqxtn2 v9.8h, v8.4s\n" + "sqxtn2 v7.8h, v6.4s\n" + "sqadd v11.8h, v11.8h, v16.8h\n" + "sqadd v14.8h, v14.8h, v16.8h\n" + "sqadd v9.8h, v9.8h, v16.8h\n" + "sqadd v7.8h, v7.8h, v16.8h\n" + "smax v11.8h, v11.8h, v12.8h\n" + "smax v14.8h, v14.8h, v12.8h\n" + "smax v9.8h, v9.8h, v12.8h\n" + "smax v7.8h, v7.8h, v12.8h\n" + "smin v11.8h, v11.8h, v13.8h\n" + "smin v14.8h, v14.8h, v13.8h\n" + "smin v9.8h, v9.8h, v13.8h\n" + "smin v7.8h, v7.8h, v13.8h\n" + "uzp1 v11.16b, v11.16b, v11.16b\n" + "uzp1 v14.16b, v14.16b, v14.16b\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "uzp1 v7.16b, v7.16b, v7.16b\n" + "tbz x4, #2, 121f\n" + "st1 { v11.s }[0], [x21], #0x4\n" + "st1 { v14.s }[0], [x15], #0x4\n" + "st1 { v9.s }[0], [x17], #0x4\n" + "st1 { v7.s }[0], [x16], #0x4\n" + "tbz x4, #1, 120f\n" + "st1 { v11.h }[2], [x21], #0x2\n" + "st1 { v14.h }[2], [x15], #0x2\n" + "st1 { v9.h }[2], [x17], #0x2\n" + "st1 { v7.h }[2], [x16], #0x2\n" + "tbz x4, #0, 123f\n" + "st1 { v11.b }[6], [x21], #0x1\n" + "st1 { v14.b }[6], [x15], #0x1\n" + "st1 { v9.b }[6], [x17], #0x1\n" + "st1 { v7.b }[6], [x16], #0x1\n" + "b 123f\n" + "120:" // Oddments: Bit 2: Bit 1: Unset + "tbz x4, #0, 123f\n" + "st1 { v11.b }[4], [x21], #0x1\n" + "st1 { v14.b }[4], [x15], #0x1\n" + "st1 { v9.b }[4], [x17], #0x1\n" + "st1 { v7.b }[4], [x16], #0x1\n" + "b 123f\n" + "121:" // Oddments: Bit 2: Unset + "tbz x4, #1, 122f\n" + "st1 { v11.h }[0], [x21], #0x2\n" + "st1 { v14.h }[0], [x15], #0x2\n" + "st1 { v9.h }[0], [x17], #0x2\n" + "st1 { v7.h }[0], [x16], #0x2\n" + "tbz x4, #0, 123f\n" + "st1 { v11.b }[2], [x21], #0x1\n" + "st1 { v14.b }[2], [x15], #0x1\n" + "st1 { v9.b }[2], [x17], #0x1\n" + "st1 { v7.b }[2], [x16], #0x1\n" + "b 123f\n" + "122:" // Oddments: Bit 2: Unset: Bit 1: Unset + "tbz x4, #0, 123f\n" + "st1 { v11.b }[0], [x21], #0x1\n" + "st1 { v14.b }[0], [x15], #0x1\n" + "st1 { v9.b }[0], [x17], #0x1\n" + "st1 { v7.b }[0], [x16], #0x1\n" + "123:" // Oddments: Bit 2: End + "124:" // End + : + : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" 
(offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + ); +} + +} // namespace depthwise +} // namespace arm_conv + +#endif // defined(__aarch64__) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp index 1bacb5ffe7..281511a1fb 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,37 +36,24 @@ namespace depthwise { void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); -struct a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst +class a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef int8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; - - constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size; + a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} - kern_type kernel = a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } - a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } }; } // namespace depthwise
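The hunk above is the heart of the refactor this patch applies to every kernel descriptor: the old plain struct of typedefs, constexpr geometry and static packing hooks becomes a class deriving from DepthwiseDepthfirstStrategy, with the geometry passed to the Parent constructor as (output_rows, output_cols, kernel_rows, kernel_cols, stride_rows, stride_cols) and everything else exposed through virtual overrides; in the matching generic.cpp below, Params::weights likewise becomes a type-erased const void * so the shared driver can hand over packed parameters without knowing the element type. A hedged sketch of how a driver might consume the new interface (the driver itself is not part of this excerpt):

    // Sketch only: cpu_info is a placeholder const CPUInfo *, and the
    // virtuals named here are exactly the ones the hunk above introduces.
    void use_strategy(const CPUInfo *cpu_info)
    {
        const auto *strat = new a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(cpu_info);
        auto kernel_fn = strat->get_kernel();        // replaces the old kern_type member
        arm_gemm::VLType vl = strat->get_vl_type();  // replaces constexpr vl_type
        unsigned int acc_vl = strat->get_accumulator_depth_vl();
        (void)kernel_fn; (void)vl; (void)acc_vl;
        delete strat;
    }

diff --git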
a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp index 8cbbfae00d..22f95746fc 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -46,7 +46,7 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( struct Params { long unsigned int n_channels; - const int8_t *weights; + const void *weights; const int32_t *bias; const arm_gemm::Requantize32 *requant; const int32_t *const requant_muls; @@ -57,7 +57,7 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( Params( long unsigned int n_channels, const uint8_t *const *inptrs_raw, - const int8_t *const weights, + const void *const weights, const int32_t *const bias, const arm_gemm::Requantize32 &qp, const int32_t *const requant_muls, @@ -91,505 +91,489 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( requant_muls, requant_shifts, outptrs); __asm__ __volatile__( + "ldr x19, [%x[params], %[offsetof_Params_requant]]\n" "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n" - "mov x17, #0x0\n" - "ldr x16, [%x[params], %[offsetof_Params_weights]]\n" + "add x24, x19, %[offsetof_Requantize32_a_offset]\n" + "add x23, x19, %[offsetof_Requantize32_b_offset]\n" + "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x21, x19, %[offsetof_Requantize32_c_offset]\n" + "add x20, x19, %[offsetof_Requantize32_minval]\n" + "ldr x17, [%x[params], %[offsetof_Params_weights]]\n" + "add x19, x19, %[offsetof_Requantize32_maxval]\n" + "ld1r { v22.16b }, [x24]\n" + "ld1r { v12.16b }, [x23]\n" + "lsr x16, x8, #0x3\n" + "ld1r { v14.8h }, [x21]\n" + "ld1r { v17.8h }, [x20]\n" "mov x15, #0x0\n" - "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" - "add x14, %x[params], %[offsetof_Params_inptrs]\n" + "mov x14, #0x0\n" + "ld1r { v15.8h }, [x19]\n" "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n" - "lsr x12, x8, #0x3\n" + "add x12, %x[params], %[offsetof_Params_inptrs]\n" "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n" - "add x19, x22, %[offsetof_Requantize32_a_offset]\n" - "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" - "add x20, x22, %[offsetof_Requantize32_b_offset]\n" - "ld1r { v21.16b }, [x19]\n" - "add x19, x22, %[offsetof_Requantize32_c_offset]\n" - "ld1r { v17.16b }, [x20]\n" - "add x20, x22, %[offsetof_Requantize32_minval]\n" - "ld1r { v13.4s }, [x19]\n" - "add x19, x22, %[offsetof_Requantize32_maxval]\n" - "ld1r { v15.4s }, [x20]\n" - "ld1r { v14.4s }, [x19]\n" - "ldp x10, x9, [x21, #0x0]\n" - "ldp x28, x27, [x21, #0x10]\n" - "cbz x12, 3f\n" - "subs x12, x12, #0x1\n" + "ldp x10, x9, [x22, #0x0]\n" + "ldp x28, x27, [x22, #0x10]\n" + "cbz x16, 3f\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" - "ldr q11, [x19, #0x0]\n" - "mov v23.16b, v11.16b\n" + "ldr q13, [x19, #0x0]\n" + "subs x16, x16, #0x1\n" + "mov v19.16b, v13.16b\n" "ldr q26, [x19, #0x10]\n" "add x19, x19, #0x20\n" - "mov v12.16b, v11.16b\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" - "mov v24.16b, v11.16b\n" - "ldr d0, [x16, #0x0]\n" - "ldr d1, [x16, #0x8]\n" - "mov v9.16b, v26.16b\n" - "ldr d2, [x16, #0x10]\n" - "mov v22.16b, v26.16b\n" - 
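The rewritten prologue in this hunk front-loads all of the requantisation scalars: it computes the address of each Requantize32 field once via offsetof and broadcasts it across a whole vector with ld1r, so the main loop never reloads them and the values sit in registers shared by all four accumulator tiles. Roughly, in intrinsics (assuming the arm_gemm::Requantize32 layout named in the operand list):

    #include <arm_neon.h>

    // Hedged intrinsics view of the ld1r broadcasts in the new prologue.
    static void broadcast_qparams(const arm_gemm::Requantize32 &qp)
    {
        uint8x16_t a_off = vdupq_n_u8((uint8_t)qp.a_offset);   // ld1r { v22.16b }
        int8x16_t  b_off = vdupq_n_s8((int8_t)qp.b_offset);    // ld1r { v12.16b }
        int16x8_t  c_off = vdupq_n_s16((int16_t)qp.c_offset);  // ld1r { v14.8h }
        int16x8_t  vmin  = vdupq_n_s16((int16_t)qp.minval);    // ld1r { v17.8h }
        int16x8_t  vmax  = vdupq_n_s16((int16_t)qp.maxval);    // ld1r { v15.8h }
        (void)a_off; (void)b_off; (void)c_off; (void)vmin; (void)vmax;
    }
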
"ldr d3, [x16, #0x18]\n" - "mov v10.16b, v26.16b\n" - "ldr d4, [x16, #0x20]\n" - "ssubl v0.8h, v0.8b, v17.8b\n" - "ldr d5, [x16, #0x28]\n" - "ssubl v1.8h, v1.8b, v17.8b\n" - "ldr d6, [x16, #0x30]\n" - "ssubl v2.8h, v2.8b, v17.8b\n" - "ldr d7, [x16, #0x38]\n" - "ssubl v3.8h, v3.8b, v17.8b\n" - "ldr d8, [x16, #0x40]\n" - "ssubl v4.8h, v4.8b, v17.8b\n" - "ldp x23, x22, [x14, #0x0]\n" - "ssubl v5.8h, v5.8b, v17.8b\n" - "ldp x21, x20, [x14, #0x10]\n" - "ssubl v6.8h, v6.8b, v17.8b\n" - "ssubl v7.8h, v7.8b, v17.8b\n" - "ldr x19, [x14, #0x20]\n" - "ssubl v8.8h, v8.8b, v17.8b\n" - "ldr d31, [x23, x17]\n" - "usubl v31.8h, v31.8b, v21.8b\n" - "ldr d30, [x22, x17]\n" - "ldr d29, [x21, x17]\n" - "usubl v30.8h, v30.8b, v21.8b\n" - "ldr d28, [x20, x17]\n" - "usubl v29.8h, v29.8b, v21.8b\n" - "ldr d27, [x19, x17]\n" - "usubl v28.8h, v28.8b, v21.8b\n" - "usubl v27.8h, v27.8b, v21.8b\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v11.16b, v26.16b\n" + "mov v18.16b, v13.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v24.16b, v26.16b\n" + "mov v9.16b, v13.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v23.16b, v26.16b\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ldp x23, x22, [x12, #0x0]\n" + "ldp x21, x20, [x12, #0x10]\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ssubl v4.8h, v4.8b, v12.8b\n" + "ldr x19, [x12, #0x20]\n" + "ldr d31, [x23, x15]\n" + "ssubl v5.8h, v5.8b, v12.8b\n" + "ssubl v6.8h, v6.8b, v12.8b\n" + "ldr d30, [x22, x15]\n" + "ldr d29, [x21, x15]\n" + "ssubl v7.8h, v7.8b, v12.8b\n" + "ssubl v8.8h, v8.8b, v12.8b\n" + "ldr d28, [x20, x15]\n" + "ldr d27, [x19, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "usubl v27.8h, v27.8b, v22.8b\n" "beq 2f\n" "1:" // Loop - "smlal v11.4s, v31.4h, v4.4h\n" - "ldr x21, [x14, #0x28]\n" - "add x16, x16, #0x48\n" + "smlal v13.4s, v31.4h, v4.4h\n" "smlal2 v26.4s, v31.8h, v4.8h\n" - "ldr x20, [x14, #0x30]\n" - "subs x12, x12, #0x1\n" - "smlal v23.4s, v31.4h, v3.4h\n" - "ldr x26, [x14, #0x38]\n" - "smlal2 v9.4s, v31.8h, v3.8h\n" - "ldr x25, [x14, #0x40]\n" - "smlal v12.4s, v31.4h, v1.4h\n" - "ldr x19, [x14, #0x48]\n" - "smlal2 v22.4s, v31.8h, v1.8h\n" - "ldr x24, [x14, #0x50]\n" - "smlal v24.4s, v31.4h, v0.4h\n" - "ldr x23, [x14, #0x58]\n" - "smlal2 v10.4s, v31.8h, v0.8h\n" - "ldr d31, [x21, x17]\n" - "smlal v11.4s, v30.4h, v0.4h\n" - "ldr x22, [x14, #0x60]\n" + "ldr x21, [x12, #0x28]\n" + "ldr x26, [x12, #0x38]\n" + "smlal v19.4s, v31.4h, v3.4h\n" + "smlal2 v11.4s, v31.8h, v3.8h\n" + "ldr x20, [x12, #0x30]\n" + "ldr x25, [x12, #0x40]\n" + "smlal v13.4s, v30.4h, v0.4h\n" "smlal2 v26.4s, v30.8h, v0.8h\n" - "ldr d30, [x19, x17]\n" - "smlal v23.4s, v29.4h, v2.4h\n" - "ldr x21, [x14, #0x68]\n" - "smlal2 v9.4s, v29.8h, v2.8h\n" - "ldr d29, [x20, x17]\n" - "smlal v11.4s, v28.4h, v5.4h\n" - "ldr x20, [x14, #0x70]\n" + "ldr x19, [x12, #0x48]\n" + "ldr d30, [x19, x15]\n" + "smlal v19.4s, v29.4h, v2.4h\n" + "smlal2 v11.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x15]\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v24.4s, v31.8h, v1.8h\n" + "ldr x24, [x12, #0x50]\n" + "ldr x23, [x12, #0x58]\n" + "smlal v9.4s, v31.4h, v0.4h\n" + "smlal2 v23.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v13.4s, v28.4h, v5.4h\n" 
"smlal2 v26.4s, v28.8h, v5.8h\n" - "ldr x19, [x14, #0x78]\n" - "smlal v23.4s, v28.4h, v4.4h\n" - "ldr q25, [x13, #0x0]\n" - "smlal2 v9.4s, v28.8h, v4.8h\n" - "ldr q18, [x11, #0x0]\n" - "smlal v12.4s, v28.4h, v2.4h\n" - "ldr q16, [x13, #0x10]\n" - "add x13, x13, #0x20\n" - "smlal2 v22.4s, v28.8h, v2.8h\n" - "ldr q20, [x11, #0x10]\n" - "add x11, x11, #0x20\n" - "smlal v24.4s, v28.4h, v1.4h\n" - "smlal2 v10.4s, v28.8h, v1.8h\n" - "ldr d28, [x26, x17]\n" - "usubl v31.8h, v31.8b, v21.8b\n" - "smlal v11.4s, v27.4h, v7.4h\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "ldr x22, [x12, #0x60]\n" + "smlal v19.4s, v28.4h, v4.4h\n" + "smlal2 v11.4s, v28.8h, v4.8h\n" + "ldr x21, [x12, #0x68]\n" + "ldr x20, [x12, #0x70]\n" + "smlal v18.4s, v28.4h, v2.4h\n" + "smlal2 v24.4s, v28.8h, v2.8h\n" + "ldr x19, [x12, #0x78]\n" + "ldr q21, [x13, #0x0]\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x15]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal v13.4s, v27.4h, v7.4h\n" "smlal2 v26.4s, v27.8h, v7.8h\n" - "smlal v12.4s, v31.4h, v6.4h\n" - "smlal2 v22.4s, v31.8h, v6.8h\n" - "ldr d31, [x25, x17]\n" - "smlal v23.4s, v27.4h, v6.4h\n" - "smlal2 v9.4s, v27.8h, v6.8h\n" - "smlal v12.4s, v27.4h, v4.4h\n" - "smlal2 v22.4s, v27.8h, v4.8h\n" - "smlal v24.4s, v27.4h, v3.4h\n" - "smlal2 v10.4s, v27.8h, v3.8h\n" - "usubl v29.8h, v29.8b, v21.8b\n" - "usubl v28.8h, v28.8b, v21.8b\n" - "usubl v31.8h, v31.8b, v21.8b\n" - "smlal v24.4s, v29.4h, v8.4h\n" - "smlal2 v10.4s, v29.8h, v8.8h\n" - "ldr d29, [x24, x17]\n" - "smlal v11.4s, v28.4h, v1.4h\n" + "ldr q25, [x11, #0x0]\n" + "ldr q10, [x13, #0x10]\n" + "smlal v19.4s, v27.4h, v6.4h\n" + "smlal2 v11.4s, v27.8h, v6.8h\n" + "ldr q16, [x11, #0x10]\n" + "add x17, x17, #0x48\n" + "smlal v18.4s, v31.4h, v6.4h\n" + "smlal2 v24.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" + "subs x16, x16, #0x1\n" + "add x13, x13, #0x20\n" + "smlal v13.4s, v28.4h, v1.4h\n" "smlal2 v26.4s, v28.8h, v1.8h\n" - "smlal v23.4s, v28.4h, v0.4h\n" - "smlal2 v9.4s, v28.8h, v0.8h\n" - "ldr d28, [x23, x17]\n" - "smlal v11.4s, v31.4h, v2.4h\n" + "add x11, x11, #0x20\n" + "smlal v19.4s, v28.4h, v0.4h\n" + "smlal2 v11.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x15]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v24.4s, v27.8h, v4.8h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x15]\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v13.4s, v31.4h, v2.4h\n" "smlal2 v26.4s, v31.8h, v2.8h\n" - "smlal v23.4s, v31.4h, v1.4h\n" - "smlal2 v9.4s, v31.8h, v1.8h\n" - "ldr d31, [x22, x17]\n" - "usubl v30.8h, v30.8b, v21.8b\n" - "usubl v29.8h, v29.8b, v21.8b\n" - "usubl v28.8h, v28.8b, v21.8b\n" - "smlal v11.4s, v30.4h, v8.4h\n" + "smlal v19.4s, v31.4h, v1.4h\n" + "smlal2 v11.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v18.4s, v30.4h, v5.4h\n" + "smlal v9.4s, v30.4h, v4.4h\n" + "smlal v13.4s, v30.4h, v8.4h\n" "smlal2 v26.4s, v30.8h, v8.8h\n" - "smlal v23.4s, v30.4h, v7.4h\n" - "smlal2 v9.4s, v30.8h, v7.8h\n" - "smlal v12.4s, v30.4h, v5.4h\n" - "smlal2 v22.4s, v30.8h, v5.8h\n" - "smlal v24.4s, v30.4h, v4.4h\n" - "smlal2 v10.4s, v30.8h, v4.8h\n" - "ldr d30, [x21, x17]\n" - "smlal v11.4s, v29.4h, v3.4h\n" + "smlal v19.4s, v30.4h, v7.4h\n" + "smlal2 v11.4s, v30.8h, v7.8h\n" + "smlal2 v24.4s, v30.8h, v5.8h\n" + "smlal2 v23.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x15]\n" + "usubl 
v30.8h, v30.8b, v22.8b\n" + "smlal v18.4s, v29.4h, v0.4h\n" + "smlal v9.4s, v28.4h, v2.4h\n" + "smlal v13.4s, v29.4h, v3.4h\n" "smlal2 v26.4s, v29.8h, v3.8h\n" - "smlal v12.4s, v29.4h, v0.4h\n" - "smlal2 v22.4s, v29.8h, v0.8h\n" - "ldr d29, [x20, x17]\n" - "smlal v23.4s, v28.4h, v5.4h\n" - "smlal2 v9.4s, v28.8h, v5.8h\n" - "smlal v24.4s, v28.4h, v2.4h\n" - "smlal2 v10.4s, v28.8h, v2.8h\n" - "ldr d28, [x19, x17]\n" - "add x17, x17, #0x8\n" - "usubl v31.8h, v31.8b, v21.8b\n" - "usubl v30.8h, v30.8b, v21.8b\n" - "usubl v29.8h, v29.8b, v21.8b\n" - "smlal v11.4s, v31.4h, v6.4h\n" - "smlal2 v26.4s, v31.8h, v6.8h\n" - "smlal v12.4s, v31.4h, v3.4h\n" - "smlal2 v22.4s, v31.8h, v3.8h\n" - "smlal v23.4s, v30.4h, v8.4h\n" - "smlal2 v9.4s, v30.8h, v8.8h\n" - "smlal v24.4s, v30.4h, v5.4h\n" - "smlal2 v10.4s, v30.8h, v5.8h\n" - "smlal v12.4s, v29.4h, v7.4h\n" - "smlal2 v22.4s, v29.8h, v7.8h\n" - "smlal v24.4s, v29.4h, v6.4h\n" - "smlal2 v10.4s, v29.8h, v6.8h\n" - "usubl v28.8h, v28.8b, v21.8b\n" - "sqrdmulh v11.4s, v11.4s, v25.4s\n" - "sqrdmulh v26.4s, v26.4s, v16.4s\n" - "smlal v12.4s, v28.4h, v8.4h\n" - "smlal2 v22.4s, v28.8h, v8.8h\n" - "smlal v24.4s, v28.4h, v7.4h\n" - "smlal2 v10.4s, v28.8h, v7.8h\n" - "and v19.16b, v11.16b, v18.16b\n" - "and v5.16b, v26.16b, v20.16b\n" - "sqrdmulh v23.4s, v23.4s, v25.4s\n" - "sshr v19.4s, v19.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqrdmulh v9.4s, v9.4s, v16.4s\n" - "sqadd v11.4s, v11.4s, v19.4s\n" - "sqadd v26.4s, v26.4s, v5.4s\n" - "and v28.16b, v23.16b, v18.16b\n" - "and v8.16b, v9.16b, v20.16b\n" - "srshl v11.4s, v11.4s, v18.4s\n" - "srshl v26.4s, v26.4s, v20.4s\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "add v11.4s, v11.4s, v13.4s\n" - "add v26.4s, v26.4s, v13.4s\n" - "sqadd v23.4s, v23.4s, v28.4s\n" - "smin v11.4s, v11.4s, v14.4s\n" - "smin v26.4s, v26.4s, v14.4s\n" - "sqadd v9.4s, v9.4s, v8.4s\n" - "smax v11.4s, v11.4s, v15.4s\n" - "smax v26.4s, v26.4s, v15.4s\n" - "srshl v23.4s, v23.4s, v18.4s\n" - "srshl v9.4s, v9.4s, v20.4s\n" - "uzp1 v11.16b, v11.16b, v26.16b\n" - "sqrdmulh v12.4s, v12.4s, v25.4s\n" - "uzp1 v11.16b, v11.16b, v11.16b\n" - "str d11, [x10, x15]\n" - "add v23.4s, v23.4s, v13.4s\n" - "add v9.4s, v9.4s, v13.4s\n" - "and v1.16b, v12.16b, v18.16b\n" - "sqrdmulh v22.4s, v22.4s, v16.4s\n" - "smin v23.4s, v23.4s, v14.4s\n" - "smin v9.4s, v9.4s, v14.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "smax v23.4s, v23.4s, v15.4s\n" - "smax v9.4s, v9.4s, v15.4s\n" - "sqadd v12.4s, v12.4s, v1.4s\n" - "and v0.16b, v22.16b, v20.16b\n" - "uzp1 v23.16b, v23.16b, v9.16b\n" - "sqrdmulh v24.4s, v24.4s, v25.4s\n" - "uzp1 v23.16b, v23.16b, v23.16b\n" - "str d23, [x9, x15]\n" - "srshl v12.4s, v12.4s, v18.4s\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "and v26.16b, v24.16b, v18.16b\n" - "sqrdmulh v10.4s, v10.4s, v16.4s\n" - "sqadd v22.4s, v22.4s, v0.4s\n" - "add v12.4s, v12.4s, v13.4s\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "and v16.16b, v10.16b, v20.16b\n" - "smin v12.4s, v12.4s, v14.4s\n" - "srshl v22.4s, v22.4s, v20.4s\n" - "sqadd v24.4s, v24.4s, v26.4s\n" - "smax v12.4s, v12.4s, v15.4s\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "add v22.4s, v22.4s, v13.4s\n" - "srshl v24.4s, v24.4s, v18.4s\n" - "sqadd v10.4s, v10.4s, v16.4s\n" - "smin v22.4s, v22.4s, v14.4s\n" - "add v24.4s, v24.4s, v13.4s\n" - "smax v22.4s, v22.4s, v15.4s\n" - "srshl v10.4s, v10.4s, v20.4s\n" - "smin v24.4s, v24.4s, v14.4s\n" - "uzp1 v12.16b, v12.16b, v22.16b\n" - "add v10.4s, v10.4s, v13.4s\n" - "uzp1 v12.16b, v12.16b, v12.16b\n" - "str d12, [x28, x15]\n" - "smax v24.4s, v24.4s, 
v15.4s\n" - "smin v10.4s, v10.4s, v14.4s\n" - "smax v10.4s, v10.4s, v15.4s\n" - "uzp1 v24.16b, v24.16b, v10.16b\n" - "uzp1 v24.16b, v24.16b, v24.16b\n" - "str d24, [x27, x15]\n" + "smlal2 v24.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, x15]\n" + "smlal2 v23.4s, v28.8h, v2.8h\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v3.4h\n" + "smlal v9.4s, v30.4h, v5.4h\n" + "smlal v19.4s, v28.4h, v5.4h\n" + "smlal2 v11.4s, v28.8h, v5.8h\n" + "ldr d28, [x19, x15]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal2 v24.4s, v31.8h, v3.8h\n" + "smlal2 v23.4s, v30.8h, v5.8h\n" "add x15, x15, #0x8\n" + "smlal v18.4s, v29.4h, v7.4h\n" + "smlal v9.4s, v29.4h, v6.4h\n" + "smlal2 v24.4s, v29.8h, v7.8h\n" + "smlal2 v23.4s, v29.8h, v6.8h\n" + "smlal v13.4s, v31.4h, v6.4h\n" + "smlal v19.4s, v30.4h, v8.4h\n" + "sqdmulh v13.4s, v13.4s, v21.4s\n" + "smlal v18.4s, v28.4h, v8.4h\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "sqdmulh v19.4s, v19.4s, v21.4s\n" + "smlal2 v26.4s, v31.8h, v6.8h\n" + "smlal2 v11.4s, v30.8h, v8.8h\n" + "sqdmulh v18.4s, v18.4s, v21.4s\n" + "smlal2 v24.4s, v28.8h, v8.8h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" + "sqdmulh v9.4s, v9.4s, v21.4s\n" + "and v7.16b, v13.16b, v25.16b\n" + "sqdmulh v26.4s, v26.4s, v10.4s\n" + "and v4.16b, v19.16b, v25.16b\n" + "sqdmulh v11.4s, v11.4s, v10.4s\n" + "and v21.16b, v18.16b, v25.16b\n" + "sqdmulh v24.4s, v24.4s, v10.4s\n" + "and v20.16b, v9.16b, v25.16b\n" + "sqdmulh v23.4s, v23.4s, v10.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v29.16b, v26.16b, v16.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v10.16b, v11.16b, v16.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v31.16b, v24.16b, v16.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v30.16b, v23.16b, v16.16b\n" + "sqadd v13.4s, v13.4s, v7.4s\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v20.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v25.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "srshl v19.4s, v19.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v10.4s\n" + "srshl v18.4s, v18.4s, v25.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "srshl v9.4s, v9.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "srshl v26.4s, v26.4s, v16.4s\n" + "sqxtn v13.4h, v13.4s\n" + "srshl v11.4s, v11.4s, v16.4s\n" + "sqxtn v19.4h, v19.4s\n" + "srshl v24.4s, v24.4s, v16.4s\n" + "sqxtn v18.4h, v18.4s\n" + "srshl v23.4s, v23.4s, v16.4s\n" + "sqxtn v9.4h, v9.4s\n" + "sqxtn2 v13.8h, v26.4s\n" + "sqxtn2 v19.8h, v11.4s\n" + "sqxtn2 v18.8h, v24.4s\n" + "sqxtn2 v9.8h, v23.4s\n" + "sqadd v13.8h, v13.8h, v14.8h\n" + "sqadd v19.8h, v19.8h, v14.8h\n" + "sqadd v18.8h, v18.8h, v14.8h\n" + "sqadd v9.8h, v9.8h, v14.8h\n" + "smax v13.8h, v13.8h, v17.8h\n" + "smax v19.8h, v19.8h, v17.8h\n" + "smax v18.8h, v18.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smin v13.8h, v13.8h, v15.8h\n" + "smin v19.8h, v19.8h, v15.8h\n" + "smin v18.8h, v18.8h, v15.8h\n" + "smin v9.8h, v9.8h, v15.8h\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str d13, [x10, x14]\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "str d19, [x9, x14]\n" + "str d18, [x28, x14]\n" + "str d9, [x27, x14]\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" - "ldr q11, [x19, #0x0]\n" - "mov v23.16b, v11.16b\n" + "ldr q13, [x19, #0x0]\n" + "add x14, x14, #0x8\n" "ldr q26, [x19, #0x10]\n" "add x19, x19, #0x20\n" - "mov v12.16b, v11.16b\n" "str x19, [%x[params], 
%[offsetof_Params_bias]]\n" - "mov v24.16b, v11.16b\n" - "ldr d0, [x16, #0x0]\n" - "ldr d1, [x16, #0x8]\n" - "mov v9.16b, v26.16b\n" - "ldr d2, [x16, #0x10]\n" - "mov v22.16b, v26.16b\n" - "ldr d3, [x16, #0x18]\n" - "mov v10.16b, v26.16b\n" - "ldr d4, [x16, #0x20]\n" - "ssubl v0.8h, v0.8b, v17.8b\n" - "ldr d5, [x16, #0x28]\n" - "ssubl v1.8h, v1.8b, v17.8b\n" - "ldr d6, [x16, #0x30]\n" - "ssubl v2.8h, v2.8b, v17.8b\n" - "ldr d7, [x16, #0x38]\n" - "ssubl v3.8h, v3.8b, v17.8b\n" - "ldr d8, [x16, #0x40]\n" - "ssubl v4.8h, v4.8b, v17.8b\n" - "ldp x23, x22, [x14, #0x0]\n" - "ssubl v5.8h, v5.8b, v17.8b\n" - "ldp x21, x20, [x14, #0x10]\n" - "ssubl v6.8h, v6.8b, v17.8b\n" - "ssubl v7.8h, v7.8b, v17.8b\n" - "ldr x19, [x14, #0x20]\n" - "ssubl v8.8h, v8.8b, v17.8b\n" - "ldr d31, [x23, x17]\n" - "usubl v31.8h, v31.8b, v21.8b\n" - "ldr d30, [x22, x17]\n" - "ldr d29, [x21, x17]\n" - "usubl v30.8h, v30.8b, v21.8b\n" - "ldr d28, [x20, x17]\n" - "usubl v29.8h, v29.8b, v21.8b\n" - "ldr d27, [x19, x17]\n" - "usubl v28.8h, v28.8b, v21.8b\n" - "usubl v27.8h, v27.8b, v21.8b\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v19.16b, v13.16b\n" + "mov v11.16b, v26.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v18.16b, v13.16b\n" + "mov v24.16b, v26.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v9.16b, v13.16b\n" + "mov v23.16b, v26.16b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ldp x23, x22, [x12, #0x0]\n" + "ldp x21, x20, [x12, #0x10]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ldr x19, [x12, #0x20]\n" + "ldr d31, [x23, x15]\n" + "ssubl v4.8h, v4.8b, v12.8b\n" + "ssubl v5.8h, v5.8b, v12.8b\n" + "ldr d30, [x22, x15]\n" + "ldr d29, [x21, x15]\n" + "ssubl v6.8h, v6.8b, v12.8b\n" + "ssubl v7.8h, v7.8b, v12.8b\n" + "ldr d28, [x20, x15]\n" + "ldr d27, [x19, x15]\n" + "ssubl v8.8h, v8.8b, v12.8b\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "usubl v27.8h, v27.8b, v22.8b\n" "bgt 1b\n" "2:" // Tail - "smlal v11.4s, v31.4h, v4.4h\n" - "ldr x21, [x14, #0x28]\n" - "tst x8, #0x7\n" + "smlal v13.4s, v31.4h, v4.4h\n" "smlal2 v26.4s, v31.8h, v4.8h\n" - "ldr x20, [x14, #0x30]\n" - "smlal v23.4s, v31.4h, v3.4h\n" - "ldr x26, [x14, #0x38]\n" - "smlal2 v9.4s, v31.8h, v3.8h\n" - "ldr x25, [x14, #0x40]\n" - "smlal v12.4s, v31.4h, v1.4h\n" - "ldr x19, [x14, #0x48]\n" - "smlal2 v22.4s, v31.8h, v1.8h\n" - "ldr x24, [x14, #0x50]\n" - "smlal v24.4s, v31.4h, v0.4h\n" - "ldr x23, [x14, #0x58]\n" - "smlal2 v10.4s, v31.8h, v0.8h\n" - "ldr d31, [x21, x17]\n" - "smlal v11.4s, v30.4h, v0.4h\n" - "ldr x22, [x14, #0x60]\n" + "ldr x21, [x12, #0x28]\n" + "ldr x26, [x12, #0x38]\n" + "smlal v19.4s, v31.4h, v3.4h\n" + "smlal2 v11.4s, v31.8h, v3.8h\n" + "ldr x20, [x12, #0x30]\n" + "ldr x25, [x12, #0x40]\n" + "smlal v13.4s, v30.4h, v0.4h\n" "smlal2 v26.4s, v30.8h, v0.8h\n" - "ldr d30, [x19, x17]\n" - "smlal v23.4s, v29.4h, v2.4h\n" - "ldr x21, [x14, #0x68]\n" - "smlal2 v9.4s, v29.8h, v2.8h\n" - "ldr d29, [x20, x17]\n" - "smlal v11.4s, v28.4h, v5.4h\n" - "ldr x20, [x14, #0x70]\n" + "ldr x19, [x12, #0x48]\n" + "ldr d30, [x19, x15]\n" + "smlal v19.4s, v29.4h, v2.4h\n" + "smlal2 v11.4s, v29.8h, v2.8h\n" + "ldr d29, [x20, x15]\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v1.4h\n" + "smlal2 v24.4s, v31.8h, v1.8h\n" + "ldr x24, [x12, #0x50]\n" + "ldr x23, 
[x12, #0x58]\n" + "smlal v9.4s, v31.4h, v0.4h\n" + "smlal2 v23.4s, v31.8h, v0.8h\n" + "ldr d31, [x21, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v13.4s, v28.4h, v5.4h\n" "smlal2 v26.4s, v28.8h, v5.8h\n" - "ldr x19, [x14, #0x78]\n" - "smlal v23.4s, v28.4h, v4.4h\n" - "ldr q25, [x13, #0x0]\n" - "smlal2 v9.4s, v28.8h, v4.8h\n" - "ldr q18, [x11, #0x0]\n" - "smlal v12.4s, v28.4h, v2.4h\n" - "ldr q16, [x13, #0x10]\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "ldr x22, [x12, #0x60]\n" + "smlal v19.4s, v28.4h, v4.4h\n" + "smlal2 v11.4s, v28.8h, v4.8h\n" + "ldr x21, [x12, #0x68]\n" + "ldr x20, [x12, #0x70]\n" + "smlal v18.4s, v28.4h, v2.4h\n" + "smlal2 v24.4s, v28.8h, v2.8h\n" + "ldr x19, [x12, #0x78]\n" + "ldr q21, [x13, #0x0]\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" + "ldr d28, [x26, x15]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal v13.4s, v27.4h, v7.4h\n" + "smlal2 v26.4s, v27.8h, v7.8h\n" + "ldr q25, [x11, #0x0]\n" + "ldr q10, [x13, #0x10]\n" + "smlal v19.4s, v27.4h, v6.4h\n" + "smlal2 v11.4s, v27.8h, v6.8h\n" + "ldr q16, [x11, #0x10]\n" + "tst x8, #0x7\n" + "smlal v18.4s, v31.4h, v6.4h\n" + "smlal2 v24.4s, v31.8h, v6.8h\n" + "ldr d31, [x25, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" "add x13, x13, #0x20\n" - "smlal2 v22.4s, v28.8h, v2.8h\n" - "ldr q20, [x11, #0x10]\n" "add x11, x11, #0x20\n" - "smlal v24.4s, v28.4h, v1.4h\n" - "smlal2 v10.4s, v28.8h, v1.8h\n" - "ldr d28, [x26, x17]\n" - "usubl v31.8h, v31.8b, v21.8b\n" - "smlal v11.4s, v27.4h, v7.4h\n" - "smlal2 v26.4s, v27.8h, v7.8h\n" - "smlal v12.4s, v31.4h, v6.4h\n" - "smlal2 v22.4s, v31.8h, v6.8h\n" - "ldr d31, [x25, x17]\n" - "smlal v23.4s, v27.4h, v6.4h\n" - "smlal2 v9.4s, v27.8h, v6.8h\n" - "smlal v12.4s, v27.4h, v4.4h\n" - "smlal2 v22.4s, v27.8h, v4.8h\n" - "smlal v24.4s, v27.4h, v3.4h\n" - "smlal2 v10.4s, v27.8h, v3.8h\n" - "usubl v29.8h, v29.8b, v21.8b\n" - "usubl v28.8h, v28.8b, v21.8b\n" - "usubl v31.8h, v31.8b, v21.8b\n" - "smlal v24.4s, v29.4h, v8.4h\n" - "smlal2 v10.4s, v29.8h, v8.8h\n" - "ldr d29, [x24, x17]\n" - "smlal v11.4s, v28.4h, v1.4h\n" + "smlal v13.4s, v28.4h, v1.4h\n" "smlal2 v26.4s, v28.8h, v1.8h\n" - "smlal v23.4s, v28.4h, v0.4h\n" - "smlal2 v9.4s, v28.8h, v0.8h\n" - "ldr d28, [x23, x17]\n" - "smlal v11.4s, v31.4h, v2.4h\n" + "smlal v19.4s, v28.4h, v0.4h\n" + "smlal2 v11.4s, v28.8h, v0.8h\n" + "ldr d28, [x23, x15]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v24.4s, v27.8h, v4.8h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "ldr d29, [x24, x15]\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v13.4s, v31.4h, v2.4h\n" "smlal2 v26.4s, v31.8h, v2.8h\n" - "smlal v23.4s, v31.4h, v1.4h\n" - "smlal2 v9.4s, v31.8h, v1.8h\n" - "ldr d31, [x22, x17]\n" - "usubl v30.8h, v30.8b, v21.8b\n" - "usubl v29.8h, v29.8b, v21.8b\n" - "usubl v28.8h, v28.8b, v21.8b\n" - "smlal v11.4s, v30.4h, v8.4h\n" + "smlal v19.4s, v31.4h, v1.4h\n" + "smlal2 v11.4s, v31.8h, v1.8h\n" + "ldr d31, [x22, x15]\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v18.4s, v30.4h, v5.4h\n" + "smlal v9.4s, v30.4h, v4.4h\n" + "smlal v13.4s, v30.4h, v8.4h\n" "smlal2 v26.4s, v30.8h, v8.8h\n" - "smlal v23.4s, v30.4h, v7.4h\n" - "smlal2 v9.4s, v30.8h, v7.8h\n" - "smlal v12.4s, v30.4h, v5.4h\n" - "smlal2 v22.4s, v30.8h, v5.8h\n" - "smlal v24.4s, v30.4h, v4.4h\n" - "smlal2 v10.4s, v30.8h, v4.8h\n" - "ldr d30, [x21, x17]\n" - "smlal v11.4s, v29.4h, v3.4h\n" + "smlal v19.4s, v30.4h, v7.4h\n" + "smlal2 v11.4s, 
v30.8h, v7.8h\n" + "smlal2 v24.4s, v30.8h, v5.8h\n" + "smlal2 v23.4s, v30.8h, v4.8h\n" + "ldr d30, [x21, x15]\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "smlal v18.4s, v29.4h, v0.4h\n" + "smlal v9.4s, v28.4h, v2.4h\n" + "smlal v13.4s, v29.4h, v3.4h\n" "smlal2 v26.4s, v29.8h, v3.8h\n" - "smlal v12.4s, v29.4h, v0.4h\n" - "smlal2 v22.4s, v29.8h, v0.8h\n" - "ldr d29, [x20, x17]\n" - "smlal v23.4s, v28.4h, v5.4h\n" - "smlal2 v9.4s, v28.8h, v5.8h\n" - "smlal v24.4s, v28.4h, v2.4h\n" - "smlal2 v10.4s, v28.8h, v2.8h\n" - "ldr d28, [x19, x17]\n" - "add x17, x17, #0x8\n" - "usubl v31.8h, v31.8b, v21.8b\n" - "usubl v30.8h, v30.8b, v21.8b\n" - "usubl v29.8h, v29.8b, v21.8b\n" - "smlal v11.4s, v31.4h, v6.4h\n" - "smlal2 v26.4s, v31.8h, v6.8h\n" - "smlal v12.4s, v31.4h, v3.4h\n" - "smlal2 v22.4s, v31.8h, v3.8h\n" - "smlal v23.4s, v30.4h, v8.4h\n" - "smlal2 v9.4s, v30.8h, v8.8h\n" - "smlal v24.4s, v30.4h, v5.4h\n" - "smlal2 v10.4s, v30.8h, v5.8h\n" - "smlal v12.4s, v29.4h, v7.4h\n" - "smlal2 v22.4s, v29.8h, v7.8h\n" - "smlal v24.4s, v29.4h, v6.4h\n" - "smlal2 v10.4s, v29.8h, v6.8h\n" - "usubl v28.8h, v28.8b, v21.8b\n" - "sqrdmulh v11.4s, v11.4s, v25.4s\n" - "sqrdmulh v26.4s, v26.4s, v16.4s\n" - "smlal v12.4s, v28.4h, v8.4h\n" - "smlal2 v22.4s, v28.8h, v8.8h\n" - "smlal v24.4s, v28.4h, v7.4h\n" - "smlal2 v10.4s, v28.8h, v7.8h\n" - "and v19.16b, v11.16b, v18.16b\n" - "and v5.16b, v26.16b, v20.16b\n" - "sqrdmulh v23.4s, v23.4s, v25.4s\n" - "sshr v19.4s, v19.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sqrdmulh v9.4s, v9.4s, v16.4s\n" - "sqadd v11.4s, v11.4s, v19.4s\n" - "sqadd v26.4s, v26.4s, v5.4s\n" - "and v28.16b, v23.16b, v18.16b\n" - "and v8.16b, v9.16b, v20.16b\n" - "srshl v11.4s, v11.4s, v18.4s\n" - "srshl v26.4s, v26.4s, v20.4s\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "add v11.4s, v11.4s, v13.4s\n" - "add v26.4s, v26.4s, v13.4s\n" - "sqadd v23.4s, v23.4s, v28.4s\n" - "smin v11.4s, v11.4s, v14.4s\n" - "smin v26.4s, v26.4s, v14.4s\n" - "sqadd v9.4s, v9.4s, v8.4s\n" - "smax v11.4s, v11.4s, v15.4s\n" - "smax v26.4s, v26.4s, v15.4s\n" - "srshl v23.4s, v23.4s, v18.4s\n" - "srshl v9.4s, v9.4s, v20.4s\n" - "uzp1 v11.16b, v11.16b, v26.16b\n" - "sqrdmulh v12.4s, v12.4s, v25.4s\n" - "uzp1 v11.16b, v11.16b, v11.16b\n" - "str d11, [x10, x15]\n" - "add v23.4s, v23.4s, v13.4s\n" - "add v9.4s, v9.4s, v13.4s\n" - "and v1.16b, v12.16b, v18.16b\n" - "sqrdmulh v22.4s, v22.4s, v16.4s\n" - "smin v23.4s, v23.4s, v14.4s\n" - "smin v9.4s, v9.4s, v14.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "smax v23.4s, v23.4s, v15.4s\n" - "smax v9.4s, v9.4s, v15.4s\n" - "sqadd v12.4s, v12.4s, v1.4s\n" - "and v0.16b, v22.16b, v20.16b\n" - "uzp1 v23.16b, v23.16b, v9.16b\n" - "sqrdmulh v24.4s, v24.4s, v25.4s\n" - "uzp1 v23.16b, v23.16b, v23.16b\n" - "str d23, [x9, x15]\n" - "srshl v12.4s, v12.4s, v18.4s\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "and v26.16b, v24.16b, v18.16b\n" - "sqrdmulh v10.4s, v10.4s, v16.4s\n" - "sqadd v22.4s, v22.4s, v0.4s\n" - "add v12.4s, v12.4s, v13.4s\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "and v16.16b, v10.16b, v20.16b\n" - "smin v12.4s, v12.4s, v14.4s\n" - "srshl v22.4s, v22.4s, v20.4s\n" - "sqadd v24.4s, v24.4s, v26.4s\n" - "smax v12.4s, v12.4s, v15.4s\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "add v22.4s, v22.4s, v13.4s\n" - "srshl v24.4s, v24.4s, v18.4s\n" - "sqadd v10.4s, v10.4s, v16.4s\n" - "smin v22.4s, v22.4s, v14.4s\n" - "add v24.4s, v24.4s, v13.4s\n" - "smax v22.4s, v22.4s, v15.4s\n" - "srshl v10.4s, v10.4s, v20.4s\n" - "smin v24.4s, v24.4s, v14.4s\n" - "uzp1 v12.16b, v12.16b, v22.16b\n" - 
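The requantisation tail being rewritten here follows the standard fixed-point scheme: sqdmulh takes the high half of the doubling product with the per-channel multiplier, srshl applies a rounding arithmetic shift (the shift amount is stored negated, and the and/sshr/sqadd triplet is the rounding fix-up for negative values), sqxtn narrows with saturation to 16 bits, then the c_offset is added and the result clamped to [minval, maxval] before uzp1 packs the bytes for the store. A simplified scalar model of one lane:

    #include <algorithm>
    #include <cstdint>

    // Scalar model of sqdmulh + srshl + sqxtn + sqadd/smax/smin + uzp1.
    // `shift` is negative for a right shift, matching the stored values.
    static uint8_t requantize(int32_t acc, int32_t mul, int32_t shift,
                              int16_t c_offset, int16_t minval, int16_t maxval)
    {
        int32_t r = (int32_t)(((int64_t)acc * (int64_t)mul) >> 31);  // sqdmulh
        if (shift < 0)                                               // srshl (rounding)
        {
            const int s = -shift;
            r = (int32_t)(((int64_t)r + ((int64_t)1 << (s - 1))) >> s);
        }
        int32_t v = std::min<int32_t>(INT16_MAX, std::max<int32_t>(INT16_MIN, r)); // sqxtn
        v = std::min<int32_t>(INT16_MAX, v + c_offset);  // sqadd (saturating add)
        v = std::max<int32_t>(v, minval);                // smax
        v = std::min<int32_t>(v, maxval);                // smin
        return (uint8_t)v;                               // uzp1 + str
    }
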
"add v10.4s, v10.4s, v13.4s\n" - "uzp1 v12.16b, v12.16b, v12.16b\n" - "str d12, [x28, x15]\n" - "smax v24.4s, v24.4s, v15.4s\n" - "smin v10.4s, v10.4s, v14.4s\n" - "smax v10.4s, v10.4s, v15.4s\n" - "uzp1 v24.16b, v24.16b, v10.16b\n" - "uzp1 v24.16b, v24.16b, v24.16b\n" - "str d24, [x27, x15]\n" + "smlal2 v24.4s, v29.8h, v0.8h\n" + "ldr d29, [x20, x15]\n" + "smlal2 v23.4s, v28.8h, v2.8h\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v3.4h\n" + "smlal v9.4s, v30.4h, v5.4h\n" + "smlal v19.4s, v28.4h, v5.4h\n" + "smlal2 v11.4s, v28.8h, v5.8h\n" + "ldr d28, [x19, x15]\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal2 v24.4s, v31.8h, v3.8h\n" + "smlal2 v23.4s, v30.8h, v5.8h\n" "add x15, x15, #0x8\n" + "smlal v18.4s, v29.4h, v7.4h\n" + "smlal v9.4s, v29.4h, v6.4h\n" + "smlal2 v24.4s, v29.8h, v7.8h\n" + "smlal2 v23.4s, v29.8h, v6.8h\n" + "smlal v13.4s, v31.4h, v6.4h\n" + "smlal v19.4s, v30.4h, v8.4h\n" + "sqdmulh v13.4s, v13.4s, v21.4s\n" + "smlal v18.4s, v28.4h, v8.4h\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "sqdmulh v19.4s, v19.4s, v21.4s\n" + "smlal2 v26.4s, v31.8h, v6.8h\n" + "smlal2 v11.4s, v30.8h, v8.8h\n" + "sqdmulh v18.4s, v18.4s, v21.4s\n" + "smlal2 v24.4s, v28.8h, v8.8h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" + "sqdmulh v9.4s, v9.4s, v21.4s\n" + "and v7.16b, v13.16b, v25.16b\n" + "sqdmulh v26.4s, v26.4s, v10.4s\n" + "and v4.16b, v19.16b, v25.16b\n" + "sqdmulh v11.4s, v11.4s, v10.4s\n" + "and v21.16b, v18.16b, v25.16b\n" + "sqdmulh v24.4s, v24.4s, v10.4s\n" + "and v20.16b, v9.16b, v25.16b\n" + "sqdmulh v23.4s, v23.4s, v10.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v29.16b, v26.16b, v16.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v10.16b, v11.16b, v16.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v31.16b, v24.16b, v16.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v30.16b, v23.16b, v16.16b\n" + "sqadd v13.4s, v13.4s, v7.4s\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v20.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v25.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "srshl v19.4s, v19.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v10.4s\n" + "srshl v18.4s, v18.4s, v25.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "srshl v9.4s, v9.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "srshl v26.4s, v26.4s, v16.4s\n" + "sqxtn v13.4h, v13.4s\n" + "srshl v11.4s, v11.4s, v16.4s\n" + "sqxtn v19.4h, v19.4s\n" + "srshl v24.4s, v24.4s, v16.4s\n" + "sqxtn v18.4h, v18.4s\n" + "srshl v23.4s, v23.4s, v16.4s\n" + "sqxtn v9.4h, v9.4s\n" + "sqxtn2 v13.8h, v26.4s\n" + "sqxtn2 v19.8h, v11.4s\n" + "sqxtn2 v18.8h, v24.4s\n" + "sqxtn2 v9.8h, v23.4s\n" + "sqadd v13.8h, v13.8h, v14.8h\n" + "sqadd v19.8h, v19.8h, v14.8h\n" + "sqadd v18.8h, v18.8h, v14.8h\n" + "sqadd v9.8h, v9.8h, v14.8h\n" + "smax v13.8h, v13.8h, v17.8h\n" + "smax v19.8h, v19.8h, v17.8h\n" + "smax v18.8h, v18.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smin v13.8h, v13.8h, v15.8h\n" + "smin v19.8h, v19.8h, v15.8h\n" + "smin v18.8h, v18.8h, v15.8h\n" + "smin v9.8h, v9.8h, v15.8h\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "str d13, [x10, x14]\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "str d19, [x9, x14]\n" + "str d18, [x28, x14]\n" + "str d9, [x27, x14]\n" + "add x14, x14, #0x8\n" "beq 64f\n" - "add x16, x16, #0x48\n" + "add x17, x17, #0x48\n" "3:" // Oddments "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" "tbz 
x8, #2, 5f\n" - "ld1 { v11.4s }, [x19], #0x10\n" + "ld1 { v13.4s }, [x19], #0x10\n" "tbz x8, #1, 4f\n" "ld1 { v26.d }[0], [x19], #0x8\n" "tbz x8, #0, 7f\n" @@ -601,46 +585,46 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "b 7f\n" "5:" // Oddments: Load bias: Bit 2: Unset "tbz x8, #1, 6f\n" - "ld1 { v11.d }[0], [x19], #0x8\n" + "ld1 { v13.d }[0], [x19], #0x8\n" "tbz x8, #0, 7f\n" - "ld1 { v11.s }[2], [x19]\n" + "ld1 { v13.s }[2], [x19]\n" "b 7f\n" "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset "tbz x8, #0, 7f\n" - "ld1 { v11.s }[0], [x19]\n" + "ld1 { v13.s }[0], [x19]\n" "7:" // Oddments: Load bias: Bit 2: End - "mov v23.16b, v11.16b\n" - "ldr d0, [x16, #0x0]\n" - "mov v9.16b, v26.16b\n" - "ldr d1, [x16, #0x8]\n" - "mov v12.16b, v11.16b\n" - "ldr d2, [x16, #0x10]\n" - "mov v22.16b, v26.16b\n" - "ldr d3, [x16, #0x18]\n" - "mov v24.16b, v11.16b\n" - "ldr d4, [x16, #0x20]\n" - "mov v10.16b, v26.16b\n" - "ldr d5, [x16, #0x28]\n" - "ssubl v0.8h, v0.8b, v17.8b\n" - "ldr d6, [x16, #0x30]\n" - "ssubl v1.8h, v1.8b, v17.8b\n" - "ldr d7, [x16, #0x38]\n" - "ssubl v2.8h, v2.8b, v17.8b\n" - "ldr d8, [x16, #0x40]\n" - "ssubl v3.8h, v3.8b, v17.8b\n" - "ldp x23, x22, [x14, #0x0]\n" - "add x23, x23, x17\n" - "ssubl v4.8h, v4.8b, v17.8b\n" - "ldp x21, x20, [x14, #0x10]\n" - "ssubl v5.8h, v5.8b, v17.8b\n" - "ldr x19, [x14, #0x20]\n" - "ssubl v6.8h, v6.8b, v17.8b\n" - "add x22, x22, x17\n" - "ssubl v7.8h, v7.8b, v17.8b\n" - "add x21, x21, x17\n" - "ssubl v8.8h, v8.8b, v17.8b\n" - "add x20, x20, x17\n" - "add x19, x19, x17\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "mov v19.16b, v13.16b\n" + "mov v11.16b, v26.16b\n" + "ldr d2, [x17, #0x10]\n" + "ldr d3, [x17, #0x18]\n" + "mov v18.16b, v13.16b\n" + "mov v24.16b, v26.16b\n" + "ldr d4, [x17, #0x20]\n" + "ldr d5, [x17, #0x28]\n" + "mov v9.16b, v13.16b\n" + "mov v23.16b, v26.16b\n" + "ldr d6, [x17, #0x30]\n" + "ldr d7, [x17, #0x38]\n" + "ssubl v0.8h, v0.8b, v12.8b\n" + "ssubl v1.8h, v1.8b, v12.8b\n" + "ldr d8, [x17, #0x40]\n" + "ldp x23, x22, [x12, #0x0]\n" + "ssubl v2.8h, v2.8b, v12.8b\n" + "ssubl v3.8h, v3.8b, v12.8b\n" + "ldp x21, x20, [x12, #0x10]\n" + "ldr x19, [x12, #0x20]\n" + "ssubl v4.8h, v4.8b, v12.8b\n" + "ssubl v5.8h, v5.8b, v12.8b\n" + "ssubl v6.8h, v6.8b, v12.8b\n" + "ssubl v7.8h, v7.8b, v12.8b\n" + "ssubl v8.8h, v8.8b, v12.8b\n" + "add x23, x23, x15\n" + "add x22, x22, x15\n" + "add x21, x21, x15\n" + "add x20, x20, x15\n" + "add x19, x19, x15\n" "tbz x8, #2, 9f\n" "ld1 { v31.s }[0], [x23], #0x4\n" "ld1 { v30.s }[0], [x22], #0x4\n" @@ -690,33 +674,33 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "ld1 { v28.b }[0], [x20]\n" "ld1 { v27.b }[0], [x19]\n" "11:" // Oddments: Initial loads: Bit 2: End - "usubl v31.8h, v31.8b, v21.8b\n" - "ldr x21, [x14, #0x28]\n" - "add x21, x21, x17\n" - "usubl v30.8h, v30.8b, v21.8b\n" - "usubl v29.8h, v29.8b, v21.8b\n" - "usubl v28.8h, v28.8b, v21.8b\n" - "usubl v27.8h, v27.8b, v21.8b\n" - "smlal v11.4s, v31.4h, v4.4h\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v13.4s, v31.4h, v4.4h\n" "smlal2 v26.4s, v31.8h, v4.8h\n" - "smlal v23.4s, v31.4h, v3.4h\n" - "smlal2 v9.4s, v31.8h, v3.8h\n" - "smlal v12.4s, v31.4h, v1.4h\n" - "smlal2 v22.4s, v31.8h, v1.8h\n" - "smlal v24.4s, v31.4h, v0.4h\n" - "smlal2 v10.4s, v31.8h, v0.8h\n" - "smlal v11.4s, v30.4h, v0.4h\n" + "ldr x21, [x12, #0x28]\n" + "smlal v19.4s, v31.4h, v3.4h\n" + "smlal2 v11.4s, v31.8h, v3.8h\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "add x21, x21, x15\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "smlal v18.4s, 
v31.4h, v1.4h\n" + "smlal2 v24.4s, v31.8h, v1.8h\n" + "smlal v9.4s, v31.4h, v0.4h\n" + "smlal2 v23.4s, v31.8h, v0.8h\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal v13.4s, v30.4h, v0.4h\n" "smlal2 v26.4s, v30.8h, v0.8h\n" - "smlal v23.4s, v29.4h, v2.4h\n" - "smlal2 v9.4s, v29.8h, v2.8h\n" - "smlal v11.4s, v28.4h, v5.4h\n" + "usubl v27.8h, v27.8b, v22.8b\n" + "smlal v19.4s, v29.4h, v2.4h\n" + "smlal2 v11.4s, v29.8h, v2.8h\n" + "smlal v13.4s, v28.4h, v5.4h\n" "smlal2 v26.4s, v28.8h, v5.8h\n" - "smlal v23.4s, v28.4h, v4.4h\n" - "smlal2 v9.4s, v28.8h, v4.8h\n" - "smlal v12.4s, v28.4h, v2.4h\n" - "smlal2 v22.4s, v28.8h, v2.8h\n" - "smlal v24.4s, v28.4h, v1.4h\n" - "smlal2 v10.4s, v28.8h, v1.8h\n" + "smlal v19.4s, v28.4h, v4.4h\n" + "smlal2 v11.4s, v28.8h, v4.8h\n" + "smlal v18.4s, v28.4h, v2.4h\n" + "smlal2 v24.4s, v28.8h, v2.8h\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v23.4s, v28.8h, v1.8h\n" "tbz x8, #2, 13f\n" "ld1 { v31.s }[0], [x21], #0x4\n" "tbz x8, #1, 12f\n" @@ -738,19 +722,19 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 15f\n" "ld1 { v31.b }[0], [x21]\n" "15:" // Oddments: Load (3, 0): Bit 2: End - "usubl v31.8h, v31.8b, v21.8b\n" - "ldr x20, [x14, #0x30]\n" - "smlal v11.4s, v27.4h, v7.4h\n" - "add x20, x20, x17\n" - "smlal v12.4s, v31.4h, v6.4h\n" - "smlal2 v22.4s, v31.8h, v6.8h\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "smlal v18.4s, v31.4h, v6.4h\n" + "smlal2 v24.4s, v31.8h, v6.8h\n" + "ldr x20, [x12, #0x30]\n" + "smlal v13.4s, v27.4h, v7.4h\n" "smlal2 v26.4s, v27.8h, v7.8h\n" - "smlal v23.4s, v27.4h, v6.4h\n" - "smlal2 v9.4s, v27.8h, v6.8h\n" - "smlal v12.4s, v27.4h, v4.4h\n" - "smlal2 v22.4s, v27.8h, v4.8h\n" - "smlal v24.4s, v27.4h, v3.4h\n" - "smlal2 v10.4s, v27.8h, v3.8h\n" + "add x20, x20, x15\n" + "smlal v19.4s, v27.4h, v6.4h\n" + "smlal2 v11.4s, v27.8h, v6.8h\n" + "smlal v18.4s, v27.4h, v4.4h\n" + "smlal2 v24.4s, v27.8h, v4.8h\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v23.4s, v27.8h, v3.8h\n" "tbz x8, #2, 17f\n" "ld1 { v29.s }[0], [x20], #0x4\n" "tbz x8, #1, 16f\n" @@ -772,11 +756,11 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 19f\n" "ld1 { v29.b }[0], [x20]\n" "19:" // Oddments: Load (3, 3): Bit 2: End - "usubl v29.8h, v29.8b, v21.8b\n" - "ldr x26, [x14, #0x38]\n" - "smlal v24.4s, v29.4h, v8.4h\n" - "add x26, x26, x17\n" - "smlal2 v10.4s, v29.8h, v8.8h\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr x26, [x12, #0x38]\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v23.4s, v29.8h, v8.8h\n" + "add x26, x26, x15\n" "tbz x8, #2, 21f\n" "ld1 { v28.s }[0], [x26], #0x4\n" "tbz x8, #1, 20f\n" @@ -798,13 +782,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 23f\n" "ld1 { v28.b }[0], [x26]\n" "23:" // Oddments: Load (0, 1): Bit 2: End - "usubl v28.8h, v28.8b, v21.8b\n" - "ldr x25, [x14, #0x40]\n" - "smlal v11.4s, v28.4h, v1.4h\n" - "add x25, x25, x17\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "ldr x25, [x12, #0x40]\n" + "smlal v13.4s, v28.4h, v1.4h\n" "smlal2 v26.4s, v28.8h, v1.8h\n" - "smlal v23.4s, v28.4h, v0.4h\n" - "smlal2 v9.4s, v28.8h, v0.8h\n" + "smlal v19.4s, v28.4h, v0.4h\n" + "smlal2 v11.4s, v28.8h, v0.8h\n" + "add x25, x25, x15\n" "tbz x8, #2, 25f\n" "ld1 { v31.s }[0], [x25], #0x4\n" "tbz x8, #1, 24f\n" @@ -826,13 +810,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 27f\n" "ld1 { v31.b }[0], [x25]\n" "27:" // Oddments: Load (0, 2): Bit 2: End - "usubl v31.8h, v31.8b, v21.8b\n" - "ldr x19, [x14, #0x48]\n" - "smlal v11.4s, v31.4h, v2.4h\n" - "add x19, 
x19, x17\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "ldr x19, [x12, #0x48]\n" + "smlal v13.4s, v31.4h, v2.4h\n" "smlal2 v26.4s, v31.8h, v2.8h\n" - "smlal v23.4s, v31.4h, v1.4h\n" - "smlal2 v9.4s, v31.8h, v1.8h\n" + "smlal v19.4s, v31.4h, v1.4h\n" + "smlal2 v11.4s, v31.8h, v1.8h\n" + "add x19, x19, x15\n" "tbz x8, #2, 29f\n" "ld1 { v30.s }[0], [x19], #0x4\n" "tbz x8, #1, 28f\n" @@ -854,17 +838,17 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 31f\n" "ld1 { v30.b }[0], [x19]\n" "31:" // Oddments: Load (2, 2): Bit 2: End - "usubl v30.8h, v30.8b, v21.8b\n" - "ldr x24, [x14, #0x50]\n" - "smlal v11.4s, v30.4h, v8.4h\n" - "add x24, x24, x17\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "ldr x24, [x12, #0x50]\n" + "smlal v13.4s, v30.4h, v8.4h\n" "smlal2 v26.4s, v30.8h, v8.8h\n" - "smlal v23.4s, v30.4h, v7.4h\n" - "smlal2 v9.4s, v30.8h, v7.8h\n" - "smlal v12.4s, v30.4h, v5.4h\n" - "smlal2 v22.4s, v30.8h, v5.8h\n" - "smlal v24.4s, v30.4h, v4.4h\n" - "smlal2 v10.4s, v30.8h, v4.8h\n" + "smlal v19.4s, v30.4h, v7.4h\n" + "smlal2 v11.4s, v30.8h, v7.8h\n" + "add x24, x24, x15\n" + "smlal v18.4s, v30.4h, v5.4h\n" + "smlal2 v24.4s, v30.8h, v5.8h\n" + "smlal v9.4s, v30.4h, v4.4h\n" + "smlal2 v23.4s, v30.8h, v4.8h\n" "tbz x8, #2, 33f\n" "ld1 { v29.s }[0], [x24], #0x4\n" "tbz x8, #1, 32f\n" @@ -886,13 +870,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 35f\n" "ld1 { v29.b }[0], [x24]\n" "35:" // Oddments: Load (1, 0): Bit 2: End - "usubl v29.8h, v29.8b, v21.8b\n" - "ldr x23, [x14, #0x58]\n" - "smlal v11.4s, v29.4h, v3.4h\n" - "add x23, x23, x17\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr x23, [x12, #0x58]\n" + "smlal v13.4s, v29.4h, v3.4h\n" "smlal2 v26.4s, v29.8h, v3.8h\n" - "smlal v12.4s, v29.4h, v0.4h\n" - "smlal2 v22.4s, v29.8h, v0.8h\n" + "smlal v18.4s, v29.4h, v0.4h\n" + "smlal2 v24.4s, v29.8h, v0.8h\n" + "add x23, x23, x15\n" "tbz x8, #2, 37f\n" "ld1 { v28.s }[0], [x23], #0x4\n" "tbz x8, #1, 36f\n" @@ -914,13 +898,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 39f\n" "ld1 { v28.b }[0], [x23]\n" "39:" // Oddments: Load (1, 3): Bit 2: End - "usubl v28.8h, v28.8b, v21.8b\n" - "ldr x22, [x14, #0x60]\n" - "smlal v23.4s, v28.4h, v5.4h\n" - "add x22, x22, x17\n" - "smlal2 v9.4s, v28.8h, v5.8h\n" - "smlal v24.4s, v28.4h, v2.4h\n" - "smlal2 v10.4s, v28.8h, v2.8h\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "ldr x22, [x12, #0x60]\n" + "smlal v19.4s, v28.4h, v5.4h\n" + "smlal2 v11.4s, v28.8h, v5.8h\n" + "smlal v9.4s, v28.4h, v2.4h\n" + "smlal2 v23.4s, v28.8h, v2.8h\n" + "add x22, x22, x15\n" "tbz x8, #2, 41f\n" "ld1 { v31.s }[0], [x22], #0x4\n" "tbz x8, #1, 40f\n" @@ -942,13 +926,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 43f\n" "ld1 { v31.b }[0], [x22]\n" "43:" // Oddments: Load (2, 0): Bit 2: End - "usubl v31.8h, v31.8b, v21.8b\n" - "ldr x21, [x14, #0x68]\n" - "smlal v11.4s, v31.4h, v6.4h\n" - "add x21, x21, x17\n" + "usubl v31.8h, v31.8b, v22.8b\n" + "ldr x21, [x12, #0x68]\n" + "smlal v13.4s, v31.4h, v6.4h\n" "smlal2 v26.4s, v31.8h, v6.8h\n" - "smlal v12.4s, v31.4h, v3.4h\n" - "smlal2 v22.4s, v31.8h, v3.8h\n" + "smlal v18.4s, v31.4h, v3.4h\n" + "smlal2 v24.4s, v31.8h, v3.8h\n" + "add x21, x21, x15\n" "tbz x8, #2, 45f\n" "ld1 { v30.s }[0], [x21], #0x4\n" "tbz x8, #1, 44f\n" @@ -970,13 +954,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 47f\n" "ld1 { v30.b }[0], [x21]\n" "47:" // Oddments: Load (2, 3): Bit 2: End - "usubl v30.8h, v30.8b, v21.8b\n" - "ldr x20, [x14, 
#0x70]\n" - "smlal v23.4s, v30.4h, v8.4h\n" - "add x20, x20, x17\n" - "smlal2 v9.4s, v30.8h, v8.8h\n" - "smlal v24.4s, v30.4h, v5.4h\n" - "smlal2 v10.4s, v30.8h, v5.8h\n" + "usubl v30.8h, v30.8b, v22.8b\n" + "ldr x20, [x12, #0x70]\n" + "smlal v19.4s, v30.4h, v8.4h\n" + "smlal2 v11.4s, v30.8h, v8.8h\n" + "smlal v9.4s, v30.4h, v5.4h\n" + "smlal2 v23.4s, v30.8h, v5.8h\n" + "add x20, x20, x15\n" "tbz x8, #2, 49f\n" "ld1 { v29.s }[0], [x20], #0x4\n" "tbz x8, #1, 48f\n" @@ -998,13 +982,13 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 51f\n" "ld1 { v29.b }[0], [x20]\n" "51:" // Oddments: Load (3, 1): Bit 2: End - "usubl v29.8h, v29.8b, v21.8b\n" - "ldr x19, [x14, #0x78]\n" - "smlal v12.4s, v29.4h, v7.4h\n" - "add x19, x19, x17\n" - "smlal2 v22.4s, v29.8h, v7.8h\n" - "smlal v24.4s, v29.4h, v6.4h\n" - "smlal2 v10.4s, v29.8h, v6.8h\n" + "usubl v29.8h, v29.8b, v22.8b\n" + "ldr x19, [x12, #0x78]\n" + "smlal v18.4s, v29.4h, v7.4h\n" + "smlal2 v24.4s, v29.8h, v7.8h\n" + "smlal v9.4s, v29.4h, v6.4h\n" + "smlal2 v23.4s, v29.8h, v6.8h\n" + "add x19, x19, x15\n" "tbz x8, #2, 53f\n" "ld1 { v28.s }[0], [x19], #0x4\n" "tbz x8, #1, 52f\n" @@ -1026,160 +1010,150 @@ void a64_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( "tbz x8, #0, 55f\n" "ld1 { v28.b }[0], [x19]\n" "55:" // Oddments: Load (3, 2): Bit 2: End - "usubl v28.8h, v28.8b, v21.8b\n" - "smlal v12.4s, v28.4h, v8.4h\n" - "smlal2 v22.4s, v28.8h, v8.8h\n" - "smlal v24.4s, v28.4h, v7.4h\n" - "smlal2 v10.4s, v28.8h, v7.8h\n" + "usubl v28.8h, v28.8b, v22.8b\n" + "smlal v18.4s, v28.4h, v8.4h\n" + "smlal2 v24.4s, v28.8h, v8.8h\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "smlal2 v23.4s, v28.8h, v7.8h\n" "tbz x8, #2, 57f\n" - "ld1 { v25.4s }, [x13], #0x10\n" - "ld1 { v18.4s }, [x11], #0x10\n" + "ld1 { v21.4s }, [x13], #0x10\n" + "ld1 { v25.4s }, [x11], #0x10\n" "tbz x8, #1, 56f\n" - "ld1 { v16.d }[0], [x13], #0x8\n" - "ld1 { v20.d }[0], [x11], #0x8\n" + "ld1 { v10.d }[0], [x13], #0x8\n" + "ld1 { v16.d }[0], [x11], #0x8\n" "tbz x8, #0, 59f\n" - "ld1 { v16.s }[2], [x13]\n" - "ld1 { v20.s }[2], [x11]\n" + "ld1 { v10.s }[2], [x13]\n" + "ld1 { v16.s }[2], [x11]\n" "b 59f\n" "56:" // Oddments: Load requant params: Bit 2: Bit 1: Unset "tbz x8, #0, 59f\n" - "ld1 { v16.s }[0], [x13]\n" - "ld1 { v20.s }[0], [x11]\n" + "ld1 { v10.s }[0], [x13]\n" + "ld1 { v16.s }[0], [x11]\n" "b 59f\n" "57:" // Oddments: Load requant params: Bit 2: Unset "tbz x8, #1, 58f\n" - "ld1 { v25.d }[0], [x13], #0x8\n" - "ld1 { v18.d }[0], [x11], #0x8\n" + "ld1 { v21.d }[0], [x13], #0x8\n" + "ld1 { v25.d }[0], [x11], #0x8\n" "tbz x8, #0, 59f\n" - "ld1 { v25.s }[2], [x13]\n" - "ld1 { v18.s }[2], [x11]\n" + "ld1 { v21.s }[2], [x13]\n" + "ld1 { v25.s }[2], [x11]\n" "b 59f\n" "58:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset "tbz x8, #0, 59f\n" - "ld1 { v25.s }[0], [x13]\n" - "ld1 { v18.s }[0], [x11]\n" + "ld1 { v21.s }[0], [x13]\n" + "ld1 { v25.s }[0], [x11]\n" "59:" // Oddments: Load requant params: Bit 2: End - "sqrdmulh v11.4s, v11.4s, v25.4s\n" - "add x10, x10, x15\n" - "sqrdmulh v26.4s, v26.4s, v16.4s\n" - "add x9, x9, x15\n" - "sqrdmulh v23.4s, v23.4s, v25.4s\n" - "add x28, x28, x15\n" - "sqrdmulh v9.4s, v9.4s, v16.4s\n" - "add x27, x27, x15\n" - "sqrdmulh v12.4s, v12.4s, v25.4s\n" - "and v19.16b, v11.16b, v18.16b\n" - "and v5.16b, v26.16b, v20.16b\n" - "and v28.16b, v23.16b, v18.16b\n" - "sshr v19.4s, v19.4s, #0x1f\n" - "sshr v5.4s, v5.4s, #0x1f\n" - "sshr v28.4s, v28.4s, #0x1f\n" - "sqadd v11.4s, v11.4s, v19.4s\n" - "sqadd v26.4s, v26.4s, v5.4s\n" - 
"sqadd v23.4s, v23.4s, v28.4s\n" - "and v8.16b, v9.16b, v20.16b\n" - "srshl v11.4s, v11.4s, v18.4s\n" - "srshl v26.4s, v26.4s, v20.4s\n" - "srshl v23.4s, v23.4s, v18.4s\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "add v11.4s, v11.4s, v13.4s\n" - "add v26.4s, v26.4s, v13.4s\n" - "add v23.4s, v23.4s, v13.4s\n" - "smin v11.4s, v11.4s, v14.4s\n" - "smin v26.4s, v26.4s, v14.4s\n" - "smin v23.4s, v23.4s, v14.4s\n" - "smax v11.4s, v11.4s, v15.4s\n" - "smax v26.4s, v26.4s, v15.4s\n" - "smax v23.4s, v23.4s, v15.4s\n" - "sqadd v9.4s, v9.4s, v8.4s\n" - "uzp1 v11.16b, v11.16b, v26.16b\n" - "and v1.16b, v12.16b, v18.16b\n" - "uzp1 v11.16b, v11.16b, v11.16b\n" - "srshl v9.4s, v9.4s, v20.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "sqrdmulh v22.4s, v22.4s, v16.4s\n" - "sqrdmulh v24.4s, v24.4s, v25.4s\n" - "add v9.4s, v9.4s, v13.4s\n" - "sqadd v12.4s, v12.4s, v1.4s\n" - "and v0.16b, v22.16b, v20.16b\n" - "smin v9.4s, v9.4s, v14.4s\n" - "and v26.16b, v24.16b, v18.16b\n" - "srshl v12.4s, v12.4s, v18.4s\n" - "smax v9.4s, v9.4s, v15.4s\n" - "sshr v0.4s, v0.4s, #0x1f\n" - "sshr v26.4s, v26.4s, #0x1f\n" - "uzp1 v23.16b, v23.16b, v9.16b\n" - "add v12.4s, v12.4s, v13.4s\n" - "uzp1 v23.16b, v23.16b, v23.16b\n" - "sqadd v22.4s, v22.4s, v0.4s\n" - "smin v12.4s, v12.4s, v14.4s\n" - "sqadd v24.4s, v24.4s, v26.4s\n" - "sqrdmulh v10.4s, v10.4s, v16.4s\n" - "smax v12.4s, v12.4s, v15.4s\n" - "srshl v22.4s, v22.4s, v20.4s\n" - "srshl v24.4s, v24.4s, v18.4s\n" - "and v16.16b, v10.16b, v20.16b\n" - "add v22.4s, v22.4s, v13.4s\n" - "add v24.4s, v24.4s, v13.4s\n" - "sshr v16.4s, v16.4s, #0x1f\n" - "smin v22.4s, v22.4s, v14.4s\n" - "smin v24.4s, v24.4s, v14.4s\n" - "sqadd v10.4s, v10.4s, v16.4s\n" - "smax v22.4s, v22.4s, v15.4s\n" - "smax v24.4s, v24.4s, v15.4s\n" - "srshl v10.4s, v10.4s, v20.4s\n" - "uzp1 v12.16b, v12.16b, v22.16b\n" - "uzp1 v12.16b, v12.16b, v12.16b\n" - "add v10.4s, v10.4s, v13.4s\n" - "smin v10.4s, v10.4s, v14.4s\n" - "smax v10.4s, v10.4s, v15.4s\n" - "uzp1 v24.16b, v24.16b, v10.16b\n" - "uzp1 v24.16b, v24.16b, v24.16b\n" + "sqdmulh v13.4s, v13.4s, v21.4s\n" + "sqdmulh v19.4s, v19.4s, v21.4s\n" + "add x10, x10, x14\n" + "add x9, x9, x14\n" + "sqdmulh v18.4s, v18.4s, v21.4s\n" + "sqdmulh v9.4s, v9.4s, v21.4s\n" + "add x28, x28, x14\n" + "add x27, x27, x14\n" + "and v7.16b, v13.16b, v25.16b\n" + "sqdmulh v26.4s, v26.4s, v10.4s\n" + "and v4.16b, v19.16b, v25.16b\n" + "sqdmulh v11.4s, v11.4s, v10.4s\n" + "and v21.16b, v18.16b, v25.16b\n" + "sqdmulh v24.4s, v24.4s, v10.4s\n" + "and v20.16b, v9.16b, v25.16b\n" + "sqdmulh v23.4s, v23.4s, v10.4s\n" + "sshr v7.4s, v7.4s, #0x1f\n" + "and v29.16b, v26.16b, v16.16b\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "and v10.16b, v11.16b, v16.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v31.16b, v24.16b, v16.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v30.16b, v23.16b, v16.16b\n" + "sqadd v13.4s, v13.4s, v7.4s\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "sqadd v19.4s, v19.4s, v4.4s\n" + "sshr v10.4s, v10.4s, #0x1f\n" + "sqadd v18.4s, v18.4s, v21.4s\n" + "sshr v31.4s, v31.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v20.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "srshl v13.4s, v13.4s, v25.4s\n" + "sqadd v26.4s, v26.4s, v29.4s\n" + "srshl v19.4s, v19.4s, v25.4s\n" + "sqadd v11.4s, v11.4s, v10.4s\n" + "srshl v18.4s, v18.4s, v25.4s\n" + "sqadd v24.4s, v24.4s, v31.4s\n" + "srshl v9.4s, v9.4s, v25.4s\n" + "sqadd v23.4s, v23.4s, v30.4s\n" + "srshl v26.4s, v26.4s, v16.4s\n" + "sqxtn v13.4h, v13.4s\n" + "srshl v11.4s, v11.4s, v16.4s\n" + "sqxtn v19.4h, v19.4s\n" + "srshl v24.4s, v24.4s, v16.4s\n" + "sqxtn v18.4h, 
v18.4s\n" + "srshl v23.4s, v23.4s, v16.4s\n" + "sqxtn v9.4h, v9.4s\n" + "sqxtn2 v13.8h, v26.4s\n" + "sqxtn2 v19.8h, v11.4s\n" + "sqxtn2 v18.8h, v24.4s\n" + "sqxtn2 v9.8h, v23.4s\n" + "sqadd v13.8h, v13.8h, v14.8h\n" + "sqadd v19.8h, v19.8h, v14.8h\n" + "sqadd v18.8h, v18.8h, v14.8h\n" + "sqadd v9.8h, v9.8h, v14.8h\n" + "smax v13.8h, v13.8h, v17.8h\n" + "smax v19.8h, v19.8h, v17.8h\n" + "smax v18.8h, v18.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smin v13.8h, v13.8h, v15.8h\n" + "smin v19.8h, v19.8h, v15.8h\n" + "smin v18.8h, v18.8h, v15.8h\n" + "smin v9.8h, v9.8h, v15.8h\n" + "uzp1 v13.16b, v13.16b, v13.16b\n" + "uzp1 v19.16b, v19.16b, v19.16b\n" + "uzp1 v18.16b, v18.16b, v18.16b\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" "tbz x8, #2, 61f\n" - "st1 { v11.s }[0], [x10], #0x4\n" - "st1 { v23.s }[0], [x9], #0x4\n" - "st1 { v12.s }[0], [x28], #0x4\n" - "st1 { v24.s }[0], [x27], #0x4\n" + "st1 { v13.s }[0], [x10], #0x4\n" + "st1 { v19.s }[0], [x9], #0x4\n" + "st1 { v18.s }[0], [x28], #0x4\n" + "st1 { v9.s }[0], [x27], #0x4\n" "tbz x8, #1, 60f\n" - "st1 { v11.h }[2], [x10], #0x2\n" - "st1 { v23.h }[2], [x9], #0x2\n" - "st1 { v12.h }[2], [x28], #0x2\n" - "st1 { v24.h }[2], [x27], #0x2\n" + "st1 { v13.h }[2], [x10], #0x2\n" + "st1 { v19.h }[2], [x9], #0x2\n" + "st1 { v18.h }[2], [x28], #0x2\n" + "st1 { v9.h }[2], [x27], #0x2\n" "tbz x8, #0, 63f\n" - "st1 { v11.b }[6], [x10], #0x1\n" - "st1 { v23.b }[6], [x9], #0x1\n" - "st1 { v12.b }[6], [x28], #0x1\n" - "st1 { v24.b }[6], [x27], #0x1\n" + "st1 { v13.b }[6], [x10], #0x1\n" + "st1 { v19.b }[6], [x9], #0x1\n" + "st1 { v18.b }[6], [x28], #0x1\n" + "st1 { v9.b }[6], [x27], #0x1\n" "b 63f\n" "60:" // Oddments: Bit 2: Bit 1: Unset "tbz x8, #0, 63f\n" - "st1 { v11.b }[4], [x10], #0x1\n" - "st1 { v23.b }[4], [x9], #0x1\n" - "st1 { v12.b }[4], [x28], #0x1\n" - "st1 { v24.b }[4], [x27], #0x1\n" + "st1 { v13.b }[4], [x10], #0x1\n" + "st1 { v19.b }[4], [x9], #0x1\n" + "st1 { v18.b }[4], [x28], #0x1\n" + "st1 { v9.b }[4], [x27], #0x1\n" "b 63f\n" "61:" // Oddments: Bit 2: Unset "tbz x8, #1, 62f\n" - "st1 { v11.h }[0], [x10], #0x2\n" - "st1 { v23.h }[0], [x9], #0x2\n" - "st1 { v12.h }[0], [x28], #0x2\n" - "st1 { v24.h }[0], [x27], #0x2\n" + "st1 { v13.h }[0], [x10], #0x2\n" + "st1 { v19.h }[0], [x9], #0x2\n" + "st1 { v18.h }[0], [x28], #0x2\n" + "st1 { v9.h }[0], [x27], #0x2\n" "tbz x8, #0, 63f\n" - "st1 { v11.b }[2], [x10], #0x1\n" - "st1 { v23.b }[2], [x9], #0x1\n" - "st1 { v12.b }[2], [x28], #0x1\n" - "st1 { v24.b }[2], [x27], #0x1\n" + "st1 { v13.b }[2], [x10], #0x1\n" + "st1 { v19.b }[2], [x9], #0x1\n" + "st1 { v18.b }[2], [x28], #0x1\n" + "st1 { v9.b }[2], [x27], #0x1\n" "b 63f\n" "62:" // Oddments: Bit 2: Unset: Bit 1: Unset "tbz x8, #0, 63f\n" - "st1 { v11.b }[0], [x10], #0x1\n" - "st1 { v23.b }[0], [x9], #0x1\n" - "st1 { v12.b }[0], [x28], #0x1\n" - "st1 { v24.b }[0], [x27], #0x1\n" + "st1 { v13.b }[0], [x10], #0x1\n" + "st1 { v19.b }[0], [x9], #0x1\n" + "st1 { v18.b }[0], [x28], #0x1\n" + "st1 { v9.b }[0], [x27], #0x1\n" "63:" // Oddments: Bit 2: End - "64:" // End - : : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, 
weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp index 77861e94f0..9a1b64ed7b 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -36,37 +36,24 @@ namespace depthwise { void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); -struct a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst +class a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef int8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 2; constexpr static unsigned int stride_cols = 2; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 5; - - constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_3x3_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_3x3_mla::get_packed_size; + a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {} - kern_type kernel = a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } - a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + unsigned int get_accumulator_depth_vl(void) const override {
return 2; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp index 4e1586b033..790d26bc6d 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -46,7 +46,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( struct Params { long unsigned int n_channels; - const int8_t *weights; + const void *weights; const int32_t *bias; const arm_gemm::Requantize32 *requant; const int32_t *const requant_muls; @@ -57,7 +57,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( Params( long unsigned int n_channels, const uint8_t *const *inptrs_raw, - const int8_t *const weights, + const void *const weights, const int32_t *const bias, const arm_gemm::Requantize32 &qp, const int32_t *const requant_muls, @@ -100,611 +100,595 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( requant_muls, requant_shifts, outptrs); __asm__ __volatile__( - "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n" - "mov x4, #0x0\n" - "ldr x5, [%x[params], %[offsetof_Params_weights]]\n" - "mov x6, #0x0\n" - "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" - "add x7, %x[params], %[offsetof_Params_inptrs]\n" - "ldr x8, [%x[params], %[offsetof_Params_requant_muls]]\n" - "lsr x17, x3, #0x3\n" - "ldr x16, [%x[params], %[offsetof_Params_requant_shifts]]\n" - "add x19, x22, %[offsetof_Requantize32_a_offset]\n" - "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" - "add x20, x22, %[offsetof_Requantize32_b_offset]\n" - "ld1r { v22.16b }, [x19]\n" - "add x19, x22, %[offsetof_Requantize32_c_offset]\n" - "ld1r { v12.16b }, [x20]\n" - "add x20, x22, %[offsetof_Requantize32_minval]\n" - "ld1r { v14.4s }, [x19]\n" - "add x19, x22, %[offsetof_Requantize32_maxval]\n" - "ld1r { v16.4s }, [x20]\n" - "ld1r { v15.4s }, [x19]\n" - "ldp x15, x14, [x21, #0x0]\n" - "ldp x13, x12, [x21, #0x10]\n" - "cbz x17, 3f\n" - "subs x17, x17, #0x1\n" + "ldr x19, [%x[params], %[offsetof_Params_requant]]\n" + "ldr x8, [%x[params], %[offsetof_Params_n_channels]]\n" + "add x24, x19, %[offsetof_Requantize32_a_offset]\n" + "add x23, x19, %[offsetof_Requantize32_b_offset]\n" + "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x21, x19, %[offsetof_Requantize32_c_offset]\n" + "add x20, x19, %[offsetof_Requantize32_minval]\n" + "ldr x17, [%x[params], %[offsetof_Params_weights]]\n" + "add x19, x19, %[offsetof_Requantize32_maxval]\n" + "ld1r { v12.16b }, [x24]\n" + "ld1r { v13.16b }, [x23]\n" + "lsr x16, x8, #0x3\n" + "ld1r { v11.8h }, [x21]\n" + "ld1r { v17.8h }, [x20]\n" + "mov x15, #0x0\n" + "mov x14, #0x0\n" + "ld1r { v14.8h }, [x19]\n" + "ldr x13, [%x[params], %[offsetof_Params_requant_muls]]\n" + "add x12, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x11, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "ldp x10, x9, [x22, #0x0]\n" + "ldp x28, x27, [x22, #0x10]\n" + "cbz x16, 3f\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" - "ldr q13, [x19, #0x0]\n" - "mov v19.16b, v13.16b\n" + "ldr q15, [x19, #0x0]\n" + "subs x16, x16, #0x1\n" + "mov v9.16b, v15.16b\n" "ldr q10, [x19, #0x10]\n" "add 
x19, x19, #0x20\n" - "mov v11.16b, v13.16b\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" - "mov v18.16b, v13.16b\n" - "ldr d0, [x5, #0x0]\n" - "ldr d1, [x5, #0x8]\n" - "mov v20.16b, v10.16b\n" - "ldr d2, [x5, #0x10]\n" - "mov v17.16b, v10.16b\n" - "ldr d3, [x5, #0x18]\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v16.16b, v10.16b\n" + "mov v22.16b, v15.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" "mov v21.16b, v10.16b\n" - "ldr d4, [x5, #0x20]\n" - "ssubl v0.8h, v0.8b, v12.8b\n" - "ldr d5, [x5, #0x28]\n" - "ssubl v1.8h, v1.8b, v12.8b\n" - "ldr d6, [x5, #0x30]\n" - "ssubl v2.8h, v2.8b, v12.8b\n" - "ldr d7, [x5, #0x38]\n" - "ssubl v3.8h, v3.8b, v12.8b\n" - "ldr d8, [x5, #0x40]\n" - "ssubl v4.8h, v4.8b, v12.8b\n" - "ldp x26, x25, [x7, #0x0]\n" - "ssubl v5.8h, v5.8b, v12.8b\n" - "ldp x24, x23, [x7, #0x10]\n" - "ssubl v6.8h, v6.8b, v12.8b\n" - "ssubl v7.8h, v7.8b, v12.8b\n" - "ldp x22, x21, [x7, #0x20]\n" - "ssubl v8.8h, v8.8b, v12.8b\n" - "ldp x20, x19, [x7, #0x30]\n" - "ldr d31, [x26, x4]\n" - "usubl v31.8h, v31.8b, v22.8b\n" - "ldr d30, [x25, x4]\n" - "ldr d29, [x24, x4]\n" - "usubl v30.8h, v30.8b, v22.8b\n" - "ldr d28, [x23, x4]\n" - "usubl v29.8h, v29.8b, v22.8b\n" - "ldr d27, [x22, x4]\n" - "ldr d26, [x21, x4]\n" - "usubl v28.8h, v28.8b, v22.8b\n" - "ldr d25, [x20, x4]\n" - "ldr d24, [x19, x4]\n" - "usubl v27.8h, v27.8b, v22.8b\n" - "usubl v26.8h, v26.8b, v22.8b\n" - "usubl v25.8h, v25.8b, v22.8b\n" - "usubl v24.8h, v24.8b, v22.8b\n" + "mov v23.16b, v15.16b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v18.16b, v10.16b\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "ldp x26, x25, [x12, #0x0]\n" + "ldp x24, x23, [x12, #0x10]\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "ldp x22, x21, [x12, #0x20]\n" + "ldp x20, x19, [x12, #0x30]\n" + "ssubl v5.8h, v5.8b, v13.8b\n" + "ssubl v6.8h, v6.8b, v13.8b\n" + "ldr d31, [x26, x15]\n" + "ldr d30, [x25, x15]\n" + "ssubl v7.8h, v7.8b, v13.8b\n" + "ssubl v8.8h, v8.8b, v13.8b\n" + "ldr d29, [x24, x15]\n" + "ldr d28, [x23, x15]\n" + "usubl v31.8h, v31.8b, v12.8b\n" + "usubl v30.8h, v30.8b, v12.8b\n" + "ldr d27, [x22, x15]\n" + "ldr d26, [x21, x15]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "ldr d25, [x20, x15]\n" + "ldr d24, [x19, x15]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "usubl v24.8h, v24.8b, v12.8b\n" "beq 2f\n" "1:" // Loop - "smlal v13.4s, v31.4h, v8.4h\n" - "ldr x22, [x7, #0x40]\n" - "add x5, x5, #0x48\n" + "smlal v15.4s, v31.4h, v8.4h\n" "smlal2 v10.4s, v31.8h, v8.8h\n" - "ldr x21, [x7, #0x48]\n" - "subs x17, x17, #0x1\n" - "smlal v19.4s, v31.4h, v6.4h\n" - "ldr x20, [x7, #0x50]\n" - "smlal2 v20.4s, v31.8h, v6.8h\n" - "ldr x19, [x7, #0x58]\n" - "smlal v11.4s, v31.4h, v2.4h\n" - "ldr x11, [x7, #0x60]\n" - "smlal2 v17.4s, v31.8h, v2.8h\n" - "ldr x10, [x7, #0x68]\n" - "smlal v18.4s, v31.4h, v0.4h\n" - "ldr x9, [x7, #0x70]\n" - "smlal2 v21.4s, v31.8h, v0.8h\n" - "ldr x28, [x7, #0x78]\n" - "smlal v13.4s, v30.4h, v0.4h\n" - "ldr x27, [x7, #0x80]\n" + "ldr x24, [x12, #0x40]\n" + "ldr x23, [x12, #0x48]\n" + "smlal v9.4s, v31.4h, v6.4h\n" + "smlal2 v16.4s, v31.8h, v6.8h\n" + "ldr x21, [x12, #0x50]\n" + "ldr x19, [x12, #0x58]\n" + "smlal v15.4s, v30.4h, v0.4h\n" "smlal2 v10.4s, v30.8h, v0.8h\n" - "ldr x26, [x7, #0x88]\n" - "smlal v19.4s, 
v28.4h, v1.4h\n" - "ldr x25, [x7, #0x90]\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "ldr d28, [x21, x4]\n" - "smlal v13.4s, v29.4h, v1.4h\n" - "ldr x24, [x7, #0x98]\n" + "ldr x22, [x12, #0x78]\n" + "ldr x20, [x12, #0x60]\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "ldr d28, [x23, x15]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal v15.4s, v29.4h, v1.4h\n" "smlal2 v10.4s, v29.8h, v1.8h\n" - "ldr d29, [x22, x4]\n" - "smlal v19.4s, v27.4h, v2.4h\n" - "ldr x23, [x7, #0xa0]\n" - "smlal2 v20.4s, v27.8h, v2.8h\n" - "ldr d27, [x20, x4]\n" - "smlal v13.4s, v26.4h, v3.4h\n" - "ldr x22, [x7, #0xa8]\n" + "ldr d29, [x24, x15]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v9.4s, v27.4h, v2.4h\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x15]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v15.4s, v26.4h, v3.4h\n" "smlal2 v10.4s, v26.8h, v3.8h\n" - "ldr d26, [x19, x4]\n" - "smlal v19.4s, v24.4h, v0.4h\n" - "ldr x21, [x7, #0xb0]\n" - "smlal2 v20.4s, v24.8h, v0.8h\n" - "ldr x20, [x7, #0xb8]\n" - "smlal v13.4s, v25.4h, v4.4h\n" - "ldr x19, [x7, #0xc0]\n" + "ldr d26, [x19, x15]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v9.4s, v24.4h, v0.4h\n" + "smlal2 v16.4s, v24.8h, v0.8h\n" + "ldr x21, [x12, #0x80]\n" + "ldr x19, [x12, #0x68]\n" + "smlal v15.4s, v25.4h, v4.4h\n" "smlal2 v10.4s, v25.8h, v4.8h\n" - "ldr d25, [x11, x4]\n" - "usubl v29.8h, v29.8b, v22.8b\n" - "ldr q31, [x8, #0x0]\n" - "usubl v28.8h, v28.8b, v22.8b\n" - "ldr q30, [x16, #0x0]\n" - "smlal v13.4s, v24.4h, v2.4h\n" - "ldr q23, [x8, #0x10]\n" - "add x8, x8, #0x20\n" + "ldr d25, [x20, x15]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v9.4s, v29.4h, v4.4h\n" + "smlal2 v16.4s, v29.8h, v4.8h\n" + "ldr x20, [x12, #0x88]\n" + "ldr d29, [x19, x15]\n" + "smlal v15.4s, v24.4h, v2.4h\n" "smlal2 v10.4s, v24.8h, v2.8h\n" - "ldr d24, [x9, x4]\n" - "smlal v19.4s, v29.4h, v4.4h\n" - "ldr q9, [x16, #0x10]\n" - "add x16, x16, #0x20\n" - "smlal2 v20.4s, v29.8h, v4.8h\n" - "ldr d29, [x10, x4]\n" - "usubl v27.8h, v27.8b, v22.8b\n" - "usubl v26.8h, v26.8b, v22.8b\n" - "smlal v19.4s, v28.4h, v5.4h\n" - "smlal v13.4s, v27.4h, v5.4h\n" - "smlal2 v20.4s, v28.8h, v5.8h\n" - "ldr d28, [x27, x4]\n" + "ldr x19, [x12, #0x70]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v9.4s, v28.4h, v5.4h\n" + "smlal2 v16.4s, v28.8h, v5.8h\n" + "ldr d28, [x21, x15]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal v22.4s, v31.4h, v2.4h\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "ldr x24, [x12, #0x98]\n" + "ldr d24, [x19, x15]\n" + "smlal v15.4s, v27.4h, v5.4h\n" "smlal2 v10.4s, v27.8h, v5.8h\n" - "smlal v19.4s, v27.4h, v3.4h\n" - "smlal v11.4s, v26.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" - "ldr d27, [x28, x4]\n" - "smlal2 v17.4s, v26.8h, v3.8h\n" - "ldr d26, [x26, x4]\n" - "usubl v25.8h, v25.8b, v22.8b\n" - "usubl v29.8h, v29.8b, v22.8b\n" - "usubl v24.8h, v24.8b, v22.8b\n" - "smlal v13.4s, v25.4h, v6.4h\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "ldr x23, [x12, #0x90]\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "ldr d27, [x22, x15]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal v22.4s, v26.4h, v3.4h\n" + "ldr x22, [x12, #0xa8]\n" + "ldr x19, [x12, #0xa0]\n" + "smlal2 v21.4s, v26.8h, v3.8h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr d26, [x20, x15]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v23.4s, v27.4h, v4.4h\n" + "smlal v22.4s, v25.4h, v0.4h\n" + "ldr x21, [x12, #0xb0]\n" + "ldr x20, [x12, #0xb8]\n" + "smlal2 v21.4s, v25.8h, v0.8h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "ldr 
d27, [x19, x15]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v23.4s, v28.4h, v1.4h\n" + "smlal v15.4s, v25.4h, v6.4h\n" + "ldr x19, [x12, #0xc0]\n" + "ldr q19, [x13, #0x0]\n" "smlal2 v10.4s, v25.8h, v6.8h\n" - "smlal v11.4s, v25.4h, v0.4h\n" - "smlal2 v17.4s, v25.8h, v0.8h\n" - "ldr d25, [x25, x4]\n" - "smlal v13.4s, v24.4h, v7.4h\n" + "smlal v22.4s, v29.4h, v4.4h\n" + "ldr d25, [x23, x15]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal2 v21.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x15]\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v23.4s, v26.4h, v5.4h\n" + "smlal v15.4s, v24.4h, v7.4h\n" + "ldr q0, [x11, #0x0]\n" + "ldr q4, [x13, #0x10]\n" "smlal2 v10.4s, v24.8h, v7.8h\n" - "smlal v11.4s, v29.4h, v4.4h\n" - "smlal2 v17.4s, v29.8h, v4.8h\n" - "ldr d29, [x24, x4]\n" - "usubl v27.8h, v27.8b, v22.8b\n" - "usubl v28.8h, v28.8b, v22.8b\n" - "smlal v11.4s, v24.4h, v1.4h\n" - "smlal2 v17.4s, v24.8h, v1.8h\n" - "ldr d24, [x22, x4]\n" - "smlal v18.4s, v27.4h, v4.4h\n" - "smlal2 v21.4s, v27.8h, v4.8h\n" - "ldr d27, [x23, x4]\n" - "smlal v19.4s, v28.4h, v7.4h\n" - "smlal2 v20.4s, v28.8h, v7.8h\n" - "smlal v18.4s, v28.4h, v1.4h\n" - "smlal2 v21.4s, v28.8h, v1.8h\n" - "usubl v26.8h, v26.8b, v22.8b\n" - "usubl v25.8h, v25.8b, v22.8b\n" - "usubl v29.8h, v29.8b, v22.8b\n" - "smlal v18.4s, v26.4h, v5.4h\n" - "smlal2 v21.4s, v26.8h, v5.8h\n" - "ldr d26, [x21, x4]\n" - "smlal v11.4s, v25.4h, v6.4h\n" - "smlal2 v17.4s, v25.8h, v6.8h\n" - "ldr d25, [x20, x4]\n" - "smlal v19.4s, v29.4h, v8.4h\n" - "smlal2 v20.4s, v29.8h, v8.8h\n" - "smlal v18.4s, v29.4h, v2.4h\n" - "smlal2 v21.4s, v29.8h, v2.8h\n" - "ldr d29, [x19, x4]\n" - "add x4, x4, #0x8\n" - "usubl v27.8h, v27.8b, v22.8b\n" - "usubl v24.8h, v24.8b, v22.8b\n" - "usubl v26.8h, v26.8b, v22.8b\n" - "usubl v25.8h, v25.8b, v22.8b\n" - "smlal v11.4s, v27.4h, v7.4h\n" - "smlal2 v17.4s, v27.8h, v7.8h\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "smlal2 v21.4s, v24.8h, v3.8h\n" - "smlal v11.4s, v24.4h, v5.4h\n" - "smlal2 v17.4s, v24.8h, v5.8h\n" - "smlal v18.4s, v26.4h, v7.4h\n" - "smlal2 v21.4s, v26.8h, v7.8h\n" - "smlal v11.4s, v25.4h, v8.4h\n" - "smlal2 v17.4s, v25.8h, v8.8h\n" - "smlal v18.4s, v25.4h, v6.4h\n" + "smlal v22.4s, v24.4h, v1.4h\n" + "sqdmulh v15.4s, v15.4s, v19.4s\n" + "ldr q31, [x11, #0x10]\n" + "smlal2 v21.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x15]\n" + "smlal2 v18.4s, v26.8h, v5.8h\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "ldr d26, [x21, x15]\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v22.4s, v25.4h, v6.4h\n" + "smlal v23.4s, v24.4h, v3.4h\n" + "and v30.16b, v15.16b, v0.16b\n" + "add x17, x17, #0x48\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "smlal2 v16.4s, v28.8h, v7.8h\n" + "sqdmulh v10.4s, v10.4s, v4.4s\n" + "subs x16, x16, #0x1\n" "smlal2 v21.4s, v25.8h, v6.8h\n" - "usubl v29.8h, v29.8b, v22.8b\n" - "sqrdmulh v13.4s, v13.4s, v31.4s\n" - "sqrdmulh v10.4s, v10.4s, v23.4s\n" - "smlal v18.4s, v29.4h, v8.4h\n" - "smlal2 v21.4s, v29.8h, v8.8h\n" - "and v27.16b, v13.16b, v30.16b\n" - "and v7.16b, v10.16b, v9.16b\n" - "sqrdmulh v19.4s, v19.4s, v31.4s\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqrdmulh v20.4s, v20.4s, v23.4s\n" - "sqadd v13.4s, v13.4s, v27.4s\n" - "sqadd v10.4s, v10.4s, v7.4s\n" - "and v6.16b, v19.16b, v30.16b\n" - "and v3.16b, v20.16b, v9.16b\n" - "srshl v13.4s, v13.4s, v30.4s\n" - "srshl v10.4s, v10.4s, v9.4s\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "add v13.4s, v13.4s, v14.4s\n" - 
"add v10.4s, v10.4s, v14.4s\n" - "sqadd v19.4s, v19.4s, v6.4s\n" - "smin v13.4s, v13.4s, v15.4s\n" - "smin v10.4s, v10.4s, v15.4s\n" - "sqadd v20.4s, v20.4s, v3.4s\n" - "smax v13.4s, v13.4s, v16.4s\n" - "smax v10.4s, v10.4s, v16.4s\n" - "srshl v19.4s, v19.4s, v30.4s\n" - "srshl v20.4s, v20.4s, v9.4s\n" - "uzp1 v13.16b, v13.16b, v10.16b\n" - "sqrdmulh v11.4s, v11.4s, v31.4s\n" - "uzp1 v13.16b, v13.16b, v13.16b\n" - "str d13, [x15, x6]\n" - "add v19.4s, v19.4s, v14.4s\n" - "add v20.4s, v20.4s, v14.4s\n" - "and v28.16b, v11.16b, v30.16b\n" - "sqrdmulh v17.4s, v17.4s, v23.4s\n" - "smin v19.4s, v19.4s, v15.4s\n" - "smin v20.4s, v20.4s, v15.4s\n" + "ldr d25, [x20, x15]\n" + "smlal2 v18.4s, v24.8h, v3.8h\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v22.4s, v27.4h, v7.4h\n" + "smlal v23.4s, v26.4h, v7.4h\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "add x13, x13, #0x20\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v16.4s, v29.8h, v8.8h\n" + "ldr d29, [x19, x15]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal2 v21.4s, v27.8h, v7.8h\n" + "smlal2 v18.4s, v26.8h, v7.8h\n" + "sqdmulh v9.4s, v9.4s, v19.4s\n" + "add x15, x15, #0x8\n" + "smlal v22.4s, v24.4h, v5.4h\n" + "smlal v23.4s, v25.4h, v6.4h\n" + "and v28.16b, v9.16b, v0.16b\n" + "add x11, x11, #0x20\n" + "smlal2 v21.4s, v24.8h, v5.8h\n" + "smlal2 v18.4s, v25.8h, v6.8h\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "smlal v22.4s, v25.4h, v8.4h\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "sqdmulh v22.4s, v22.4s, v19.4s\n" + "smlal2 v21.4s, v25.8h, v8.8h\n" + "smlal2 v18.4s, v29.8h, v8.8h\n" + "sqdmulh v23.4s, v23.4s, v19.4s\n" + "and v29.16b, v22.16b, v0.16b\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "and v20.16b, v23.16b, v0.16b\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "and v19.16b, v10.16b, v31.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" - "smax v19.4s, v19.4s, v16.4s\n" - "smax v20.4s, v20.4s, v16.4s\n" - "sqadd v11.4s, v11.4s, v28.4s\n" - "and v26.16b, v17.16b, v9.16b\n" - "uzp1 v19.16b, v19.16b, v20.16b\n" - "sqrdmulh v18.4s, v18.4s, v31.4s\n" - "uzp1 v19.16b, v19.16b, v19.16b\n" - "str d19, [x14, x6]\n" - "srshl v11.4s, v11.4s, v30.4s\n" + "and v4.16b, v16.16b, v31.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v5.16b, v21.16b, v31.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v26.16b, v18.16b, v31.16b\n" + "sqadd v15.4s, v15.4s, v30.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v28.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v20.4s\n" "sshr v26.4s, v26.4s, #0x1f\n" - "and v8.16b, v18.16b, v30.16b\n" - "sqrdmulh v21.4s, v21.4s, v23.4s\n" - "sqadd v17.4s, v17.4s, v26.4s\n" - "add v11.4s, v11.4s, v14.4s\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "and v27.16b, v21.16b, v9.16b\n" - "smin v11.4s, v11.4s, v15.4s\n" - "srshl v17.4s, v17.4s, v9.4s\n" - "sqadd v18.4s, v18.4s, v8.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "add v17.4s, v17.4s, v14.4s\n" - "srshl v18.4s, v18.4s, v30.4s\n" - "sqadd v21.4s, v21.4s, v27.4s\n" - "smin v17.4s, v17.4s, v15.4s\n" - "add v18.4s, v18.4s, v14.4s\n" - "smax v17.4s, v17.4s, v16.4s\n" - "srshl v21.4s, v21.4s, v9.4s\n" - "smin v18.4s, v18.4s, v15.4s\n" - "uzp1 v11.16b, v11.16b, v17.16b\n" - "add v21.4s, v21.4s, v14.4s\n" - "uzp1 v11.16b, v11.16b, v11.16b\n" - "str d11, [x13, x6]\n" - "smax v18.4s, v18.4s, v16.4s\n" - "smin v21.4s, v21.4s, v15.4s\n" - "smax v21.4s, v21.4s, v16.4s\n" - "uzp1 v18.16b, v18.16b, v21.16b\n" - "uzp1 v18.16b, v18.16b, v18.16b\n" - "str d18, [x12, x6]\n" - "add x6, x6, #0x8\n" + "srshl v15.4s, 
v15.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v19.4s\n" + "srshl v9.4s, v9.4s, v0.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "sqadd v18.4s, v18.4s, v26.4s\n" + "srshl v10.4s, v10.4s, v31.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v16.4s, v16.4s, v31.4s\n" + "sqxtn v9.4h, v9.4s\n" + "srshl v21.4s, v21.4s, v31.4s\n" + "sqxtn v22.4h, v22.4s\n" + "srshl v18.4s, v18.4s, v31.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v15.8h, v10.4s\n" + "sqxtn2 v9.8h, v16.4s\n" + "sqxtn2 v22.8h, v21.4s\n" + "sqxtn2 v23.8h, v18.4s\n" + "sqadd v15.8h, v15.8h, v11.8h\n" + "sqadd v9.8h, v9.8h, v11.8h\n" + "sqadd v22.8h, v22.8h, v11.8h\n" + "sqadd v23.8h, v23.8h, v11.8h\n" + "smax v15.8h, v15.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smax v22.8h, v22.8h, v17.8h\n" + "smax v23.8h, v23.8h, v17.8h\n" + "smin v15.8h, v15.8h, v14.8h\n" + "smin v9.8h, v9.8h, v14.8h\n" + "smin v22.8h, v22.8h, v14.8h\n" + "smin v23.8h, v23.8h, v14.8h\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x10, x14]\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str d9, [x9, x14]\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d22, [x28, x14]\n" + "str d23, [x27, x14]\n" "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" - "ldr q13, [x19, #0x0]\n" - "mov v19.16b, v13.16b\n" + "ldr q15, [x19, #0x0]\n" + "add x14, x14, #0x8\n" "ldr q10, [x19, #0x10]\n" "add x19, x19, #0x20\n" - "mov v11.16b, v13.16b\n" "str x19, [%x[params], %[offsetof_Params_bias]]\n" - "mov v18.16b, v13.16b\n" - "ldr d0, [x5, #0x0]\n" - "ldr d1, [x5, #0x8]\n" - "mov v20.16b, v10.16b\n" - "ldr d2, [x5, #0x10]\n" - "mov v17.16b, v10.16b\n" - "ldr d3, [x5, #0x18]\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "ldr d2, [x17, #0x10]\n" + "mov v9.16b, v15.16b\n" + "mov v16.16b, v10.16b\n" + "ldr d3, [x17, #0x18]\n" + "ldr d4, [x17, #0x20]\n" + "mov v22.16b, v15.16b\n" "mov v21.16b, v10.16b\n" - "ldr d4, [x5, #0x20]\n" - "ssubl v0.8h, v0.8b, v12.8b\n" - "ldr d5, [x5, #0x28]\n" - "ssubl v1.8h, v1.8b, v12.8b\n" - "ldr d6, [x5, #0x30]\n" - "ssubl v2.8h, v2.8b, v12.8b\n" - "ldr d7, [x5, #0x38]\n" - "ssubl v3.8h, v3.8b, v12.8b\n" - "ldr d8, [x5, #0x40]\n" - "ssubl v4.8h, v4.8b, v12.8b\n" - "ldp x26, x25, [x7, #0x0]\n" - "ssubl v5.8h, v5.8b, v12.8b\n" - "ldp x24, x23, [x7, #0x10]\n" - "ssubl v6.8h, v6.8b, v12.8b\n" - "ssubl v7.8h, v7.8b, v12.8b\n" - "ldp x22, x21, [x7, #0x20]\n" - "ssubl v8.8h, v8.8b, v12.8b\n" - "ldp x20, x19, [x7, #0x30]\n" - "ldr d31, [x26, x4]\n" - "usubl v31.8h, v31.8b, v22.8b\n" - "ldr d30, [x25, x4]\n" - "ldr d29, [x24, x4]\n" - "usubl v30.8h, v30.8b, v22.8b\n" - "ldr d28, [x23, x4]\n" - "usubl v29.8h, v29.8b, v22.8b\n" - "ldr d27, [x22, x4]\n" - "ldr d26, [x21, x4]\n" - "usubl v28.8h, v28.8b, v22.8b\n" - "ldr d25, [x20, x4]\n" - "ldr d24, [x19, x4]\n" - "usubl v27.8h, v27.8b, v22.8b\n" - "usubl v26.8h, v26.8b, v22.8b\n" - "usubl v25.8h, v25.8b, v22.8b\n" - "usubl v24.8h, v24.8b, v22.8b\n" + "ldr d5, [x17, #0x28]\n" + "ldr d6, [x17, #0x30]\n" + "mov v23.16b, v15.16b\n" + "mov v18.16b, v10.16b\n" + "ldr d7, [x17, #0x38]\n" + "ldr d8, [x17, #0x40]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "ldp x26, x25, [x12, #0x0]\n" + "ldp x24, x23, [x12, #0x10]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "ldp x22, x21, [x12, #0x20]\n" + "ldp x20, x19, [x12, #0x30]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "ssubl v5.8h, v5.8b, v13.8b\n" + "ldr d31, [x26, x15]\n" + "ldr d30, [x25, x15]\n" + 
"ssubl v6.8h, v6.8b, v13.8b\n" + "ssubl v7.8h, v7.8b, v13.8b\n" + "ldr d29, [x24, x15]\n" + "ldr d28, [x23, x15]\n" + "ssubl v8.8h, v8.8b, v13.8b\n" + "usubl v31.8h, v31.8b, v12.8b\n" + "ldr d27, [x22, x15]\n" + "ldr d26, [x21, x15]\n" + "usubl v30.8h, v30.8b, v12.8b\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "ldr d25, [x20, x15]\n" + "ldr d24, [x19, x15]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "usubl v24.8h, v24.8b, v12.8b\n" "bgt 1b\n" "2:" // Tail - "smlal v13.4s, v31.4h, v8.4h\n" - "ldr x22, [x7, #0x40]\n" - "tst x3, #0x7\n" + "smlal v15.4s, v31.4h, v8.4h\n" "smlal2 v10.4s, v31.8h, v8.8h\n" - "ldr x21, [x7, #0x48]\n" - "smlal v19.4s, v31.4h, v6.4h\n" - "ldr x20, [x7, #0x50]\n" - "smlal2 v20.4s, v31.8h, v6.8h\n" - "ldr x19, [x7, #0x58]\n" - "smlal v11.4s, v31.4h, v2.4h\n" - "ldr x11, [x7, #0x60]\n" - "smlal2 v17.4s, v31.8h, v2.8h\n" - "ldr x10, [x7, #0x68]\n" - "smlal v18.4s, v31.4h, v0.4h\n" - "ldr x9, [x7, #0x70]\n" - "smlal2 v21.4s, v31.8h, v0.8h\n" - "ldr x28, [x7, #0x78]\n" - "smlal v13.4s, v30.4h, v0.4h\n" - "ldr x27, [x7, #0x80]\n" + "ldr x24, [x12, #0x40]\n" + "ldr x23, [x12, #0x48]\n" + "smlal v9.4s, v31.4h, v6.4h\n" + "smlal2 v16.4s, v31.8h, v6.8h\n" + "ldr x21, [x12, #0x50]\n" + "ldr x19, [x12, #0x58]\n" + "smlal v15.4s, v30.4h, v0.4h\n" "smlal2 v10.4s, v30.8h, v0.8h\n" - "ldr x26, [x7, #0x88]\n" - "smlal v19.4s, v28.4h, v1.4h\n" - "ldr x25, [x7, #0x90]\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "ldr d28, [x21, x4]\n" - "smlal v13.4s, v29.4h, v1.4h\n" - "ldr x24, [x7, #0x98]\n" + "ldr x22, [x12, #0x78]\n" + "ldr x20, [x12, #0x60]\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "ldr d28, [x23, x15]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal v15.4s, v29.4h, v1.4h\n" "smlal2 v10.4s, v29.8h, v1.8h\n" - "ldr d29, [x22, x4]\n" - "smlal v19.4s, v27.4h, v2.4h\n" - "ldr x23, [x7, #0xa0]\n" - "smlal2 v20.4s, v27.8h, v2.8h\n" - "ldr d27, [x20, x4]\n" - "smlal v13.4s, v26.4h, v3.4h\n" - "ldr x22, [x7, #0xa8]\n" + "ldr d29, [x24, x15]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v9.4s, v27.4h, v2.4h\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x15]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v15.4s, v26.4h, v3.4h\n" "smlal2 v10.4s, v26.8h, v3.8h\n" - "ldr d26, [x19, x4]\n" - "smlal v19.4s, v24.4h, v0.4h\n" - "ldr x21, [x7, #0xb0]\n" - "smlal2 v20.4s, v24.8h, v0.8h\n" - "ldr x20, [x7, #0xb8]\n" - "smlal v13.4s, v25.4h, v4.4h\n" - "ldr x19, [x7, #0xc0]\n" + "ldr d26, [x19, x15]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v9.4s, v24.4h, v0.4h\n" + "smlal2 v16.4s, v24.8h, v0.8h\n" + "ldr x21, [x12, #0x80]\n" + "ldr x19, [x12, #0x68]\n" + "smlal v15.4s, v25.4h, v4.4h\n" "smlal2 v10.4s, v25.8h, v4.8h\n" - "ldr d25, [x11, x4]\n" - "usubl v29.8h, v29.8b, v22.8b\n" - "ldr q31, [x8, #0x0]\n" - "usubl v28.8h, v28.8b, v22.8b\n" - "ldr q30, [x16, #0x0]\n" - "smlal v13.4s, v24.4h, v2.4h\n" - "ldr q23, [x8, #0x10]\n" - "add x8, x8, #0x20\n" + "ldr d25, [x20, x15]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v9.4s, v29.4h, v4.4h\n" + "smlal2 v16.4s, v29.8h, v4.8h\n" + "ldr x20, [x12, #0x88]\n" + "ldr d29, [x19, x15]\n" + "smlal v15.4s, v24.4h, v2.4h\n" "smlal2 v10.4s, v24.8h, v2.8h\n" - "ldr d24, [x9, x4]\n" - "smlal v19.4s, v29.4h, v4.4h\n" - "ldr q9, [x16, #0x10]\n" - "add x16, x16, #0x20\n" - "smlal2 v20.4s, v29.8h, v4.8h\n" - "ldr d29, [x10, x4]\n" - "usubl v27.8h, v27.8b, v22.8b\n" - "usubl v26.8h, v26.8b, v22.8b\n" - "smlal v19.4s, v28.4h, v5.4h\n" - 
"smlal v13.4s, v27.4h, v5.4h\n" - "smlal2 v20.4s, v28.8h, v5.8h\n" - "ldr d28, [x27, x4]\n" + "ldr x19, [x12, #0x70]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v9.4s, v28.4h, v5.4h\n" + "smlal2 v16.4s, v28.8h, v5.8h\n" + "ldr d28, [x21, x15]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "smlal v22.4s, v31.4h, v2.4h\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "ldr x24, [x12, #0x98]\n" + "ldr d24, [x19, x15]\n" + "smlal v15.4s, v27.4h, v5.4h\n" "smlal2 v10.4s, v27.8h, v5.8h\n" - "smlal v19.4s, v27.4h, v3.4h\n" - "smlal v11.4s, v26.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" - "ldr d27, [x28, x4]\n" - "smlal2 v17.4s, v26.8h, v3.8h\n" - "ldr d26, [x26, x4]\n" - "usubl v25.8h, v25.8b, v22.8b\n" - "usubl v29.8h, v29.8b, v22.8b\n" - "usubl v24.8h, v24.8b, v22.8b\n" - "smlal v13.4s, v25.4h, v6.4h\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "ldr x23, [x12, #0x90]\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "ldr d27, [x22, x15]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal v22.4s, v26.4h, v3.4h\n" + "ldr x22, [x12, #0xa8]\n" + "ldr x19, [x12, #0xa0]\n" + "smlal2 v21.4s, v26.8h, v3.8h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "ldr d26, [x20, x15]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v23.4s, v27.4h, v4.4h\n" + "smlal v22.4s, v25.4h, v0.4h\n" + "ldr x21, [x12, #0xb0]\n" + "ldr x20, [x12, #0xb8]\n" + "smlal2 v21.4s, v25.8h, v0.8h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "ldr d27, [x19, x15]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "smlal v23.4s, v28.4h, v1.4h\n" + "smlal v15.4s, v25.4h, v6.4h\n" + "ldr x19, [x12, #0xc0]\n" + "ldr q19, [x13, #0x0]\n" "smlal2 v10.4s, v25.8h, v6.8h\n" - "smlal v11.4s, v25.4h, v0.4h\n" - "smlal2 v17.4s, v25.8h, v0.8h\n" - "ldr d25, [x25, x4]\n" - "smlal v13.4s, v24.4h, v7.4h\n" + "smlal v22.4s, v29.4h, v4.4h\n" + "ldr d25, [x23, x15]\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal2 v21.4s, v29.8h, v4.8h\n" + "ldr d29, [x24, x15]\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v23.4s, v26.4h, v5.4h\n" + "smlal v15.4s, v24.4h, v7.4h\n" + "ldr q0, [x11, #0x0]\n" + "ldr q4, [x13, #0x10]\n" "smlal2 v10.4s, v24.8h, v7.8h\n" - "smlal v11.4s, v29.4h, v4.4h\n" - "smlal2 v17.4s, v29.8h, v4.8h\n" - "ldr d29, [x24, x4]\n" - "usubl v27.8h, v27.8b, v22.8b\n" - "usubl v28.8h, v28.8b, v22.8b\n" - "smlal v11.4s, v24.4h, v1.4h\n" - "smlal2 v17.4s, v24.8h, v1.8h\n" - "ldr d24, [x22, x4]\n" - "smlal v18.4s, v27.4h, v4.4h\n" - "smlal2 v21.4s, v27.8h, v4.8h\n" - "ldr d27, [x23, x4]\n" - "smlal v19.4s, v28.4h, v7.4h\n" - "smlal2 v20.4s, v28.8h, v7.8h\n" - "smlal v18.4s, v28.4h, v1.4h\n" - "smlal2 v21.4s, v28.8h, v1.8h\n" - "usubl v26.8h, v26.8b, v22.8b\n" - "usubl v25.8h, v25.8b, v22.8b\n" - "usubl v29.8h, v29.8b, v22.8b\n" - "smlal v18.4s, v26.4h, v5.4h\n" - "smlal2 v21.4s, v26.8h, v5.8h\n" - "ldr d26, [x21, x4]\n" - "smlal v11.4s, v25.4h, v6.4h\n" - "smlal2 v17.4s, v25.8h, v6.8h\n" - "ldr d25, [x20, x4]\n" - "smlal v19.4s, v29.4h, v8.4h\n" - "smlal2 v20.4s, v29.8h, v8.8h\n" - "smlal v18.4s, v29.4h, v2.4h\n" - "smlal2 v21.4s, v29.8h, v2.8h\n" - "ldr d29, [x19, x4]\n" - "add x4, x4, #0x8\n" - "usubl v27.8h, v27.8b, v22.8b\n" - "usubl v24.8h, v24.8b, v22.8b\n" - "usubl v26.8h, v26.8b, v22.8b\n" - "usubl v25.8h, v25.8b, v22.8b\n" - "smlal v11.4s, v27.4h, v7.4h\n" - "smlal2 v17.4s, v27.8h, v7.8h\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "smlal2 v21.4s, v24.8h, v3.8h\n" - "smlal v11.4s, v24.4h, v5.4h\n" - "smlal2 v17.4s, v24.8h, v5.8h\n" - "smlal v18.4s, v26.4h, v7.4h\n" - "smlal2 v21.4s, v26.8h, v7.8h\n" - 
"smlal v11.4s, v25.4h, v8.4h\n" - "smlal2 v17.4s, v25.8h, v8.8h\n" - "smlal v18.4s, v25.4h, v6.4h\n" + "smlal v22.4s, v24.4h, v1.4h\n" + "sqdmulh v15.4s, v15.4s, v19.4s\n" + "ldr q31, [x11, #0x10]\n" + "smlal2 v21.4s, v24.8h, v1.8h\n" + "ldr d24, [x22, x15]\n" + "smlal2 v18.4s, v26.8h, v5.8h\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "ldr d26, [x21, x15]\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v22.4s, v25.4h, v6.4h\n" + "smlal v23.4s, v24.4h, v3.4h\n" + "and v30.16b, v15.16b, v0.16b\n" + "tst x8, #0x7\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "smlal2 v16.4s, v28.8h, v7.8h\n" + "sqdmulh v10.4s, v10.4s, v4.4s\n" + "add x13, x13, #0x20\n" "smlal2 v21.4s, v25.8h, v6.8h\n" - "usubl v29.8h, v29.8b, v22.8b\n" - "sqrdmulh v13.4s, v13.4s, v31.4s\n" - "sqrdmulh v10.4s, v10.4s, v23.4s\n" - "smlal v18.4s, v29.4h, v8.4h\n" - "smlal2 v21.4s, v29.8h, v8.8h\n" - "and v27.16b, v13.16b, v30.16b\n" - "and v7.16b, v10.16b, v9.16b\n" - "sqrdmulh v19.4s, v19.4s, v31.4s\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sqrdmulh v20.4s, v20.4s, v23.4s\n" - "sqadd v13.4s, v13.4s, v27.4s\n" - "sqadd v10.4s, v10.4s, v7.4s\n" - "and v6.16b, v19.16b, v30.16b\n" - "and v3.16b, v20.16b, v9.16b\n" - "srshl v13.4s, v13.4s, v30.4s\n" - "srshl v10.4s, v10.4s, v9.4s\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "add v13.4s, v13.4s, v14.4s\n" - "add v10.4s, v10.4s, v14.4s\n" - "sqadd v19.4s, v19.4s, v6.4s\n" - "smin v13.4s, v13.4s, v15.4s\n" - "smin v10.4s, v10.4s, v15.4s\n" - "sqadd v20.4s, v20.4s, v3.4s\n" - "smax v13.4s, v13.4s, v16.4s\n" - "smax v10.4s, v10.4s, v16.4s\n" - "srshl v19.4s, v19.4s, v30.4s\n" - "srshl v20.4s, v20.4s, v9.4s\n" - "uzp1 v13.16b, v13.16b, v10.16b\n" - "sqrdmulh v11.4s, v11.4s, v31.4s\n" - "uzp1 v13.16b, v13.16b, v13.16b\n" - "str d13, [x15, x6]\n" - "add v19.4s, v19.4s, v14.4s\n" - "add v20.4s, v20.4s, v14.4s\n" - "and v28.16b, v11.16b, v30.16b\n" - "sqrdmulh v17.4s, v17.4s, v23.4s\n" - "smin v19.4s, v19.4s, v15.4s\n" - "smin v20.4s, v20.4s, v15.4s\n" + "ldr d25, [x20, x15]\n" + "smlal2 v18.4s, v24.8h, v3.8h\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v22.4s, v27.4h, v7.4h\n" + "smlal v23.4s, v26.4h, v7.4h\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "add x11, x11, #0x20\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v16.4s, v29.8h, v8.8h\n" + "ldr d29, [x19, x15]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal2 v21.4s, v27.8h, v7.8h\n" + "smlal2 v18.4s, v26.8h, v7.8h\n" + "sqdmulh v9.4s, v9.4s, v19.4s\n" + "add x15, x15, #0x8\n" + "smlal v22.4s, v24.4h, v5.4h\n" + "smlal v23.4s, v25.4h, v6.4h\n" + "and v28.16b, v9.16b, v0.16b\n" + "smlal2 v21.4s, v24.8h, v5.8h\n" + "smlal2 v18.4s, v25.8h, v6.8h\n" + "sqdmulh v16.4s, v16.4s, v4.4s\n" + "smlal v22.4s, v25.4h, v8.4h\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "sqdmulh v22.4s, v22.4s, v19.4s\n" + "smlal2 v21.4s, v25.8h, v8.8h\n" + "smlal2 v18.4s, v29.8h, v8.8h\n" + "sqdmulh v23.4s, v23.4s, v19.4s\n" + "and v29.16b, v22.16b, v0.16b\n" + "sqdmulh v21.4s, v21.4s, v4.4s\n" + "and v20.16b, v23.16b, v0.16b\n" + "sqdmulh v18.4s, v18.4s, v4.4s\n" + "and v19.16b, v10.16b, v31.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" - "smax v19.4s, v19.4s, v16.4s\n" - "smax v20.4s, v20.4s, v16.4s\n" - "sqadd v11.4s, v11.4s, v28.4s\n" - "and v26.16b, v17.16b, v9.16b\n" - "uzp1 v19.16b, v19.16b, v20.16b\n" - "sqrdmulh v18.4s, v18.4s, v31.4s\n" - "uzp1 v19.16b, v19.16b, v19.16b\n" - "str d19, [x14, x6]\n" - "srshl v11.4s, v11.4s, v30.4s\n" + "and v4.16b, v16.16b, v31.16b\n" + "sshr 
v29.4s, v29.4s, #0x1f\n" + "and v5.16b, v21.16b, v31.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v26.16b, v18.16b, v31.16b\n" + "sqadd v15.4s, v15.4s, v30.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v28.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v20.4s\n" "sshr v26.4s, v26.4s, #0x1f\n" - "and v8.16b, v18.16b, v30.16b\n" - "sqrdmulh v21.4s, v21.4s, v23.4s\n" - "sqadd v17.4s, v17.4s, v26.4s\n" - "add v11.4s, v11.4s, v14.4s\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "and v27.16b, v21.16b, v9.16b\n" - "smin v11.4s, v11.4s, v15.4s\n" - "srshl v17.4s, v17.4s, v9.4s\n" - "sqadd v18.4s, v18.4s, v8.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "add v17.4s, v17.4s, v14.4s\n" - "srshl v18.4s, v18.4s, v30.4s\n" - "sqadd v21.4s, v21.4s, v27.4s\n" - "smin v17.4s, v17.4s, v15.4s\n" - "add v18.4s, v18.4s, v14.4s\n" - "smax v17.4s, v17.4s, v16.4s\n" - "srshl v21.4s, v21.4s, v9.4s\n" - "smin v18.4s, v18.4s, v15.4s\n" - "uzp1 v11.16b, v11.16b, v17.16b\n" - "add v21.4s, v21.4s, v14.4s\n" - "uzp1 v11.16b, v11.16b, v11.16b\n" - "str d11, [x13, x6]\n" - "smax v18.4s, v18.4s, v16.4s\n" - "smin v21.4s, v21.4s, v15.4s\n" - "smax v21.4s, v21.4s, v16.4s\n" - "uzp1 v18.16b, v18.16b, v21.16b\n" - "uzp1 v18.16b, v18.16b, v18.16b\n" - "str d18, [x12, x6]\n" - "add x6, x6, #0x8\n" + "srshl v15.4s, v15.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v19.4s\n" + "srshl v9.4s, v9.4s, v0.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "sqadd v18.4s, v18.4s, v26.4s\n" + "srshl v10.4s, v10.4s, v31.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v16.4s, v16.4s, v31.4s\n" + "sqxtn v9.4h, v9.4s\n" + "srshl v21.4s, v21.4s, v31.4s\n" + "sqxtn v22.4h, v22.4s\n" + "srshl v18.4s, v18.4s, v31.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v15.8h, v10.4s\n" + "sqxtn2 v9.8h, v16.4s\n" + "sqxtn2 v22.8h, v21.4s\n" + "sqxtn2 v23.8h, v18.4s\n" + "sqadd v15.8h, v15.8h, v11.8h\n" + "sqadd v9.8h, v9.8h, v11.8h\n" + "sqadd v22.8h, v22.8h, v11.8h\n" + "sqadd v23.8h, v23.8h, v11.8h\n" + "smax v15.8h, v15.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smax v22.8h, v22.8h, v17.8h\n" + "smax v23.8h, v23.8h, v17.8h\n" + "smin v15.8h, v15.8h, v14.8h\n" + "smin v9.8h, v9.8h, v14.8h\n" + "smin v22.8h, v22.8h, v14.8h\n" + "smin v23.8h, v23.8h, v14.8h\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "str d15, [x10, x14]\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "str d9, [x9, x14]\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "str d22, [x28, x14]\n" + "str d23, [x27, x14]\n" + "add x14, x14, #0x8\n" "beq 88f\n" - "add x5, x5, #0x48\n" + "add x17, x17, #0x48\n" "3:" // Oddments "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" - "tbz x3, #2, 5f\n" - "ld1 { v13.4s }, [x19], #0x10\n" - "tbz x3, #1, 4f\n" + "tbz x8, #2, 5f\n" + "ld1 { v15.4s }, [x19], #0x10\n" + "tbz x8, #1, 4f\n" "ld1 { v10.d }[0], [x19], #0x8\n" - "tbz x3, #0, 7f\n" + "tbz x8, #0, 7f\n" "ld1 { v10.s }[2], [x19]\n" "b 7f\n" "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset - "tbz x3, #0, 7f\n" + "tbz x8, #0, 7f\n" "ld1 { v10.s }[0], [x19]\n" "b 7f\n" "5:" // Oddments: Load bias: Bit 2: Unset - "tbz x3, #1, 6f\n" - "ld1 { v13.d }[0], [x19], #0x8\n" - "tbz x3, #0, 7f\n" - "ld1 { v13.s }[2], [x19]\n" + "tbz x8, #1, 6f\n" + "ld1 { v15.d }[0], [x19], #0x8\n" + "tbz x8, #0, 7f\n" + "ld1 { v15.s }[2], [x19]\n" "b 7f\n" "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset - 
"tbz x3, #0, 7f\n" - "ld1 { v13.s }[0], [x19]\n" + "tbz x8, #0, 7f\n" + "ld1 { v15.s }[0], [x19]\n" "7:" // Oddments: Load bias: Bit 2: End - "mov v19.16b, v13.16b\n" - "ldr d0, [x5, #0x0]\n" - "mov v20.16b, v10.16b\n" - "ldr d1, [x5, #0x8]\n" - "mov v11.16b, v13.16b\n" - "ldr d2, [x5, #0x10]\n" - "mov v17.16b, v10.16b\n" - "ldr d3, [x5, #0x18]\n" - "mov v18.16b, v13.16b\n" - "ldr d4, [x5, #0x20]\n" + "ldr d0, [x17, #0x0]\n" + "ldr d1, [x17, #0x8]\n" + "mov v9.16b, v15.16b\n" + "mov v16.16b, v10.16b\n" + "ldr d2, [x17, #0x10]\n" + "ldr d3, [x17, #0x18]\n" + "mov v22.16b, v15.16b\n" "mov v21.16b, v10.16b\n" - "ldr d5, [x5, #0x28]\n" - "ssubl v0.8h, v0.8b, v12.8b\n" - "ldr d6, [x5, #0x30]\n" - "ssubl v1.8h, v1.8b, v12.8b\n" - "ldr d7, [x5, #0x38]\n" - "ssubl v2.8h, v2.8b, v12.8b\n" - "ldr d8, [x5, #0x40]\n" - "ssubl v3.8h, v3.8b, v12.8b\n" - "ldp x26, x25, [x7, #0x0]\n" - "add x26, x26, x4\n" - "ssubl v4.8h, v4.8b, v12.8b\n" - "ldp x24, x23, [x7, #0x10]\n" - "ssubl v5.8h, v5.8b, v12.8b\n" - "ldp x22, x21, [x7, #0x20]\n" - "ssubl v6.8h, v6.8b, v12.8b\n" - "add x25, x25, x4\n" - "ssubl v7.8h, v7.8b, v12.8b\n" - "ldp x20, x19, [x7, #0x30]\n" - "ssubl v8.8h, v8.8b, v12.8b\n" - "add x24, x24, x4\n" - "add x23, x23, x4\n" - "add x22, x22, x4\n" - "add x21, x21, x4\n" - "add x20, x20, x4\n" - "add x19, x19, x4\n" - "tbz x3, #2, 9f\n" + "ldr d4, [x17, #0x20]\n" + "ldr d5, [x17, #0x28]\n" + "mov v23.16b, v15.16b\n" + "mov v18.16b, v10.16b\n" + "ldr d6, [x17, #0x30]\n" + "ldr d7, [x17, #0x38]\n" + "ssubl v0.8h, v0.8b, v13.8b\n" + "ssubl v1.8h, v1.8b, v13.8b\n" + "ldr d8, [x17, #0x40]\n" + "ldp x26, x25, [x12, #0x0]\n" + "ssubl v2.8h, v2.8b, v13.8b\n" + "ssubl v3.8h, v3.8b, v13.8b\n" + "ldp x24, x23, [x12, #0x10]\n" + "ldp x22, x21, [x12, #0x20]\n" + "ssubl v4.8h, v4.8b, v13.8b\n" + "ssubl v5.8h, v5.8b, v13.8b\n" + "ldp x20, x19, [x12, #0x30]\n" + "ssubl v6.8h, v6.8b, v13.8b\n" + "ssubl v7.8h, v7.8b, v13.8b\n" + "ssubl v8.8h, v8.8b, v13.8b\n" + "add x26, x26, x15\n" + "add x25, x25, x15\n" + "add x24, x24, x15\n" + "add x23, x23, x15\n" + "add x22, x22, x15\n" + "add x21, x21, x15\n" + "add x20, x20, x15\n" + "add x19, x19, x15\n" + "tbz x8, #2, 9f\n" "ld1 { v31.s }[0], [x26], #0x4\n" "ld1 { v30.s }[0], [x25], #0x4\n" "ld1 { v29.s }[0], [x24], #0x4\n" @@ -713,7 +697,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v26.s }[0], [x21], #0x4\n" "ld1 { v25.s }[0], [x20], #0x4\n" "ld1 { v24.s }[0], [x19], #0x4\n" - "tbz x3, #1, 8f\n" + "tbz x8, #1, 8f\n" "ld1 { v31.h }[2], [x26], #0x2\n" "ld1 { v30.h }[2], [x25], #0x2\n" "ld1 { v29.h }[2], [x24], #0x2\n" @@ -722,7 +706,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v26.h }[2], [x21], #0x2\n" "ld1 { v25.h }[2], [x20], #0x2\n" "ld1 { v24.h }[2], [x19], #0x2\n" - "tbz x3, #0, 11f\n" + "tbz x8, #0, 11f\n" "ld1 { v31.b }[6], [x26]\n" "ld1 { v30.b }[6], [x25]\n" "ld1 { v29.b }[6], [x24]\n" @@ -733,7 +717,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v24.b }[6], [x19]\n" "b 11f\n" "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset - "tbz x3, #0, 11f\n" + "tbz x8, #0, 11f\n" "ld1 { v31.b }[4], [x26]\n" "ld1 { v30.b }[4], [x25]\n" "ld1 { v29.b }[4], [x24]\n" @@ -744,7 +728,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v24.b }[4], [x19]\n" "b 11f\n" "9:" // Oddments: Initial loads: Bit 2: Unset - "tbz x3, #1, 10f\n" + "tbz x8, #1, 10f\n" "ld1 { v31.h }[0], [x26], #0x2\n" "ld1 { v30.h }[0], [x25], #0x2\n" "ld1 { v29.h }[0], [x24], #0x2\n" @@ -753,7 +737,7 @@ void 
a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v26.h }[0], [x21], #0x2\n" "ld1 { v25.h }[0], [x20], #0x2\n" "ld1 { v24.h }[0], [x19], #0x2\n" - "tbz x3, #0, 11f\n" + "tbz x8, #0, 11f\n" "ld1 { v31.b }[2], [x26]\n" "ld1 { v30.b }[2], [x25]\n" "ld1 { v29.b }[2], [x24]\n" @@ -764,7 +748,7 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v24.b }[2], [x19]\n" "b 11f\n" "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 11f\n" + "tbz x8, #0, 11f\n" "ld1 { v31.b }[0], [x26]\n" "ld1 { v30.b }[0], [x25]\n" "ld1 { v29.b }[0], [x24]\n" @@ -774,646 +758,636 @@ void a64_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( "ld1 { v25.b }[0], [x20]\n" "ld1 { v24.b }[0], [x19]\n" "11:" // Oddments: Initial loads: Bit 2: End - "usubl v31.8h, v31.8b, v22.8b\n" - "ldr x22, [x7, #0x40]\n" - "add x22, x22, x4\n" - "usubl v30.8h, v30.8b, v22.8b\n" - "usubl v29.8h, v29.8b, v22.8b\n" - "usubl v28.8h, v28.8b, v22.8b\n" - "usubl v27.8h, v27.8b, v22.8b\n" - "usubl v26.8h, v26.8b, v22.8b\n" - "usubl v25.8h, v25.8b, v22.8b\n" - "usubl v24.8h, v24.8b, v22.8b\n" - "smlal v13.4s, v31.4h, v8.4h\n" + "usubl v31.8h, v31.8b, v12.8b\n" + "smlal v15.4s, v31.4h, v8.4h\n" "smlal2 v10.4s, v31.8h, v8.8h\n" - "smlal v19.4s, v31.4h, v6.4h\n" - "smlal2 v20.4s, v31.8h, v6.8h\n" - "smlal v11.4s, v31.4h, v2.4h\n" - "smlal2 v17.4s, v31.8h, v2.8h\n" - "smlal v18.4s, v31.4h, v0.4h\n" - "smlal2 v21.4s, v31.8h, v0.8h\n" - "smlal v13.4s, v30.4h, v0.4h\n" + "ldr x24, [x12, #0x40]\n" + "usubl v30.8h, v30.8b, v12.8b\n" + "smlal v15.4s, v30.4h, v0.4h\n" "smlal2 v10.4s, v30.8h, v0.8h\n" - "smlal v19.4s, v28.4h, v1.4h\n" - "smlal2 v20.4s, v28.8h, v1.8h\n" - "smlal v13.4s, v29.4h, v1.4h\n" + "add x24, x24, x15\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v9.4s, v31.4h, v6.4h\n" + "smlal2 v16.4s, v31.8h, v6.8h\n" + "smlal v15.4s, v29.4h, v1.4h\n" "smlal2 v10.4s, v29.8h, v1.8h\n" - "smlal v19.4s, v27.4h, v2.4h\n" - "smlal2 v20.4s, v27.8h, v2.8h\n" - "smlal v13.4s, v26.4h, v3.4h\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "smlal v9.4s, v28.4h, v1.4h\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "smlal v15.4s, v26.4h, v3.4h\n" "smlal2 v10.4s, v26.8h, v3.8h\n" - "smlal v19.4s, v24.4h, v0.4h\n" - "smlal2 v20.4s, v24.8h, v0.8h\n" - "smlal v13.4s, v25.4h, v4.4h\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "smlal v9.4s, v27.4h, v2.4h\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "smlal v15.4s, v25.4h, v4.4h\n" "smlal2 v10.4s, v25.8h, v4.8h\n" - "smlal v13.4s, v24.4h, v2.4h\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "smlal v22.4s, v31.4h, v2.4h\n" + "smlal2 v21.4s, v31.8h, v2.8h\n" + "smlal v23.4s, v31.4h, v0.4h\n" + "smlal2 v18.4s, v31.8h, v0.8h\n" + "smlal v15.4s, v24.4h, v2.4h\n" "smlal2 v10.4s, v24.8h, v2.8h\n" - "tbz x3, #2, 13f\n" - "ld1 { v29.s }[0], [x22], #0x4\n" - "tbz x3, #1, 12f\n" - "ld1 { v29.h }[2], [x22], #0x2\n" - "tbz x3, #0, 15f\n" - "ld1 { v29.b }[6], [x22]\n" + "smlal v9.4s, v24.4h, v0.4h\n" + "smlal2 v16.4s, v24.8h, v0.8h\n" + "tbz x8, #2, 13f\n" + "ld1 { v29.s }[0], [x24], #0x4\n" + "tbz x8, #1, 12f\n" + "ld1 { v29.h }[2], [x24], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[6], [x24]\n" "b 15f\n" "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset - "tbz x3, #0, 15f\n" - "ld1 { v29.b }[4], [x22]\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[4], [x24]\n" "b 15f\n" "13:" // Oddments: Load (1, 3): Bit 2: Unset - "tbz x3, #1, 14f\n" - "ld1 { v29.h }[0], [x22], #0x2\n" - "tbz x3, #0, 15f\n" - "ld1 { v29.b }[2], [x22]\n" + "tbz x8, #1, 
14f\n" + "ld1 { v29.h }[0], [x24], #0x2\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[2], [x24]\n" "b 15f\n" "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 15f\n" - "ld1 { v29.b }[0], [x22]\n" + "tbz x8, #0, 15f\n" + "ld1 { v29.b }[0], [x24]\n" "15:" // Oddments: Load (1, 3): Bit 2: End - "usubl v29.8h, v29.8b, v22.8b\n" - "ldr x21, [x7, #0x48]\n" - "smlal v19.4s, v29.4h, v4.4h\n" - "add x21, x21, x4\n" - "smlal2 v20.4s, v29.8h, v4.8h\n" - "tbz x3, #2, 17f\n" - "ld1 { v28.s }[0], [x21], #0x4\n" - "tbz x3, #1, 16f\n" - "ld1 { v28.h }[2], [x21], #0x2\n" - "tbz x3, #0, 19f\n" - "ld1 { v28.b }[6], [x21]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "ldr x23, [x12, #0x48]\n" + "smlal v9.4s, v29.4h, v4.4h\n" + "smlal2 v16.4s, v29.8h, v4.8h\n" + "add x23, x23, x15\n" + "tbz x8, #2, 17f\n" + "ld1 { v28.s }[0], [x23], #0x4\n" + "tbz x8, #1, 16f\n" + "ld1 { v28.h }[2], [x23], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[6], [x23]\n" "b 19f\n" "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset - "tbz x3, #0, 19f\n" - "ld1 { v28.b }[4], [x21]\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[4], [x23]\n" "b 19f\n" "17:" // Oddments: Load (1, 4): Bit 2: Unset - "tbz x3, #1, 18f\n" - "ld1 { v28.h }[0], [x21], #0x2\n" - "tbz x3, #0, 19f\n" - "ld1 { v28.b }[2], [x21]\n" + "tbz x8, #1, 18f\n" + "ld1 { v28.h }[0], [x23], #0x2\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[2], [x23]\n" "b 19f\n" "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 19f\n" - "ld1 { v28.b }[0], [x21]\n" + "tbz x8, #0, 19f\n" + "ld1 { v28.b }[0], [x23]\n" "19:" // Oddments: Load (1, 4): Bit 2: End - "usubl v28.8h, v28.8b, v22.8b\n" - "ldr x20, [x7, #0x50]\n" - "smlal v19.4s, v28.4h, v5.4h\n" - "add x20, x20, x4\n" - "smlal2 v20.4s, v28.8h, v5.8h\n" - "tbz x3, #2, 21f\n" - "ld1 { v27.s }[0], [x20], #0x4\n" - "tbz x3, #1, 20f\n" - "ld1 { v27.h }[2], [x20], #0x2\n" - "tbz x3, #0, 23f\n" - "ld1 { v27.b }[6], [x20]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "ldr x21, [x12, #0x50]\n" + "smlal v9.4s, v28.4h, v5.4h\n" + "smlal2 v16.4s, v28.8h, v5.8h\n" + "add x21, x21, x15\n" + "tbz x8, #2, 21f\n" + "ld1 { v27.s }[0], [x21], #0x4\n" + "tbz x8, #1, 20f\n" + "ld1 { v27.h }[2], [x21], #0x2\n" + "tbz x8, #0, 23f\n" + "ld1 { v27.b }[6], [x21]\n" "b 23f\n" "20:" // Oddments: Load (1, 2): Bit 2: Bit 1: Unset - "tbz x3, #0, 23f\n" - "ld1 { v27.b }[4], [x20]\n" + "tbz x8, #0, 23f\n" + "ld1 { v27.b }[4], [x21]\n" "b 23f\n" "21:" // Oddments: Load (1, 2): Bit 2: Unset - "tbz x3, #1, 22f\n" - "ld1 { v27.h }[0], [x20], #0x2\n" - "tbz x3, #0, 23f\n" - "ld1 { v27.b }[2], [x20]\n" + "tbz x8, #1, 22f\n" + "ld1 { v27.h }[0], [x21], #0x2\n" + "tbz x8, #0, 23f\n" + "ld1 { v27.b }[2], [x21]\n" "b 23f\n" "22:" // Oddments: Load (1, 2): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 23f\n" - "ld1 { v27.b }[0], [x20]\n" + "tbz x8, #0, 23f\n" + "ld1 { v27.b }[0], [x21]\n" "23:" // Oddments: Load (1, 2): Bit 2: End - "usubl v27.8h, v27.8b, v22.8b\n" - "ldr x19, [x7, #0x58]\n" - "smlal v13.4s, v27.4h, v5.4h\n" - "add x19, x19, x4\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "ldr x19, [x12, #0x58]\n" + "smlal v15.4s, v27.4h, v5.4h\n" "smlal2 v10.4s, v27.8h, v5.8h\n" - "smlal v19.4s, v27.4h, v3.4h\n" - "smlal2 v20.4s, v27.8h, v3.8h\n" - "tbz x3, #2, 25f\n" + "smlal v9.4s, v27.4h, v3.4h\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 25f\n" "ld1 { v26.s }[0], [x19], #0x4\n" - "tbz x3, #1, 24f\n" + "tbz x8, #1, 24f\n" "ld1 { v26.h }[2], [x19], #0x2\n" - "tbz x3, #0, 27f\n" + "tbz x8, #0, 27f\n" "ld1 { v26.b }[6], [x19]\n" 
"b 27f\n" "24:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset - "tbz x3, #0, 27f\n" + "tbz x8, #0, 27f\n" "ld1 { v26.b }[4], [x19]\n" "b 27f\n" "25:" // Oddments: Load (3, 0): Bit 2: Unset - "tbz x3, #1, 26f\n" + "tbz x8, #1, 26f\n" "ld1 { v26.h }[0], [x19], #0x2\n" - "tbz x3, #0, 27f\n" + "tbz x8, #0, 27f\n" "ld1 { v26.b }[2], [x19]\n" "b 27f\n" "26:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 27f\n" + "tbz x8, #0, 27f\n" "ld1 { v26.b }[0], [x19]\n" "27:" // Oddments: Load (3, 0): Bit 2: End - "usubl v26.8h, v26.8b, v22.8b\n" - "ldr x11, [x7, #0x60]\n" - "smlal v11.4s, v26.4h, v3.4h\n" - "add x11, x11, x4\n" - "smlal2 v17.4s, v26.8h, v3.8h\n" - "tbz x3, #2, 29f\n" - "ld1 { v25.s }[0], [x11], #0x4\n" - "tbz x3, #1, 28f\n" - "ld1 { v25.h }[2], [x11], #0x2\n" - "tbz x3, #0, 31f\n" - "ld1 { v25.b }[6], [x11]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "ldr x20, [x12, #0x60]\n" + "smlal v22.4s, v26.4h, v3.4h\n" + "smlal2 v21.4s, v26.8h, v3.8h\n" + "add x20, x20, x15\n" + "tbz x8, #2, 29f\n" + "ld1 { v25.s }[0], [x20], #0x4\n" + "tbz x8, #1, 28f\n" + "ld1 { v25.h }[2], [x20], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[6], [x20]\n" "b 31f\n" "28:" // Oddments: Load (2, 0): Bit 2: Bit 1: Unset - "tbz x3, #0, 31f\n" - "ld1 { v25.b }[4], [x11]\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[4], [x20]\n" "b 31f\n" "29:" // Oddments: Load (2, 0): Bit 2: Unset - "tbz x3, #1, 30f\n" - "ld1 { v25.h }[0], [x11], #0x2\n" - "tbz x3, #0, 31f\n" - "ld1 { v25.b }[2], [x11]\n" + "tbz x8, #1, 30f\n" + "ld1 { v25.h }[0], [x20], #0x2\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[2], [x20]\n" "b 31f\n" "30:" // Oddments: Load (2, 0): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 31f\n" - "ld1 { v25.b }[0], [x11]\n" + "tbz x8, #0, 31f\n" + "ld1 { v25.b }[0], [x20]\n" "31:" // Oddments: Load (2, 0): Bit 2: End - "usubl v25.8h, v25.8b, v22.8b\n" - "ldr x10, [x7, #0x68]\n" - "smlal v13.4s, v25.4h, v6.4h\n" - "add x10, x10, x4\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "ldr x19, [x12, #0x68]\n" + "smlal v15.4s, v25.4h, v6.4h\n" "smlal2 v10.4s, v25.8h, v6.8h\n" - "smlal v11.4s, v25.4h, v0.4h\n" - "smlal2 v17.4s, v25.8h, v0.8h\n" - "tbz x3, #2, 33f\n" - "ld1 { v29.s }[0], [x10], #0x4\n" - "tbz x3, #1, 32f\n" - "ld1 { v29.h }[2], [x10], #0x2\n" - "tbz x3, #0, 35f\n" - "ld1 { v29.b }[6], [x10]\n" + "smlal v22.4s, v25.4h, v0.4h\n" + "smlal2 v21.4s, v25.8h, v0.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 33f\n" + "ld1 { v29.s }[0], [x19], #0x4\n" + "tbz x8, #1, 32f\n" + "ld1 { v29.h }[2], [x19], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[6], [x19]\n" "b 35f\n" "32:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset - "tbz x3, #0, 35f\n" - "ld1 { v29.b }[4], [x10]\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[4], [x19]\n" "b 35f\n" "33:" // Oddments: Load (3, 1): Bit 2: Unset - "tbz x3, #1, 34f\n" - "ld1 { v29.h }[0], [x10], #0x2\n" - "tbz x3, #0, 35f\n" - "ld1 { v29.b }[2], [x10]\n" + "tbz x8, #1, 34f\n" + "ld1 { v29.h }[0], [x19], #0x2\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[2], [x19]\n" "b 35f\n" "34:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 35f\n" - "ld1 { v29.b }[0], [x10]\n" + "tbz x8, #0, 35f\n" + "ld1 { v29.b }[0], [x19]\n" "35:" // Oddments: Load (3, 1): Bit 2: End - "usubl v29.8h, v29.8b, v22.8b\n" - "ldr x9, [x7, #0x70]\n" - "smlal v11.4s, v29.4h, v4.4h\n" - "add x9, x9, x4\n" - "smlal2 v17.4s, v29.8h, v4.8h\n" - "tbz x3, #2, 37f\n" - "ld1 { v24.s }[0], [x9], #0x4\n" - "tbz x3, #1, 36f\n" - "ld1 { v24.h }[2], [x9], #0x2\n" - "tbz x3, #0, 39f\n" - "ld1 { v24.b }[6], [x9]\n" + "usubl 
v29.8h, v29.8b, v12.8b\n" + "ldr x19, [x12, #0x70]\n" + "smlal v22.4s, v29.4h, v4.4h\n" + "smlal2 v21.4s, v29.8h, v4.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 37f\n" + "ld1 { v24.s }[0], [x19], #0x4\n" + "tbz x8, #1, 36f\n" + "ld1 { v24.h }[2], [x19], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[6], [x19]\n" "b 39f\n" "36:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset - "tbz x3, #0, 39f\n" - "ld1 { v24.b }[4], [x9]\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[4], [x19]\n" "b 39f\n" "37:" // Oddments: Load (2, 1): Bit 2: Unset - "tbz x3, #1, 38f\n" - "ld1 { v24.h }[0], [x9], #0x2\n" - "tbz x3, #0, 39f\n" - "ld1 { v24.b }[2], [x9]\n" + "tbz x8, #1, 38f\n" + "ld1 { v24.h }[0], [x19], #0x2\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[2], [x19]\n" "b 39f\n" "38:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 39f\n" - "ld1 { v24.b }[0], [x9]\n" + "tbz x8, #0, 39f\n" + "ld1 { v24.b }[0], [x19]\n" "39:" // Oddments: Load (2, 1): Bit 2: End - "usubl v24.8h, v24.8b, v22.8b\n" - "ldr x28, [x7, #0x78]\n" - "smlal v13.4s, v24.4h, v7.4h\n" - "add x28, x28, x4\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "ldr x22, [x12, #0x78]\n" + "smlal v15.4s, v24.4h, v7.4h\n" "smlal2 v10.4s, v24.8h, v7.8h\n" - "smlal v11.4s, v24.4h, v1.4h\n" - "smlal2 v17.4s, v24.8h, v1.8h\n" - "tbz x3, #2, 41f\n" - "ld1 { v27.s }[0], [x28], #0x4\n" - "tbz x3, #1, 40f\n" - "ld1 { v27.h }[2], [x28], #0x2\n" - "tbz x3, #0, 43f\n" - "ld1 { v27.b }[6], [x28]\n" + "smlal v22.4s, v24.4h, v1.4h\n" + "smlal2 v21.4s, v24.8h, v1.8h\n" + "add x22, x22, x15\n" + "tbz x8, #2, 41f\n" + "ld1 { v27.s }[0], [x22], #0x4\n" + "tbz x8, #1, 40f\n" + "ld1 { v27.h }[2], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[6], [x22]\n" "b 43f\n" "40:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset - "tbz x3, #0, 43f\n" - "ld1 { v27.b }[4], [x28]\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[4], [x22]\n" "b 43f\n" "41:" // Oddments: Load (3, 3): Bit 2: Unset - "tbz x3, #1, 42f\n" - "ld1 { v27.h }[0], [x28], #0x2\n" - "tbz x3, #0, 43f\n" - "ld1 { v27.b }[2], [x28]\n" + "tbz x8, #1, 42f\n" + "ld1 { v27.h }[0], [x22], #0x2\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[2], [x22]\n" "b 43f\n" "42:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 43f\n" - "ld1 { v27.b }[0], [x28]\n" + "tbz x8, #0, 43f\n" + "ld1 { v27.b }[0], [x22]\n" "43:" // Oddments: Load (3, 3): Bit 2: End - "usubl v27.8h, v27.8b, v22.8b\n" - "ldr x27, [x7, #0x80]\n" - "smlal v18.4s, v27.4h, v4.4h\n" - "add x27, x27, x4\n" - "smlal2 v21.4s, v27.8h, v4.8h\n" - "tbz x3, #2, 45f\n" - "ld1 { v28.s }[0], [x27], #0x4\n" - "tbz x3, #1, 44f\n" - "ld1 { v28.h }[2], [x27], #0x2\n" - "tbz x3, #0, 47f\n" - "ld1 { v28.b }[6], [x27]\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "ldr x21, [x12, #0x80]\n" + "smlal v23.4s, v27.4h, v4.4h\n" + "smlal2 v18.4s, v27.8h, v4.8h\n" + "add x21, x21, x15\n" + "tbz x8, #2, 45f\n" + "ld1 { v28.s }[0], [x21], #0x4\n" + "tbz x8, #1, 44f\n" + "ld1 { v28.h }[2], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[6], [x21]\n" "b 47f\n" "44:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset - "tbz x3, #0, 47f\n" - "ld1 { v28.b }[4], [x27]\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[4], [x21]\n" "b 47f\n" "45:" // Oddments: Load (2, 3): Bit 2: Unset - "tbz x3, #1, 46f\n" - "ld1 { v28.h }[0], [x27], #0x2\n" - "tbz x3, #0, 47f\n" - "ld1 { v28.b }[2], [x27]\n" + "tbz x8, #1, 46f\n" + "ld1 { v28.h }[0], [x21], #0x2\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[2], [x21]\n" "b 47f\n" "46:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 47f\n" - 
"ld1 { v28.b }[0], [x27]\n" + "tbz x8, #0, 47f\n" + "ld1 { v28.b }[0], [x21]\n" "47:" // Oddments: Load (2, 3): Bit 2: End - "usubl v28.8h, v28.8b, v22.8b\n" - "ldr x26, [x7, #0x88]\n" - "smlal v19.4s, v28.4h, v7.4h\n" - "add x26, x26, x4\n" - "smlal2 v20.4s, v28.8h, v7.8h\n" - "smlal v18.4s, v28.4h, v1.4h\n" - "smlal2 v21.4s, v28.8h, v1.8h\n" - "tbz x3, #2, 49f\n" - "ld1 { v26.s }[0], [x26], #0x4\n" - "tbz x3, #1, 48f\n" - "ld1 { v26.h }[2], [x26], #0x2\n" - "tbz x3, #0, 51f\n" - "ld1 { v26.b }[6], [x26]\n" + "usubl v28.8h, v28.8b, v12.8b\n" + "ldr x20, [x12, #0x88]\n" + "smlal v9.4s, v28.4h, v7.4h\n" + "smlal2 v16.4s, v28.8h, v7.8h\n" + "smlal v23.4s, v28.4h, v1.4h\n" + "smlal2 v18.4s, v28.8h, v1.8h\n" + "add x20, x20, x15\n" + "tbz x8, #2, 49f\n" + "ld1 { v26.s }[0], [x20], #0x4\n" + "tbz x8, #1, 48f\n" + "ld1 { v26.h }[2], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[6], [x20]\n" "b 51f\n" "48:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset - "tbz x3, #0, 51f\n" - "ld1 { v26.b }[4], [x26]\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[4], [x20]\n" "b 51f\n" "49:" // Oddments: Load (3, 4): Bit 2: Unset - "tbz x3, #1, 50f\n" - "ld1 { v26.h }[0], [x26], #0x2\n" - "tbz x3, #0, 51f\n" - "ld1 { v26.b }[2], [x26]\n" + "tbz x8, #1, 50f\n" + "ld1 { v26.h }[0], [x20], #0x2\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[2], [x20]\n" "b 51f\n" "50:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 51f\n" - "ld1 { v26.b }[0], [x26]\n" + "tbz x8, #0, 51f\n" + "ld1 { v26.b }[0], [x20]\n" "51:" // Oddments: Load (3, 4): Bit 2: End - "usubl v26.8h, v26.8b, v22.8b\n" - "ldr x25, [x7, #0x90]\n" - "smlal v18.4s, v26.4h, v5.4h\n" - "add x25, x25, x4\n" - "smlal2 v21.4s, v26.8h, v5.8h\n" - "tbz x3, #2, 53f\n" - "ld1 { v25.s }[0], [x25], #0x4\n" - "tbz x3, #1, 52f\n" - "ld1 { v25.h }[2], [x25], #0x2\n" - "tbz x3, #0, 55f\n" - "ld1 { v25.b }[6], [x25]\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "ldr x23, [x12, #0x90]\n" + "smlal v23.4s, v26.4h, v5.4h\n" + "smlal2 v18.4s, v26.8h, v5.8h\n" + "add x23, x23, x15\n" + "tbz x8, #2, 53f\n" + "ld1 { v25.s }[0], [x23], #0x4\n" + "tbz x8, #1, 52f\n" + "ld1 { v25.h }[2], [x23], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[6], [x23]\n" "b 55f\n" "52:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset - "tbz x3, #0, 55f\n" - "ld1 { v25.b }[4], [x25]\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[4], [x23]\n" "b 55f\n" "53:" // Oddments: Load (4, 0): Bit 2: Unset - "tbz x3, #1, 54f\n" - "ld1 { v25.h }[0], [x25], #0x2\n" - "tbz x3, #0, 55f\n" - "ld1 { v25.b }[2], [x25]\n" + "tbz x8, #1, 54f\n" + "ld1 { v25.h }[0], [x23], #0x2\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[2], [x23]\n" "b 55f\n" "54:" // Oddments: Load (4, 0): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 55f\n" - "ld1 { v25.b }[0], [x25]\n" + "tbz x8, #0, 55f\n" + "ld1 { v25.b }[0], [x23]\n" "55:" // Oddments: Load (4, 0): Bit 2: End - "usubl v25.8h, v25.8b, v22.8b\n" - "ldr x24, [x7, #0x98]\n" - "smlal v11.4s, v25.4h, v6.4h\n" - "add x24, x24, x4\n" - "smlal2 v17.4s, v25.8h, v6.8h\n" - "tbz x3, #2, 57f\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "ldr x24, [x12, #0x98]\n" + "smlal v22.4s, v25.4h, v6.4h\n" + "smlal2 v21.4s, v25.8h, v6.8h\n" + "add x24, x24, x15\n" + "tbz x8, #2, 57f\n" "ld1 { v29.s }[0], [x24], #0x4\n" - "tbz x3, #1, 56f\n" + "tbz x8, #1, 56f\n" "ld1 { v29.h }[2], [x24], #0x2\n" - "tbz x3, #0, 59f\n" + "tbz x8, #0, 59f\n" "ld1 { v29.b }[6], [x24]\n" "b 59f\n" "56:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset - "tbz x3, #0, 59f\n" + "tbz x8, #0, 59f\n" "ld1 { v29.b }[4], [x24]\n" "b 59f\n" "57:" 
// Oddments: Load (2, 4): Bit 2: Unset - "tbz x3, #1, 58f\n" + "tbz x8, #1, 58f\n" "ld1 { v29.h }[0], [x24], #0x2\n" - "tbz x3, #0, 59f\n" + "tbz x8, #0, 59f\n" "ld1 { v29.b }[2], [x24]\n" "b 59f\n" "58:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 59f\n" + "tbz x8, #0, 59f\n" "ld1 { v29.b }[0], [x24]\n" "59:" // Oddments: Load (2, 4): Bit 2: End - "usubl v29.8h, v29.8b, v22.8b\n" - "ldr x23, [x7, #0xa0]\n" - "smlal v19.4s, v29.4h, v8.4h\n" - "add x23, x23, x4\n" - "smlal2 v20.4s, v29.8h, v8.8h\n" - "smlal v18.4s, v29.4h, v2.4h\n" - "smlal2 v21.4s, v29.8h, v2.8h\n" - "tbz x3, #2, 61f\n" - "ld1 { v27.s }[0], [x23], #0x4\n" - "tbz x3, #1, 60f\n" - "ld1 { v27.h }[2], [x23], #0x2\n" - "tbz x3, #0, 63f\n" - "ld1 { v27.b }[6], [x23]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "ldr x19, [x12, #0xa0]\n" + "smlal v9.4s, v29.4h, v8.4h\n" + "smlal2 v16.4s, v29.8h, v8.8h\n" + "smlal v23.4s, v29.4h, v2.4h\n" + "smlal2 v18.4s, v29.8h, v2.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 61f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x8, #1, 60f\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[6], [x19]\n" "b 63f\n" "60:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset - "tbz x3, #0, 63f\n" - "ld1 { v27.b }[4], [x23]\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[4], [x19]\n" "b 63f\n" "61:" // Oddments: Load (4, 1): Bit 2: Unset - "tbz x3, #1, 62f\n" - "ld1 { v27.h }[0], [x23], #0x2\n" - "tbz x3, #0, 63f\n" - "ld1 { v27.b }[2], [x23]\n" + "tbz x8, #1, 62f\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[2], [x19]\n" "b 63f\n" "62:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 63f\n" - "ld1 { v27.b }[0], [x23]\n" + "tbz x8, #0, 63f\n" + "ld1 { v27.b }[0], [x19]\n" "63:" // Oddments: Load (4, 1): Bit 2: End - "usubl v27.8h, v27.8b, v22.8b\n" - "ldr x22, [x7, #0xa8]\n" - "smlal v11.4s, v27.4h, v7.4h\n" - "add x22, x22, x4\n" - "smlal2 v17.4s, v27.8h, v7.8h\n" - "tbz x3, #2, 65f\n" + "usubl v27.8h, v27.8b, v12.8b\n" + "ldr x22, [x12, #0xa8]\n" + "smlal v22.4s, v27.4h, v7.4h\n" + "smlal2 v21.4s, v27.8h, v7.8h\n" + "add x22, x22, x15\n" + "tbz x8, #2, 65f\n" "ld1 { v24.s }[0], [x22], #0x4\n" - "tbz x3, #1, 64f\n" + "tbz x8, #1, 64f\n" "ld1 { v24.h }[2], [x22], #0x2\n" - "tbz x3, #0, 67f\n" + "tbz x8, #0, 67f\n" "ld1 { v24.b }[6], [x22]\n" "b 67f\n" "64:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset - "tbz x3, #0, 67f\n" + "tbz x8, #0, 67f\n" "ld1 { v24.b }[4], [x22]\n" "b 67f\n" "65:" // Oddments: Load (3, 2): Bit 2: Unset - "tbz x3, #1, 66f\n" + "tbz x8, #1, 66f\n" "ld1 { v24.h }[0], [x22], #0x2\n" - "tbz x3, #0, 67f\n" + "tbz x8, #0, 67f\n" "ld1 { v24.b }[2], [x22]\n" "b 67f\n" "66:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 67f\n" + "tbz x8, #0, 67f\n" "ld1 { v24.b }[0], [x22]\n" "67:" // Oddments: Load (3, 2): Bit 2: End - "usubl v24.8h, v24.8b, v22.8b\n" - "ldr x21, [x7, #0xb0]\n" - "smlal v11.4s, v24.4h, v5.4h\n" - "add x21, x21, x4\n" - "smlal2 v17.4s, v24.8h, v5.8h\n" - "smlal v18.4s, v24.4h, v3.4h\n" - "smlal2 v21.4s, v24.8h, v3.8h\n" - "tbz x3, #2, 69f\n" + "usubl v24.8h, v24.8b, v12.8b\n" + "ldr x21, [x12, #0xb0]\n" + "smlal v22.4s, v24.4h, v5.4h\n" + "smlal2 v21.4s, v24.8h, v5.8h\n" + "smlal v23.4s, v24.4h, v3.4h\n" + "smlal2 v18.4s, v24.8h, v3.8h\n" + "add x21, x21, x15\n" + "tbz x8, #2, 69f\n" "ld1 { v26.s }[0], [x21], #0x4\n" - "tbz x3, #1, 68f\n" + "tbz x8, #1, 68f\n" "ld1 { v26.h }[2], [x21], #0x2\n" - "tbz x3, #0, 71f\n" + "tbz x8, #0, 71f\n" "ld1 { v26.b }[6], [x21]\n" "b 
71f\n" "68:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset - "tbz x3, #0, 71f\n" + "tbz x8, #0, 71f\n" "ld1 { v26.b }[4], [x21]\n" "b 71f\n" "69:" // Oddments: Load (4, 3): Bit 2: Unset - "tbz x3, #1, 70f\n" + "tbz x8, #1, 70f\n" "ld1 { v26.h }[0], [x21], #0x2\n" - "tbz x3, #0, 71f\n" + "tbz x8, #0, 71f\n" "ld1 { v26.b }[2], [x21]\n" "b 71f\n" "70:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 71f\n" + "tbz x8, #0, 71f\n" "ld1 { v26.b }[0], [x21]\n" "71:" // Oddments: Load (4, 3): Bit 2: End - "usubl v26.8h, v26.8b, v22.8b\n" - "ldr x20, [x7, #0xb8]\n" - "smlal v18.4s, v26.4h, v7.4h\n" - "add x20, x20, x4\n" - "smlal2 v21.4s, v26.8h, v7.8h\n" - "tbz x3, #2, 73f\n" + "usubl v26.8h, v26.8b, v12.8b\n" + "ldr x20, [x12, #0xb8]\n" + "smlal v23.4s, v26.4h, v7.4h\n" + "smlal2 v18.4s, v26.8h, v7.8h\n" + "add x20, x20, x15\n" + "tbz x8, #2, 73f\n" "ld1 { v25.s }[0], [x20], #0x4\n" - "tbz x3, #1, 72f\n" + "tbz x8, #1, 72f\n" "ld1 { v25.h }[2], [x20], #0x2\n" - "tbz x3, #0, 75f\n" + "tbz x8, #0, 75f\n" "ld1 { v25.b }[6], [x20]\n" "b 75f\n" "72:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset - "tbz x3, #0, 75f\n" + "tbz x8, #0, 75f\n" "ld1 { v25.b }[4], [x20]\n" "b 75f\n" "73:" // Oddments: Load (4, 2): Bit 2: Unset - "tbz x3, #1, 74f\n" + "tbz x8, #1, 74f\n" "ld1 { v25.h }[0], [x20], #0x2\n" - "tbz x3, #0, 75f\n" + "tbz x8, #0, 75f\n" "ld1 { v25.b }[2], [x20]\n" "b 75f\n" "74:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 75f\n" + "tbz x8, #0, 75f\n" "ld1 { v25.b }[0], [x20]\n" "75:" // Oddments: Load (4, 2): Bit 2: End - "usubl v25.8h, v25.8b, v22.8b\n" - "ldr x19, [x7, #0xc0]\n" - "smlal v11.4s, v25.4h, v8.4h\n" - "add x19, x19, x4\n" - "smlal2 v17.4s, v25.8h, v8.8h\n" - "smlal v18.4s, v25.4h, v6.4h\n" - "smlal2 v21.4s, v25.8h, v6.8h\n" - "tbz x3, #2, 77f\n" + "usubl v25.8h, v25.8b, v12.8b\n" + "ldr x19, [x12, #0xc0]\n" + "smlal v22.4s, v25.4h, v8.4h\n" + "smlal2 v21.4s, v25.8h, v8.8h\n" + "smlal v23.4s, v25.4h, v6.4h\n" + "smlal2 v18.4s, v25.8h, v6.8h\n" + "add x19, x19, x15\n" + "tbz x8, #2, 77f\n" "ld1 { v29.s }[0], [x19], #0x4\n" - "tbz x3, #1, 76f\n" + "tbz x8, #1, 76f\n" "ld1 { v29.h }[2], [x19], #0x2\n" - "tbz x3, #0, 79f\n" + "tbz x8, #0, 79f\n" "ld1 { v29.b }[6], [x19]\n" "b 79f\n" "76:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset - "tbz x3, #0, 79f\n" + "tbz x8, #0, 79f\n" "ld1 { v29.b }[4], [x19]\n" "b 79f\n" "77:" // Oddments: Load (4, 4): Bit 2: Unset - "tbz x3, #1, 78f\n" + "tbz x8, #1, 78f\n" "ld1 { v29.h }[0], [x19], #0x2\n" - "tbz x3, #0, 79f\n" + "tbz x8, #0, 79f\n" "ld1 { v29.b }[2], [x19]\n" "b 79f\n" "78:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 79f\n" + "tbz x8, #0, 79f\n" "ld1 { v29.b }[0], [x19]\n" "79:" // Oddments: Load (4, 4): Bit 2: End - "usubl v29.8h, v29.8b, v22.8b\n" - "smlal v18.4s, v29.4h, v8.4h\n" - "smlal2 v21.4s, v29.8h, v8.8h\n" - "tbz x3, #2, 81f\n" - "ld1 { v31.4s }, [x8], #0x10\n" - "ld1 { v30.4s }, [x16], #0x10\n" - "tbz x3, #1, 80f\n" - "ld1 { v23.d }[0], [x8], #0x8\n" - "ld1 { v9.d }[0], [x16], #0x8\n" - "tbz x3, #0, 83f\n" - "ld1 { v23.s }[2], [x8]\n" - "ld1 { v9.s }[2], [x16]\n" + "usubl v29.8h, v29.8b, v12.8b\n" + "smlal v23.4s, v29.4h, v8.4h\n" + "smlal2 v18.4s, v29.8h, v8.8h\n" + "tbz x8, #2, 81f\n" + "ld1 { v19.4s }, [x13], #0x10\n" + "ld1 { v0.4s }, [x11], #0x10\n" + "tbz x8, #1, 80f\n" + "ld1 { v4.d }[0], [x13], #0x8\n" + "ld1 { v31.d }[0], [x11], #0x8\n" + "tbz x8, #0, 83f\n" + "ld1 { v4.s }[2], [x13]\n" + "ld1 { v31.s }[2], [x11]\n" "b 83f\n" "80:" // Oddments: Load 
requant params: Bit 2: Bit 1: Unset - "tbz x3, #0, 83f\n" - "ld1 { v23.s }[0], [x8]\n" - "ld1 { v9.s }[0], [x16]\n" + "tbz x8, #0, 83f\n" + "ld1 { v4.s }[0], [x13]\n" + "ld1 { v31.s }[0], [x11]\n" "b 83f\n" "81:" // Oddments: Load requant params: Bit 2: Unset - "tbz x3, #1, 82f\n" - "ld1 { v31.d }[0], [x8], #0x8\n" - "ld1 { v30.d }[0], [x16], #0x8\n" - "tbz x3, #0, 83f\n" - "ld1 { v31.s }[2], [x8]\n" - "ld1 { v30.s }[2], [x16]\n" + "tbz x8, #1, 82f\n" + "ld1 { v19.d }[0], [x13], #0x8\n" + "ld1 { v0.d }[0], [x11], #0x8\n" + "tbz x8, #0, 83f\n" + "ld1 { v19.s }[2], [x13]\n" + "ld1 { v0.s }[2], [x11]\n" "b 83f\n" "82:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 83f\n" - "ld1 { v31.s }[0], [x8]\n" - "ld1 { v30.s }[0], [x16]\n" + "tbz x8, #0, 83f\n" + "ld1 { v19.s }[0], [x13]\n" + "ld1 { v0.s }[0], [x11]\n" "83:" // Oddments: Load requant params: Bit 2: End - "sqrdmulh v13.4s, v13.4s, v31.4s\n" - "add x15, x15, x6\n" - "sqrdmulh v10.4s, v10.4s, v23.4s\n" - "add x14, x14, x6\n" - "sqrdmulh v19.4s, v19.4s, v31.4s\n" - "add x13, x13, x6\n" - "sqrdmulh v20.4s, v20.4s, v23.4s\n" - "add x12, x12, x6\n" - "sqrdmulh v11.4s, v11.4s, v31.4s\n" - "and v27.16b, v13.16b, v30.16b\n" - "and v7.16b, v10.16b, v9.16b\n" - "and v6.16b, v19.16b, v30.16b\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "sshr v7.4s, v7.4s, #0x1f\n" - "sshr v6.4s, v6.4s, #0x1f\n" - "sqadd v13.4s, v13.4s, v27.4s\n" - "sqadd v10.4s, v10.4s, v7.4s\n" - "sqadd v19.4s, v19.4s, v6.4s\n" - "and v3.16b, v20.16b, v9.16b\n" - "srshl v13.4s, v13.4s, v30.4s\n" - "srshl v10.4s, v10.4s, v9.4s\n" - "srshl v19.4s, v19.4s, v30.4s\n" - "sshr v3.4s, v3.4s, #0x1f\n" - "add v13.4s, v13.4s, v14.4s\n" - "add v10.4s, v10.4s, v14.4s\n" - "add v19.4s, v19.4s, v14.4s\n" - "smin v13.4s, v13.4s, v15.4s\n" - "smin v10.4s, v10.4s, v15.4s\n" - "smin v19.4s, v19.4s, v15.4s\n" - "smax v13.4s, v13.4s, v16.4s\n" - "smax v10.4s, v10.4s, v16.4s\n" - "smax v19.4s, v19.4s, v16.4s\n" - "sqadd v20.4s, v20.4s, v3.4s\n" - "uzp1 v13.16b, v13.16b, v10.16b\n" - "and v28.16b, v11.16b, v30.16b\n" - "uzp1 v13.16b, v13.16b, v13.16b\n" - "srshl v20.4s, v20.4s, v9.4s\n" + "sqrdmulh v15.4s, v15.4s, v19.4s\n" + "sqrdmulh v9.4s, v9.4s, v19.4s\n" + "add x10, x10, x14\n" + "add x9, x9, x14\n" + "sqrdmulh v22.4s, v22.4s, v19.4s\n" + "sqrdmulh v23.4s, v23.4s, v19.4s\n" + "add x28, x28, x14\n" + "add x27, x27, x14\n" + "and v30.16b, v15.16b, v0.16b\n" + "sqrdmulh v10.4s, v10.4s, v4.4s\n" + "and v28.16b, v9.16b, v0.16b\n" + "sqrdmulh v16.4s, v16.4s, v4.4s\n" + "and v29.16b, v22.16b, v0.16b\n" + "sqrdmulh v21.4s, v21.4s, v4.4s\n" + "and v20.16b, v23.16b, v0.16b\n" + "sqrdmulh v18.4s, v18.4s, v4.4s\n" + "sshr v30.4s, v30.4s, #0x1f\n" + "and v19.16b, v10.16b, v31.16b\n" "sshr v28.4s, v28.4s, #0x1f\n" - "sqrdmulh v17.4s, v17.4s, v23.4s\n" - "sqrdmulh v18.4s, v18.4s, v31.4s\n" - "add v20.4s, v20.4s, v14.4s\n" - "sqadd v11.4s, v11.4s, v28.4s\n" - "and v26.16b, v17.16b, v9.16b\n" - "smin v20.4s, v20.4s, v15.4s\n" - "and v8.16b, v18.16b, v30.16b\n" - "srshl v11.4s, v11.4s, v30.4s\n" - "smax v20.4s, v20.4s, v16.4s\n" + "and v4.16b, v16.16b, v31.16b\n" + "sshr v29.4s, v29.4s, #0x1f\n" + "and v5.16b, v21.16b, v31.16b\n" + "sshr v20.4s, v20.4s, #0x1f\n" + "and v26.16b, v18.16b, v31.16b\n" + "sqadd v15.4s, v15.4s, v30.4s\n" + "sshr v19.4s, v19.4s, #0x1f\n" + "sqadd v9.4s, v9.4s, v28.4s\n" + "sshr v4.4s, v4.4s, #0x1f\n" + "sqadd v22.4s, v22.4s, v29.4s\n" + "sshr v5.4s, v5.4s, #0x1f\n" + "sqadd v23.4s, v23.4s, v20.4s\n" "sshr v26.4s, v26.4s, #0x1f\n" - "sshr v8.4s, v8.4s, #0x1f\n" - "uzp1
v19.16b, v19.16b, v20.16b\n" - "add v11.4s, v11.4s, v14.4s\n" - "uzp1 v19.16b, v19.16b, v19.16b\n" - "sqadd v17.4s, v17.4s, v26.4s\n" - "smin v11.4s, v11.4s, v15.4s\n" - "sqadd v18.4s, v18.4s, v8.4s\n" - "sqrdmulh v21.4s, v21.4s, v23.4s\n" - "smax v11.4s, v11.4s, v16.4s\n" - "srshl v17.4s, v17.4s, v9.4s\n" - "srshl v18.4s, v18.4s, v30.4s\n" - "and v27.16b, v21.16b, v9.16b\n" - "add v17.4s, v17.4s, v14.4s\n" - "add v18.4s, v18.4s, v14.4s\n" - "sshr v27.4s, v27.4s, #0x1f\n" - "smin v17.4s, v17.4s, v15.4s\n" - "smin v18.4s, v18.4s, v15.4s\n" - "sqadd v21.4s, v21.4s, v27.4s\n" - "smax v17.4s, v17.4s, v16.4s\n" - "smax v18.4s, v18.4s, v16.4s\n" - "srshl v21.4s, v21.4s, v9.4s\n" - "uzp1 v11.16b, v11.16b, v17.16b\n" - "uzp1 v11.16b, v11.16b, v11.16b\n" - "add v21.4s, v21.4s, v14.4s\n" - "smin v21.4s, v21.4s, v15.4s\n" - "smax v21.4s, v21.4s, v16.4s\n" - "uzp1 v18.16b, v18.16b, v21.16b\n" - "uzp1 v18.16b, v18.16b, v18.16b\n" - "tbz x3, #2, 85f\n" - "st1 { v13.s }[0], [x15], #0x4\n" - "st1 { v19.s }[0], [x14], #0x4\n" - "st1 { v11.s }[0], [x13], #0x4\n" - "st1 { v18.s }[0], [x12], #0x4\n" - "tbz x3, #1, 84f\n" - "st1 { v13.h }[2], [x15], #0x2\n" - "st1 { v19.h }[2], [x14], #0x2\n" - "st1 { v11.h }[2], [x13], #0x2\n" - "st1 { v18.h }[2], [x12], #0x2\n" - "tbz x3, #0, 87f\n" - "st1 { v13.b }[6], [x15], #0x1\n" - "st1 { v19.b }[6], [x14], #0x1\n" - "st1 { v11.b }[6], [x13], #0x1\n" - "st1 { v18.b }[6], [x12], #0x1\n" + "srshl v15.4s, v15.4s, v0.4s\n" + "sqadd v10.4s, v10.4s, v19.4s\n" + "srshl v9.4s, v9.4s, v0.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v22.4s, v22.4s, v0.4s\n" + "sqadd v21.4s, v21.4s, v5.4s\n" + "srshl v23.4s, v23.4s, v0.4s\n" + "sqadd v18.4s, v18.4s, v26.4s\n" + "srshl v10.4s, v10.4s, v31.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v16.4s, v16.4s, v31.4s\n" + "sqxtn v9.4h, v9.4s\n" + "srshl v21.4s, v21.4s, v31.4s\n" + "sqxtn v22.4h, v22.4s\n" + "srshl v18.4s, v18.4s, v31.4s\n" + "sqxtn v23.4h, v23.4s\n" + "sqxtn2 v15.8h, v10.4s\n" + "sqxtn2 v9.8h, v16.4s\n" + "sqxtn2 v22.8h, v21.4s\n" + "sqxtn2 v23.8h, v18.4s\n" + "sqadd v15.8h, v15.8h, v11.8h\n" + "sqadd v9.8h, v9.8h, v11.8h\n" + "sqadd v22.8h, v22.8h, v11.8h\n" + "sqadd v23.8h, v23.8h, v11.8h\n" + "smax v15.8h, v15.8h, v17.8h\n" + "smax v9.8h, v9.8h, v17.8h\n" + "smax v22.8h, v22.8h, v17.8h\n" + "smax v23.8h, v23.8h, v17.8h\n" + "smin v15.8h, v15.8h, v14.8h\n" + "smin v9.8h, v9.8h, v14.8h\n" + "smin v22.8h, v22.8h, v14.8h\n" + "smin v23.8h, v23.8h, v14.8h\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "uzp1 v9.16b, v9.16b, v9.16b\n" + "uzp1 v22.16b, v22.16b, v22.16b\n" + "uzp1 v23.16b, v23.16b, v23.16b\n" + "tbz x8, #2, 85f\n" + "st1 { v15.s }[0], [x10], #0x4\n" + "st1 { v9.s }[0], [x9], #0x4\n" + "st1 { v22.s }[0], [x28], #0x4\n" + "st1 { v23.s }[0], [x27], #0x4\n" + "tbz x8, #1, 84f\n" + "st1 { v15.h }[2], [x10], #0x2\n" + "st1 { v9.h }[2], [x9], #0x2\n" + "st1 { v22.h }[2], [x28], #0x2\n" + "st1 { v23.h }[2], [x27], #0x2\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[6], [x10], #0x1\n" + "st1 { v9.b }[6], [x9], #0x1\n" + "st1 { v22.b }[6], [x28], #0x1\n" + "st1 { v23.b }[6], [x27], #0x1\n" "b 87f\n" "84:" // Oddments: Bit 2: Bit 1: Unset - "tbz x3, #0, 87f\n" - "st1 { v13.b }[4], [x15], #0x1\n" - "st1 { v19.b }[4], [x14], #0x1\n" - "st1 { v11.b }[4], [x13], #0x1\n" - "st1 { v18.b }[4], [x12], #0x1\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[4], [x10], #0x1\n" + "st1 { v9.b }[4], [x9], #0x1\n" + "st1 { v22.b }[4], [x28], #0x1\n" + "st1 { v23.b }[4], [x27], #0x1\n" "b 87f\n" "85:" // Oddments: Bit 2: Unset - "tbz x3, #1, 86f\n" - "st1 { 
v13.h }[0], [x15], #0x2\n" - "st1 { v19.h }[0], [x14], #0x2\n" - "st1 { v11.h }[0], [x13], #0x2\n" - "st1 { v18.h }[0], [x12], #0x2\n" - "tbz x3, #0, 87f\n" - "st1 { v13.b }[2], [x15], #0x1\n" - "st1 { v19.b }[2], [x14], #0x1\n" - "st1 { v11.b }[2], [x13], #0x1\n" - "st1 { v18.b }[2], [x12], #0x1\n" + "tbz x8, #1, 86f\n" + "st1 { v15.h }[0], [x10], #0x2\n" + "st1 { v9.h }[0], [x9], #0x2\n" + "st1 { v22.h }[0], [x28], #0x2\n" + "st1 { v23.h }[0], [x27], #0x2\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[2], [x10], #0x1\n" + "st1 { v9.b }[2], [x9], #0x1\n" + "st1 { v22.b }[2], [x28], #0x1\n" + "st1 { v23.b }[2], [x27], #0x1\n" "b 87f\n" "86:" // Oddments: Bit 2: Unset: Bit 1: Unset - "tbz x3, #0, 87f\n" - "st1 { v13.b }[0], [x15], #0x1\n" - "st1 { v19.b }[0], [x14], #0x1\n" - "st1 { v11.b }[0], [x13], #0x1\n" - "st1 { v18.b }[0], [x12], #0x1\n" + "tbz x8, #0, 87f\n" + "st1 { v15.b }[0], [x10], #0x1\n" + "st1 { v9.b }[0], [x9], #0x1\n" + "st1 { v22.b }[0], [x28], #0x1\n" + "st1 { v23.b }[0], [x27], #0x1\n" "87:" // Oddments: Bit 2: End - "88:" // End - : : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28" ); } diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp index d3d5000d4c..ea70b56349 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -36,37 +36,24 @@ namespace depthwise { void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); -struct a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst +class a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t> { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef int8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None; - - typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy<uint8_t, int8_t, uint8_t, int32_t>; + public: constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 6; - constexpr static unsigned int input_cols = 6; - - constexpr static parameter_packing_fn pack_parameters = interleave_a64_s8q_5x5_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_a64_s8q_5x5_mla::get_packed_size; + a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {} - kern_type kernel = a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::None; } - a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp index 97156137bf..291ffec0bb 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -46,7 +46,7 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( struct Params { long unsigned int n_channels; - const int8_t *weights; + const void *weights; const int32_t *bias; const arm_gemm::Requantize32 *requant; const int32_t *const requant_muls; @@ -57,7 +57,7 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( Params( long unsigned int n_channels, const uint8_t *const *inptrs_raw, - const int8_t *const weights, + const void *const weights, const int32_t *const bias, const arm_gemm::Requantize32 &qp, const int32_t *const requant_muls, @@ -111,2096 +111,2070 @@ void a64_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( requant_muls, requant_shifts, outptrs); __asm__ __volatile__( - "ldr x4, [%x[params], %[offsetof_Params_n_channels]]\n" - "mov x10, #0x0\n" - "ldr x3, [%x[params], %[offsetof_Params_weights]]\n" - "mov x1, #0x0\n" - "ldr x22, [%x[params], %[offsetof_Params_requant]]\n" - "add x25, %x[params], %[offsetof_Params_inptrs]\n" - "ldr x2, [%x[params], %[offsetof_Params_requant_muls]]\n" - "lsr x19, x4, #0x3\n" - "ldr x5, [%x[params], %[offsetof_Params_requant_shifts]]\n" - "add x13, x22, %[offsetof_Requantize32_a_offset]\n" - "ldr x21, [%x[params], %[offsetof_Params_outptrs]]\n" - "add x20, x22, %[offsetof_Requantize32_b_offset]\n" - "ld1r { v9.16b }, [x13]\n" - "add x8, x22, %[offsetof_Requantize32_c_offset]\n" - "ld1r { v14.16b }, [x20]\n" - "add x20, x22, %[offsetof_Requantize32_minval]\n" - "ld1r { v10.4s }, [x8]\n" - "add x8, x22, %[offsetof_Requantize32_maxval]\n" - "ld1r { v11.4s }, [x20]\n" - "ld1r { v13.4s }, [x8]\n" - "ldp x17, x16, [x21, #0x0]\n" - "ldp x6, x8, [x21, #0x10]\n" - "cbz x19, 3f\n" - "subs x19, x19, #0x1\n" - "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" - "ldr q15, [x12, #0x0]\n" - "mov v16.16b, v15.16b\n" - "ldr q18, [x12, #0x10]\n" - "add x12, x12, #0x20\n" - "mov v7.16b, v15.16b\n" - "str x12, [%x[params], %[offsetof_Params_bias]]\n" - "mov v8.16b, v15.16b\n" - "ldr d0, [x3, #0x0]\n" - "ldr d1, [x3, #0x8]\n" - "mov v21.16b, v18.16b\n" - "ldr d2, [x3, #0x10]\n" - "mov v17.16b, v18.16b\n" - "ldr d3, [x3, #0x18]\n" - "mov v5.16b, v18.16b\n" - "ldr d4, [x3, #0x20]\n" + "ldr x10, [%x[params], %[offsetof_Params_requant]]\n" + "ldr x0, [%x[params], %[offsetof_Params_n_channels]]\n" + "add x17, x10, %[offsetof_Requantize32_a_offset]\n" + "add x9, x10, %[offsetof_Requantize32_b_offset]\n" + "ldr x25, [%x[params], %[offsetof_Params_outptrs]]\n" + "add x4, x10, %[offsetof_Requantize32_c_offset]\n" + "add x14, x10, %[offsetof_Requantize32_minval]\n" + "ldr x23, [%x[params], %[offsetof_Params_weights]]\n" + "add x5, x10, %[offsetof_Requantize32_maxval]\n" + "ld1r { v9.16b }, [x17]\n" + "ld1r { v14.16b }, [x9]\n" + "lsr x3, x0, #0x3\n" + "ld1r { v18.8h }, [x4]\n" + "ld1r { v11.8h }, [x14]\n" + "mov x24, #0x0\n" + "mov x22, #0x0\n" + "ld1r { v13.8h }, [x5]\n" + "ldr x10, [%x[params], %[offsetof_Params_requant_muls]]\n" + "add x20, %x[params], %[offsetof_Params_inptrs]\n" + "ldr x1, [%x[params], %[offsetof_Params_requant_shifts]]\n" + "ldp x16, x8, [x25, #0x0]\n" + "ldp x4, x7, [x25, #0x10]\n" + "cbz x3, 3f\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x19, #0x0]\n" + "subs x3, x3, #0x1\n" + "mov v17.16b, v15.16b\n" + "ldr q16, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr d0, [x23, #0x0]\n" + "ldr d1, [x23, #0x8]\n" + "ldr d2, [x23, #0x10]\n" + "mov v8.16b, v16.16b\n" + "mov v10.16b, v15.16b\n" + "ldr d3, [x23, #0x18]\n" 
+ "ldr d4, [x23, #0x20]\n" + "mov v7.16b, v16.16b\n" + "mov v6.16b, v15.16b\n" + "ldp x28, x6, [x20, #0x0]\n" + "ldp x26, x25, [x20, #0x10]\n" + "mov v5.16b, v16.16b\n" "ssubl v0.8h, v0.8b, v14.8b\n" - "ldp x28, x27, [x25, #0x0]\n" + "ldp x5, x2, [x20, #0x20]\n" + "ldp x27, x21, [x20, #0x30]\n" "ssubl v1.8h, v1.8b, v14.8b\n" - "ldp x26, x13, [x25, #0x10]\n" "ssubl v2.8h, v2.8b, v14.8b\n" + "ldp x12, x19, [x20, #0x40]\n" + "ldr d31, [x28, x24]\n" "ssubl v3.8h, v3.8b, v14.8b\n" - "ldp x24, x23, [x25, #0x20]\n" "ssubl v4.8h, v4.8b, v14.8b\n" - "ldp x22, x21, [x25, #0x30]\n" - "ldp x20, x0, [x25, #0x40]\n" - "ldr d31, [x28, x10]\n" + "ldr d30, [x6, x24]\n" + "ldr d29, [x26, x24]\n" "usubl v31.8h, v31.8b, v9.8b\n" - "ldr d30, [x27, x10]\n" - "ldr d29, [x26, x10]\n" "usubl v30.8h, v30.8b, v9.8b\n" - "ldr d28, [x13, x10]\n" + "ldr d28, [x25, x24]\n" + "ldr d27, [x5, x24]\n" "usubl v29.8h, v29.8b, v9.8b\n" - "ldr d27, [x24, x10]\n" - "ldr d23, [x23, x10]\n" "usubl v28.8h, v28.8b, v9.8b\n" - "ldr d25, [x22, x10]\n" - "ldr d24, [x21, x10]\n" + "ldr d23, [x2, x24]\n" + "ldr d25, [x27, x24]\n" "usubl v27.8h, v27.8b, v9.8b\n" "usubl v23.8h, v23.8b, v9.8b\n" - "ldr d26, [x20, x10]\n" - "ldr d22, [x0, x10]\n" + "ldr d24, [x21, x24]\n" + "ldr d26, [x12, x24]\n" "usubl v25.8h, v25.8b, v9.8b\n" "usubl v24.8h, v24.8b, v9.8b\n" + "ldr d22, [x19, x24]\n" "usubl v26.8h, v26.8b, v9.8b\n" "usubl v22.8h, v22.8b, v9.8b\n" "beq 2f\n" "1:" // Loop "smlal v15.4s, v31.4h, v0.4h\n" - "ldr x20, [x25, #0x50]\n" - "subs x19, x19, #0x1\n" - "smlal2 v18.4s, v31.8h, v0.8h\n" - "ldr x28, [x25, #0x58]\n" - "smlal v16.4s, v30.4h, v0.4h\n" - "ldr x0, [x25, #0x60]\n" - "smlal2 v21.4s, v30.8h, v0.8h\n" - "ldr d31, [x20, x10]\n" - "smlal v7.4s, v29.4h, v0.4h\n" - "ldr x7, [x25, #0x68]\n" - "smlal2 v17.4s, v29.8h, v0.8h\n" - "ldr x26, [x25, #0x70]\n" - "smlal v8.4s, v28.4h, v0.4h\n" - "ldr x23, [x25, #0x78]\n" - "smlal2 v5.4s, v28.8h, v0.8h\n" - "ldr d0, [x3, #0x28]\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "ldr x19, [x20, #0x50]\n" + "ldr d31, [x19, x24]\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal v10.4s, v29.4h, v0.4h\n" + "ldr x15, [x20, #0x58]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "smlal v6.4s, v28.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "ldr x19, [x20, #0x60]\n" + "ldr x27, [x20, #0x68]\n" + "smlal2 v7.4s, v29.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "ldr x20, [x25, #0x80]\n" - "smlal2 v18.4s, v30.8h, v1.8h\n" - "ldr d30, [x28, x10]\n" - "smlal v16.4s, v27.4h, v1.4h\n" - "ldr x22, [x25, #0x88]\n" - "smlal2 v21.4s, v27.8h, v1.8h\n" - "ldr x13, [x25, #0x90]\n" - "smlal v7.4s, v28.4h, v1.4h\n" - "ldr x21, [x25, #0x98]\n" - "smlal2 v17.4s, v28.8h, v1.8h\n" - "ldr x14, [x25, #0xa0]\n" - "smlal v8.4s, v23.4h, v1.4h\n" - "ldr x11, [x25, #0xa8]\n" - "smlal2 v5.4s, v23.8h, v1.8h\n" - "ldr d1, [x3, #0x30]\n" + "ldr x5, [x20, #0x70]\n" + "ldr x11, [x20, #0x78]\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "ldr d30, [x15, x24]\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v1.4h\n" + "smlal v10.4s, v28.4h, v1.4h\n" + "ldr d0, [x23, #0x28]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v27.8h, v1.8h\n" + "ldr x12, [x20, #0x80]\n" + "ldr x26, [x20, #0x88]\n" + "smlal2 v7.4s, v28.8h, v1.8h\n" "smlal v15.4s, v27.4h, v2.4h\n" - "ldr x24, [x25, #0xb0]\n" - "smlal2 v18.4s, v27.8h, v2.8h\n" - "ldr d27, [x0, x10]\n" - "smlal v16.4s, v25.4h, v2.4h\n" - "ldr x0, [x25, #0xb8]\n" - "smlal2 v21.4s, v25.8h, v2.8h\n" - "ldr x15, [x25, #0xc0]\n" - "smlal v7.4s, 
v23.4h, v2.4h\n" - "ldr x9, [x25, #0xc8]\n" - "smlal2 v17.4s, v23.8h, v2.8h\n" - "ldr x27, [x25, #0xd0]\n" - "usubl v31.8h, v31.8b, v9.8b\n" - "ldr x28, [x25, #0xd8]\n" + "ldr x14, [x20, #0x90]\n" + "ldr x15, [x20, #0x98]\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "ldr d27, [x19, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal v10.4s, v23.4h, v2.4h\n" + "ldr d1, [x23, #0x30]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "ldr x21, [x20, #0xa0]\n" + "ldr x2, [x20, #0xa8]\n" + "smlal2 v7.4s, v23.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" - "ldr q6, [x2, #0x0]\n" - "smlal2 v18.4s, v25.8h, v3.8h\n" - "ldr d25, [x7, x10]\n" - "smlal v8.4s, v31.4h, v2.4h\n" - "ldr x12, [x25, #0xe0]\n" + "ldr x13, [x20, #0xb0]\n" + "ldr x9, [x20, #0xb8]\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" "smlal2 v5.4s, v31.8h, v2.8h\n" - "ldr d2, [x3, #0x38]\n" - "smlal v16.4s, v24.4h, v3.4h\n" - "ldr q19, [x5, #0x0]\n" - "smlal2 v21.4s, v24.8h, v3.8h\n" - "ldr q20, [x2, #0x10]\n" - "add x2, x2, #0x20\n" - "smlal v7.4s, v31.4h, v3.4h\n" - "ldr q12, [x5, #0x10]\n" - "add x5, x5, #0x20\n" - "smlal2 v17.4s, v31.8h, v3.8h\n" - "usubl v30.8h, v30.8b, v9.8b\n" + "ldr d25, [x27, x24]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v10.4s, v31.4h, v3.4h\n" + "ldr d2, [x23, #0x38]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "ldr x19, [x20, #0xc0]\n" + "ldr x28, [x20, #0xc8]\n" + "smlal2 v7.4s, v31.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v18.4s, v24.8h, v4.8h\n" - "ldr d24, [x26, x10]\n" - "smlal v8.4s, v30.4h, v3.4h\n" - "ldr x7, [x25, #0xe8]\n" + "ldr x6, [x20, #0xd0]\n" + "ldr x27, [x20, #0xd8]\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" "smlal2 v5.4s, v30.8h, v3.8h\n" - "ldr d3, [x3, #0x40]\n" + "ldr d24, [x5, x24]\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal v10.4s, v30.4h, v4.4h\n" + "ldr d3, [x23, #0x40]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v27.8h, v4.8h\n" + "ldr d27, [x11, x24]\n" "usubl v27.8h, v27.8b, v9.8b\n" - "smlal v7.4s, v30.4h, v4.4h\n" - "smlal2 v17.4s, v30.8h, v4.8h\n" - "smlal v16.4s, v27.4h, v4.4h\n" - "smlal2 v21.4s, v27.8h, v4.8h\n" - "ldr d27, [x23, x10]\n" - "smlal v8.4s, v26.4h, v4.4h\n" - "ldr x26, [x25, #0xf0]\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "ldr d4, [x3, #0x48]\n" - "ssubl v0.8h, v0.8b, v14.8b\n" - "usubl v25.8h, v25.8b, v9.8b\n" - "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal2 v7.4s, v30.8h, v4.8h\n" "smlal v15.4s, v29.4h, v0.4h\n" - "smlal2 v18.4s, v29.8h, v0.8h\n" - "smlal v16.4s, v28.4h, v0.4h\n" - "smlal2 v21.4s, v28.8h, v0.8h\n" - "smlal v7.4s, v22.4h, v0.4h\n" - "smlal2 v17.4s, v22.8h, v0.8h\n" - "smlal v8.4s, v25.4h, v0.4h\n" - "smlal2 v5.4s, v25.8h, v0.8h\n" - "ldr d0, [x3, #0x50]\n" + "ldr x11, [x20, #0xe0]\n" + "ldr x17, [x20, #0xe8]\n" + "smlal2 v16.4s, v29.8h, v0.8h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x23, #0x48]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal v10.4s, v22.4h, v0.4h\n" + "ldr x5, [x20, #0xf0]\n" + "ldr q12, [x10, #0x0]\n" + "smlal v6.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v28.8h, v0.8h\n" + "ldr q19, [x1, #0x0]\n" + "ldr q20, [x10, #0x10]\n" + "smlal2 v7.4s, v22.8h, v0.8h\n" "smlal v15.4s, v28.4h, v1.4h\n" - "smlal2 v18.4s, v28.8h, v1.8h\n" - "ldr d28, [x22, x10]\n" - "smlal v16.4s, v23.4h, v1.4h\n" - "ldr x23, [x25, #0xf8]\n" - 
"smlal2 v21.4s, v23.8h, v1.8h\n" - "smlal v7.4s, v25.4h, v1.4h\n" - "smlal2 v17.4s, v25.8h, v1.8h\n" - "usubl v24.8h, v24.8b, v9.8b\n" - "ssubl v2.8h, v2.8b, v14.8b\n" - "usubl v27.8h, v27.8b, v9.8b\n" - "smlal v8.4s, v24.4h, v1.4h\n" - "smlal2 v5.4s, v24.8h, v1.8h\n" - "ldr d1, [x3, #0x58]\n" + "ldr q29, [x1, #0x10]\n" + "subs x3, x3, #0x1\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "ldr d28, [x26, x24]\n" + "ldr d0, [x23, #0x50]\n" + "smlal v17.4s, v23.4h, v1.4h\n" + "smlal v10.4s, v25.4h, v1.4h\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "ldr x25, [x20, #0xf8]\n" + "smlal v6.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "add x10, x10, #0x20\n" + "smlal2 v7.4s, v25.8h, v1.8h\n" "smlal v15.4s, v23.4h, v2.4h\n" - "smlal2 v18.4s, v23.8h, v2.8h\n" - "ldr d23, [x20, x10]\n" - "smlal v16.4s, v31.4h, v2.4h\n" - "ldr x22, [x25, #0x100]\n" - "smlal2 v21.4s, v31.8h, v2.8h\n" - "smlal v7.4s, v24.4h, v2.4h\n" - "smlal2 v17.4s, v24.8h, v2.8h\n" - "smlal v8.4s, v27.4h, v2.4h\n" - "smlal2 v5.4s, v27.8h, v2.8h\n" - "ldr d2, [x3, #0x60]\n" - "ssubl v3.8h, v3.8b, v14.8b\n" + "add x1, x1, #0x20\n" + "smlal2 v16.4s, v23.8h, v2.8h\n" + "ldr d23, [x12, x24]\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" "usubl v23.8h, v23.8b, v9.8b\n" - "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v17.4s, v31.4h, v2.4h\n" + "smlal v10.4s, v24.4h, v2.4h\n" + "ldr d1, [x23, #0x58]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "ldr x26, [x20, #0x100]\n" + "smlal2 v7.4s, v24.8h, v2.8h\n" "smlal v15.4s, v31.4h, v3.4h\n" - "smlal2 v18.4s, v31.8h, v3.8h\n" - "ldr d31, [x13, x10]\n" - "smlal v16.4s, v30.4h, v3.4h\n" - "ldr x20, [x25, #0x108]\n" - "smlal2 v21.4s, v30.8h, v3.8h\n" - "smlal v7.4s, v27.4h, v3.4h\n" - "smlal2 v17.4s, v27.8h, v3.8h\n" - "smlal v8.4s, v23.4h, v3.4h\n" - "smlal2 v5.4s, v23.8h, v3.8h\n" - "ldr d3, [x3, #0x68]\n" - "smlal v15.4s, v30.4h, v4.4h\n" - "smlal2 v18.4s, v30.8h, v4.8h\n" - "ldr d30, [x21, x10]\n" - "smlal v16.4s, v26.4h, v4.4h\n" - "ldr x13, [x25, #0x110]\n" - "smlal2 v21.4s, v26.8h, v4.8h\n" - "ldr d26, [x14, x10]\n" - "smlal v7.4s, v23.4h, v4.4h\n" - "ldr x21, [x25, #0x118]\n" - "smlal2 v17.4s, v23.8h, v4.8h\n" - "usubl v28.8h, v28.8b, v9.8b\n" - "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal2 v16.4s, v31.8h, v3.8h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "ldr d31, [x14, x24]\n" "usubl v31.8h, v31.8b, v9.8b\n" - "smlal v8.4s, v28.4h, v4.4h\n" - "smlal2 v5.4s, v28.8h, v4.8h\n" - "ldr d4, [x3, #0x70]\n" - "smlal v15.4s, v22.4h, v0.4h\n" - "smlal2 v18.4s, v22.8h, v0.8h\n" - "ldr d22, [x0, x10]\n" - "smlal v16.4s, v25.4h, v0.4h\n" - "smlal2 v21.4s, v25.8h, v0.8h\n" - "smlal v7.4s, v31.4h, v0.4h\n" - "smlal2 v17.4s, v31.8h, v0.8h\n" + "smlal v17.4s, v30.4h, v3.4h\n" + "smlal v10.4s, v27.4h, v3.4h\n" + "ldr d2, [x23, #0x60]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "ldr x12, [x20, #0x108]\n" + "smlal2 v7.4s, v27.8h, v3.8h\n" + "smlal v15.4s, v30.4h, v4.4h\n" + "smlal2 v16.4s, v30.8h, v4.8h\n" + "ldr d30, [x15, x24]\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" "usubl v30.8h, v30.8b, v9.8b\n" - "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "smlal v10.4s, v23.4h, v4.4h\n" + "ldr d3, [x23, #0x68]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "ldr d26, [x21, x24]\n" "usubl v26.8h, v26.8b, v9.8b\n" - "smlal v8.4s, v30.4h, v0.4h\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "ldr d0, 
[x3, #0x78]\n" + "smlal2 v7.4s, v23.8h, v4.8h\n" + "smlal v15.4s, v22.4h, v0.4h\n" + "ldr x14, [x20, #0x110]\n" + "ldr x21, [x20, #0x118]\n" + "smlal2 v16.4s, v22.8h, v0.8h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ldr d4, [x23, #0x70]\n" + "ldr d22, [x9, x24]\n" + "smlal v17.4s, v25.4h, v0.4h\n" + "smlal v10.4s, v31.4h, v0.4h\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "usubl v22.8h, v22.8b, v9.8b\n" + "smlal2 v7.4s, v31.8h, v0.8h\n" "smlal v15.4s, v25.4h, v1.4h\n" - "smlal2 v18.4s, v25.8h, v1.8h\n" - "ldr d25, [x11, x10]\n" - "smlal v16.4s, v24.4h, v1.4h\n" - "smlal2 v21.4s, v24.8h, v1.8h\n" - "smlal v7.4s, v30.4h, v1.4h\n" - "smlal2 v17.4s, v30.8h, v1.8h\n" - "smlal v8.4s, v26.4h, v1.4h\n" - "smlal2 v5.4s, v26.8h, v1.8h\n" - "ldr d1, [x3, #0x80]\n" - "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal2 v16.4s, v25.8h, v1.8h\n" + "ldr d25, [x2, x24]\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" "usubl v25.8h, v25.8b, v9.8b\n" - "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v17.4s, v24.4h, v1.4h\n" + "smlal v10.4s, v30.4h, v1.4h\n" + "ldr d0, [x23, #0x78]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "smlal2 v7.4s, v30.8h, v1.8h\n" "smlal v15.4s, v24.4h, v2.4h\n" - "smlal2 v18.4s, v24.8h, v2.8h\n" - "ldr d24, [x24, x10]\n" - "smlal v16.4s, v27.4h, v2.4h\n" - "smlal2 v21.4s, v27.8h, v2.8h\n" - "smlal v7.4s, v26.4h, v2.4h\n" - "smlal2 v17.4s, v26.8h, v2.8h\n" - "smlal v8.4s, v25.4h, v2.4h\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "ldr d2, [x3, #0x88]\n" - "smlal v15.4s, v27.4h, v3.4h\n" - "smlal2 v18.4s, v27.8h, v3.8h\n" - "ldr d27, [x15, x10]\n" - "smlal v16.4s, v23.4h, v3.4h\n" - "smlal2 v21.4s, v23.8h, v3.8h\n" - "smlal v7.4s, v25.4h, v3.4h\n" - "smlal2 v17.4s, v25.8h, v3.8h\n" + "smlal2 v16.4s, v24.8h, v2.8h\n" + "ldr d24, [x13, x24]\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" "usubl v24.8h, v24.8b, v9.8b\n" - "ssubl v4.8h, v4.8b, v14.8b\n" - "usubl v22.8h, v22.8b, v9.8b\n" - "smlal v8.4s, v24.4h, v3.4h\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "ldr d3, [x3, #0x90]\n" - "smlal v15.4s, v23.4h, v4.4h\n" - "smlal2 v18.4s, v23.8h, v4.8h\n" - "ldr d23, [x9, x10]\n" - "smlal v16.4s, v28.4h, v4.4h\n" - "smlal2 v21.4s, v28.8h, v4.8h\n" - "ldr d28, [x12, x10]\n" - "smlal v7.4s, v24.4h, v4.4h\n" - "smlal2 v17.4s, v24.8h, v4.8h\n" - "smlal v8.4s, v22.4h, v4.4h\n" - "smlal2 v5.4s, v22.8h, v4.8h\n" - "ldr d4, [x3, #0x98]\n" - "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v17.4s, v27.4h, v2.4h\n" + "smlal v10.4s, v26.4h, v2.4h\n" + "ldr d1, [x23, #0x80]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "smlal2 v7.4s, v26.8h, v2.8h\n" + "smlal v15.4s, v27.4h, v3.4h\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ldr d27, [x19, x24]\n" "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v17.4s, v23.4h, v3.4h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "ldr d2, [x23, #0x88]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v8.4s, v23.8h, v3.8h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" + "smlal v15.4s, v23.4h, v4.4h\n" + "smlal2 v16.4s, v23.8h, v4.8h\n" + "ldr d23, [x28, x24]\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" "usubl v23.8h, v23.8b, v9.8b\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "ldr d3, [x23, #0x90]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v22.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "ldr d28, [x11, x24]\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" "smlal v15.4s, 
v31.4h, v0.4h\n" - "smlal2 v18.4s, v31.8h, v0.8h\n" - "ldr d31, [x27, x10]\n" - "smlal v16.4s, v30.4h, v0.4h\n" - "smlal2 v21.4s, v30.8h, v0.8h\n" - "smlal v7.4s, v27.4h, v0.4h\n" - "smlal2 v17.4s, v27.8h, v0.8h\n" - "smlal v8.4s, v23.4h, v0.4h\n" - "smlal2 v5.4s, v23.8h, v0.8h\n" - "ldr d0, [x3, #0xa0]\n" - "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "ldr d31, [x6, x24]\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" "usubl v31.8h, v31.8b, v9.8b\n" - "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal v10.4s, v27.4h, v0.4h\n" + "ldr d4, [x23, #0x98]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "smlal2 v7.4s, v27.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "smlal2 v18.4s, v30.8h, v1.8h\n" - "ldr d30, [x28, x10]\n" - "smlal v16.4s, v26.4h, v1.4h\n" - "smlal2 v21.4s, v26.8h, v1.8h\n" - "smlal v7.4s, v23.4h, v1.4h\n" - "smlal2 v17.4s, v23.8h, v1.8h\n" - "smlal v8.4s, v31.4h, v1.4h\n" - "smlal2 v5.4s, v31.8h, v1.8h\n" - "ldr d1, [x3, #0xa8]\n" - "smlal v15.4s, v26.4h, v2.4h\n" - "smlal2 v18.4s, v26.8h, v2.8h\n" - "ldr d26, [x7, x10]\n" - "smlal v16.4s, v25.4h, v2.4h\n" - "smlal2 v21.4s, v25.8h, v2.8h\n" - "smlal v7.4s, v31.4h, v2.4h\n" - "smlal2 v17.4s, v31.8h, v2.8h\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "ldr d30, [x27, x24]\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" "usubl v30.8h, v30.8b, v9.8b\n" - "ssubl v3.8h, v3.8b, v14.8b\n" - "usubl v28.8h, v28.8b, v9.8b\n" - "smlal v8.4s, v30.4h, v2.4h\n" - "smlal2 v5.4s, v30.8h, v2.8h\n" - "ldr d2, [x3, #0xb0]\n" - "smlal v15.4s, v25.4h, v3.4h\n" - "smlal2 v18.4s, v25.8h, v3.8h\n" - "ldr d25, [x26, x10]\n" - "smlal v16.4s, v24.4h, v3.4h\n" - "smlal2 v21.4s, v24.8h, v3.8h\n" - "smlal v7.4s, v30.4h, v3.4h\n" - "smlal2 v17.4s, v30.8h, v3.8h\n" - "smlal v8.4s, v28.4h, v3.4h\n" - "smlal2 v5.4s, v28.8h, v3.8h\n" - "ldr d3, [x3, #0xb8]\n" - "ssubl v4.8h, v4.8b, v14.8b\n" - "usubl v26.8h, v26.8b, v9.8b\n" + "smlal v17.4s, v26.4h, v1.4h\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "ldr d0, [x23, #0xa0]\n" "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v31.4h, v1.4h\n" + "smlal2 v8.4s, v26.8h, v1.8h\n" + "smlal2 v7.4s, v23.8h, v1.8h\n" + "smlal v15.4s, v26.4h, v2.4h\n" + "smlal2 v16.4s, v26.8h, v2.8h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "ldr d26, [x17, x24]\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "ldr d1, [x23, #0xa8]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "smlal2 v7.4s, v31.8h, v2.8h\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "ldr d25, [x5, x24]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "ldr d2, [x23, #0xb0]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v28.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "smlal2 v7.4s, v30.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v18.4s, v24.8h, v4.8h\n" - "ldr d24, [x23, x10]\n" - "smlal v16.4s, v22.4h, v4.4h\n" - "smlal2 v21.4s, v22.8h, v4.8h\n" - "smlal v7.4s, v28.4h, v4.4h\n" - "smlal2 v17.4s, v28.8h, v4.8h\n" - "smlal v8.4s, v26.4h, v4.4h\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "ldr d4, [x3, #0xc0]\n" - "add x3, x3, #0xc8\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "ldr d24, [x25, x24]\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v22.4h, v4.4h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "ldr d3, [x23, #0xb8]\n" + "ssubl 
v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v7.4s, v28.8h, v4.8h\n" "smlal v15.4s, v27.4h, v0.4h\n" - "smlal2 v18.4s, v27.8h, v0.8h\n" - "ldr d27, [x22, x10]\n" - "smlal v16.4s, v23.4h, v0.4h\n" - "smlal2 v21.4s, v23.8h, v0.8h\n" + "smlal2 v16.4s, v27.8h, v0.8h\n" + "ldr d27, [x26, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v8.4s, v22.8h, v4.8h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x23, #0xc0]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v17.4s, v23.4h, v0.4h\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "add x23, x23, #0xc8\n" + "smlal v6.4s, v24.4h, v0.4h\n" + "smlal2 v7.4s, v25.8h, v0.8h\n" + "ldr d25, [x12, x24]\n" "usubl v25.8h, v25.8b, v9.8b\n" - "usubl v24.8h, v24.8b, v9.8b\n" - "ssubl v1.8h, v1.8b, v14.8b\n" - "smlal v7.4s, v25.4h, v0.4h\n" - "smlal2 v17.4s, v25.8h, v0.8h\n" - "ldr d25, [x20, x10]\n" - "smlal v8.4s, v24.4h, v0.4h\n" + "smlal2 v8.4s, v23.8h, v0.8h\n" "smlal2 v5.4s, v24.8h, v0.8h\n" "smlal v15.4s, v23.4h, v1.4h\n" - "smlal2 v18.4s, v23.8h, v1.8h\n" - "smlal v16.4s, v31.4h, v1.4h\n" - "smlal2 v21.4s, v31.8h, v1.8h\n" - "smlal v7.4s, v24.4h, v1.4h\n" - "smlal2 v17.4s, v24.8h, v1.8h\n" - "ldr d24, [x13, x10]\n" - "usubl v27.8h, v27.8b, v9.8b\n" - "ssubl v2.8h, v2.8b, v14.8b\n" - "usubl v25.8h, v25.8b, v9.8b\n" - "smlal v8.4s, v27.4h, v1.4h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal v6.4s, v27.4h, v1.4h\n" + "smlal2 v7.4s, v24.8h, v1.8h\n" + "ldr d24, [x14, x24]\n" + "smlal2 v16.4s, v23.8h, v1.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal2 v8.4s, v31.8h, v1.8h\n" "smlal2 v5.4s, v27.8h, v1.8h\n" "smlal v15.4s, v31.4h, v2.4h\n" - "smlal2 v18.4s, v31.8h, v2.8h\n" - "smlal v16.4s, v30.4h, v2.4h\n" - "smlal2 v21.4s, v30.8h, v2.8h\n" - "smlal v7.4s, v27.4h, v2.4h\n" - "smlal2 v17.4s, v27.8h, v2.8h\n" - "ldr d27, [x21, x10]\n" - "add x10, x10, #0x8\n" - "smlal v8.4s, v25.4h, v2.4h\n" + "smlal v17.4s, v30.4h, v2.4h\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v7.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x24]\n" + "smlal2 v16.4s, v31.8h, v2.8h\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v8.4s, v30.8h, v2.8h\n" "smlal2 v5.4s, v25.8h, v2.8h\n" - "ssubl v3.8h, v3.8b, v14.8b\n" - "usubl v24.8h, v24.8b, v9.8b\n" - "ssubl v4.8h, v4.8b, v14.8b\n" + "add x24, x24, #0x8\n" "smlal v15.4s, v30.4h, v3.4h\n" - "smlal2 v18.4s, v30.8h, v3.8h\n" - "smlal v16.4s, v28.4h, v3.4h\n" - "smlal2 v21.4s, v28.8h, v3.8h\n" - "smlal v7.4s, v25.4h, v3.4h\n" - "smlal2 v17.4s, v25.8h, v3.8h\n" - "smlal v8.4s, v24.4h, v3.4h\n" + "smlal v17.4s, v28.4h, v3.4h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v16.4s, v30.8h, v3.8h\n" + "smlal2 v8.4s, v28.8h, v3.8h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" "smlal2 v5.4s, v24.8h, v3.8h\n" "smlal v15.4s, v28.4h, v4.4h\n" - "smlal2 v18.4s, v28.8h, v4.8h\n" - "smlal v16.4s, v26.4h, v4.4h\n" - "smlal2 v21.4s, v26.8h, v4.8h\n" - "smlal v7.4s, v24.4h, v4.4h\n" - "smlal2 v17.4s, v24.8h, v4.8h\n" - "usubl v27.8h, v27.8b, v9.8b\n" - "sqrdmulh v15.4s, v15.4s, v6.4s\n" - "sqrdmulh v18.4s, v18.4s, v20.4s\n" - "smlal v8.4s, v27.4h, v4.4h\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "sqrdmulh v15.4s, v15.4s, v12.4s\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "smlal v6.4s, v27.4h, v4.4h\n" + "sqrdmulh v17.4s, v17.4s, v12.4s\n" + "smlal2 v16.4s, v28.8h, v4.8h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "sqrdmulh v10.4s, v10.4s, v12.4s\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" "smlal2 v5.4s, v27.8h, v4.8h\n" - "and v28.16b, v15.16b, v19.16b\n" - "and v26.16b, v18.16b, 
v12.16b\n" - "sqrdmulh v16.4s, v16.4s, v6.4s\n" - "sshr v28.4s, v28.4s, #0x1f\n" + "sqrdmulh v6.4s, v6.4s, v12.4s\n" + "and v23.16b, v15.16b, v19.16b\n" + "sqrdmulh v16.4s, v16.4s, v20.4s\n" + "and v22.16b, v17.16b, v19.16b\n" + "sqrdmulh v8.4s, v8.4s, v20.4s\n" + "and v21.16b, v10.16b, v19.16b\n" + "sqrdmulh v7.4s, v7.4s, v20.4s\n" + "and v26.16b, v6.16b, v19.16b\n" + "sqrdmulh v5.4s, v5.4s, v20.4s\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "and v4.16b, v16.16b, v29.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v2.16b, v8.16b, v29.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v3.16b, v7.16b, v29.16b\n" "sshr v26.4s, v26.4s, #0x1f\n" - "sqrdmulh v21.4s, v21.4s, v20.4s\n" - "sqadd v15.4s, v15.4s, v28.4s\n" - "sqadd v18.4s, v18.4s, v26.4s\n" - "and v29.16b, v16.16b, v19.16b\n" - "and v4.16b, v21.16b, v12.16b\n" - "srshl v15.4s, v15.4s, v19.4s\n" - "srshl v18.4s, v18.4s, v12.4s\n" - "sshr v29.4s, v29.4s, #0x1f\n" + "and v25.16b, v5.16b, v29.16b\n" + "sqadd v15.4s, v15.4s, v23.4s\n" "sshr v4.4s, v4.4s, #0x1f\n" - "add v15.4s, v15.4s, v10.4s\n" - "add v18.4s, v18.4s, v10.4s\n" - "sqadd v16.4s, v16.4s, v29.4s\n" - "smin v15.4s, v15.4s, v13.4s\n" - "smin v18.4s, v18.4s, v13.4s\n" - "sqadd v21.4s, v21.4s, v4.4s\n" - "smax v15.4s, v15.4s, v11.4s\n" - "smax v18.4s, v18.4s, v11.4s\n" - "srshl v16.4s, v16.4s, v19.4s\n" - "srshl v21.4s, v21.4s, v12.4s\n" - "uzp1 v15.16b, v15.16b, v18.16b\n" - "sqrdmulh v7.4s, v7.4s, v6.4s\n" - "uzp1 v15.16b, v15.16b, v15.16b\n" - "str d15, [x17, x1]\n" - "add v16.4s, v16.4s, v10.4s\n" - "add v21.4s, v21.4s, v10.4s\n" - "and v25.16b, v7.16b, v19.16b\n" - "sqrdmulh v17.4s, v17.4s, v20.4s\n" - "smin v16.4s, v16.4s, v13.4s\n" - "smin v21.4s, v21.4s, v13.4s\n" + "sqadd v17.4s, v17.4s, v22.4s\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, v21.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqadd v6.4s, v6.4s, v26.4s\n" "sshr v25.4s, v25.4s, #0x1f\n" - "smax v16.4s, v16.4s, v11.4s\n" - "smax v21.4s, v21.4s, v11.4s\n" - "sqadd v7.4s, v7.4s, v25.4s\n" - "and v31.16b, v17.16b, v12.16b\n" - "uzp1 v16.16b, v16.16b, v21.16b\n" - "sqrdmulh v8.4s, v8.4s, v6.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "str d16, [x16, x1]\n" - "srshl v7.4s, v7.4s, v19.4s\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "and v24.16b, v8.16b, v19.16b\n" - "sqrdmulh v5.4s, v5.4s, v20.4s\n" - "sqadd v17.4s, v17.4s, v31.4s\n" - "add v7.4s, v7.4s, v10.4s\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "and v1.16b, v5.16b, v12.16b\n" - "smin v7.4s, v7.4s, v13.4s\n" - "srshl v17.4s, v17.4s, v12.4s\n" - "sqadd v8.4s, v8.4s, v24.4s\n" - "smax v7.4s, v7.4s, v11.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "add v17.4s, v17.4s, v10.4s\n" - "srshl v8.4s, v8.4s, v19.4s\n" - "sqadd v5.4s, v5.4s, v1.4s\n" - "smin v17.4s, v17.4s, v13.4s\n" - "add v8.4s, v8.4s, v10.4s\n" - "smax v17.4s, v17.4s, v11.4s\n" - "srshl v5.4s, v5.4s, v12.4s\n" - "smin v8.4s, v8.4s, v13.4s\n" - "uzp1 v7.16b, v7.16b, v17.16b\n" - "add v5.4s, v5.4s, v10.4s\n" - "uzp1 v7.16b, v7.16b, v7.16b\n" - "str d7, [x6, x1]\n" - "smax v8.4s, v8.4s, v11.4s\n" - "smin v5.4s, v5.4s, v13.4s\n" - "smax v5.4s, v5.4s, v11.4s\n" - "uzp1 v8.16b, v8.16b, v5.16b\n" - "uzp1 v8.16b, v8.16b, v8.16b\n" - "str d8, [x8, x1]\n" - "add x1, x1, #0x8\n" - "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" - "ldr q15, [x12, #0x0]\n" - "mov v16.16b, v15.16b\n" - "ldr q18, [x12, #0x10]\n" - "add x12, x12, #0x20\n" - "mov v7.16b, v15.16b\n" - "str x12, [%x[params], %[offsetof_Params_bias]]\n" - "mov v8.16b, v15.16b\n" - "ldr d0, [x3, #0x0]\n" - "ldr d1, [x3, #0x8]\n" - "mov v21.16b, v18.16b\n" - "ldr d2, [x3, 
#0x10]\n" - "mov v17.16b, v18.16b\n" - "ldr d3, [x3, #0x18]\n" - "mov v5.16b, v18.16b\n" - "ldr d4, [x3, #0x20]\n" + "srshl v15.4s, v15.4s, v19.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v17.4s, v17.4s, v19.4s\n" + "sqadd v8.4s, v8.4s, v2.4s\n" + "srshl v10.4s, v10.4s, v19.4s\n" + "sqadd v7.4s, v7.4s, v3.4s\n" + "srshl v6.4s, v6.4s, v19.4s\n" + "sqadd v5.4s, v5.4s, v25.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v8.4s, v8.4s, v29.4s\n" + "sqxtn v17.4h, v17.4s\n" + "srshl v7.4s, v7.4s, v29.4s\n" + "sqxtn v10.4h, v10.4s\n" + "srshl v5.4s, v5.4s, v29.4s\n" + "sqxtn v6.4h, v6.4s\n" + "sqxtn2 v15.8h, v16.4s\n" + "sqxtn2 v17.8h, v8.4s\n" + "sqxtn2 v10.8h, v7.4s\n" + "sqxtn2 v6.8h, v5.4s\n" + "sqadd v15.8h, v15.8h, v18.8h\n" + "sqadd v17.8h, v17.8h, v18.8h\n" + "sqadd v10.8h, v10.8h, v18.8h\n" + "sqadd v6.8h, v6.8h, v18.8h\n" + "smax v15.8h, v15.8h, v11.8h\n" + "smax v17.8h, v17.8h, v11.8h\n" + "smax v10.8h, v10.8h, v11.8h\n" + "smax v6.8h, v6.8h, v11.8h\n" + "smin v15.8h, v15.8h, v13.8h\n" + "smin v17.8h, v17.8h, v13.8h\n" + "smin v10.8h, v10.8h, v13.8h\n" + "smin v6.8h, v6.8h, v13.8h\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d15, [x16, x22]\n" + "uzp1 v10.16b, v10.16b, v10.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str d17, [x8, x22]\n" + "str d10, [x4, x22]\n" + "str d6, [x7, x22]\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr q15, [x19, #0x0]\n" + "add x22, x22, #0x8\n" + "ldr q16, [x19, #0x10]\n" + "add x19, x19, #0x20\n" + "str x19, [%x[params], %[offsetof_Params_bias]]\n" + "ldr d0, [x23, #0x0]\n" + "ldr d1, [x23, #0x8]\n" + "ldr d2, [x23, #0x10]\n" + "mov v17.16b, v15.16b\n" + "mov v8.16b, v16.16b\n" + "ldr d3, [x23, #0x18]\n" + "ldr d4, [x23, #0x20]\n" + "mov v10.16b, v15.16b\n" + "mov v7.16b, v16.16b\n" + "ldp x28, x6, [x20, #0x0]\n" + "ldp x26, x25, [x20, #0x10]\n" + "mov v6.16b, v15.16b\n" + "mov v5.16b, v16.16b\n" + "ldp x5, x2, [x20, #0x20]\n" + "ldp x27, x21, [x20, #0x30]\n" "ssubl v0.8h, v0.8b, v14.8b\n" - "ldp x28, x27, [x25, #0x0]\n" "ssubl v1.8h, v1.8b, v14.8b\n" - "ldp x26, x13, [x25, #0x10]\n" + "ldp x12, x19, [x20, #0x40]\n" + "ldr d31, [x28, x24]\n" "ssubl v2.8h, v2.8b, v14.8b\n" "ssubl v3.8h, v3.8b, v14.8b\n" - "ldp x24, x23, [x25, #0x20]\n" + "ldr d30, [x6, x24]\n" + "ldr d29, [x26, x24]\n" "ssubl v4.8h, v4.8b, v14.8b\n" - "ldp x22, x21, [x25, #0x30]\n" - "ldp x20, x0, [x25, #0x40]\n" - "ldr d31, [x28, x10]\n" "usubl v31.8h, v31.8b, v9.8b\n" - "ldr d30, [x27, x10]\n" - "ldr d29, [x26, x10]\n" + "ldr d28, [x25, x24]\n" + "ldr d27, [x5, x24]\n" "usubl v30.8h, v30.8b, v9.8b\n" - "ldr d28, [x13, x10]\n" "usubl v29.8h, v29.8b, v9.8b\n" - "ldr d27, [x24, x10]\n" - "ldr d23, [x23, x10]\n" + "ldr d23, [x2, x24]\n" + "ldr d25, [x27, x24]\n" "usubl v28.8h, v28.8b, v9.8b\n" - "ldr d25, [x22, x10]\n" - "ldr d24, [x21, x10]\n" "usubl v27.8h, v27.8b, v9.8b\n" + "ldr d24, [x21, x24]\n" + "ldr d26, [x12, x24]\n" "usubl v23.8h, v23.8b, v9.8b\n" - "ldr d26, [x20, x10]\n" - "ldr d22, [x0, x10]\n" "usubl v25.8h, v25.8b, v9.8b\n" + "ldr d22, [x19, x24]\n" "usubl v24.8h, v24.8b, v9.8b\n" "usubl v26.8h, v26.8b, v9.8b\n" "usubl v22.8h, v22.8b, v9.8b\n" "bgt 1b\n" "2:" // Tail "smlal v15.4s, v31.4h, v0.4h\n" - "ldr x20, [x25, #0x50]\n" - "tst x4, #0x7\n" - "smlal2 v18.4s, v31.8h, v0.8h\n" - "ldr x28, [x25, #0x58]\n" - "smlal v16.4s, v30.4h, v0.4h\n" - "ldr x0, [x25, #0x60]\n" - "smlal2 v21.4s, v30.8h, v0.8h\n" - "ldr d31, [x20, x10]\n" - "smlal v7.4s, v29.4h, v0.4h\n" - "ldr x7, [x25, #0x68]\n" - 
"smlal2 v17.4s, v29.8h, v0.8h\n" - "ldr x26, [x25, #0x70]\n" - "smlal v8.4s, v28.4h, v0.4h\n" - "ldr x23, [x25, #0x78]\n" - "smlal2 v5.4s, v28.8h, v0.8h\n" - "ldr d0, [x3, #0x28]\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "ldr x19, [x20, #0x50]\n" + "ldr d31, [x19, x24]\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal v10.4s, v29.4h, v0.4h\n" + "ldr x15, [x20, #0x58]\n" + "usubl v31.8h, v31.8b, v9.8b\n" + "smlal v6.4s, v28.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "ldr x19, [x20, #0x60]\n" + "ldr x27, [x20, #0x68]\n" + "smlal2 v7.4s, v29.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "ldr x20, [x25, #0x80]\n" - "smlal2 v18.4s, v30.8h, v1.8h\n" - "ldr d30, [x28, x10]\n" - "smlal v16.4s, v27.4h, v1.4h\n" - "ldr x22, [x25, #0x88]\n" - "smlal2 v21.4s, v27.8h, v1.8h\n" - "ldr x13, [x25, #0x90]\n" - "smlal v7.4s, v28.4h, v1.4h\n" - "ldr x21, [x25, #0x98]\n" - "smlal2 v17.4s, v28.8h, v1.8h\n" - "ldr x14, [x25, #0xa0]\n" - "smlal v8.4s, v23.4h, v1.4h\n" - "ldr x11, [x25, #0xa8]\n" - "smlal2 v5.4s, v23.8h, v1.8h\n" - "ldr d1, [x3, #0x30]\n" + "ldr x5, [x20, #0x70]\n" + "ldr x11, [x20, #0x78]\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "ldr d30, [x15, x24]\n" + "usubl v30.8h, v30.8b, v9.8b\n" + "smlal v17.4s, v27.4h, v1.4h\n" + "smlal v10.4s, v28.4h, v1.4h\n" + "ldr d0, [x23, #0x28]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v27.8h, v1.8h\n" + "ldr x12, [x20, #0x80]\n" + "ldr x26, [x20, #0x88]\n" + "smlal2 v7.4s, v28.8h, v1.8h\n" "smlal v15.4s, v27.4h, v2.4h\n" - "ldr x24, [x25, #0xb0]\n" - "smlal2 v18.4s, v27.8h, v2.8h\n" - "ldr d27, [x0, x10]\n" - "smlal v16.4s, v25.4h, v2.4h\n" - "ldr x0, [x25, #0xb8]\n" - "smlal2 v21.4s, v25.8h, v2.8h\n" - "ldr x15, [x25, #0xc0]\n" - "smlal v7.4s, v23.4h, v2.4h\n" - "ldr x9, [x25, #0xc8]\n" - "smlal2 v17.4s, v23.8h, v2.8h\n" - "ldr x27, [x25, #0xd0]\n" - "usubl v31.8h, v31.8b, v9.8b\n" - "ldr x28, [x25, #0xd8]\n" + "ldr x14, [x20, #0x90]\n" + "ldr x15, [x20, #0x98]\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "smlal2 v5.4s, v23.8h, v1.8h\n" + "ldr d27, [x19, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal v10.4s, v23.4h, v2.4h\n" + "ldr d1, [x23, #0x30]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "ldr x21, [x20, #0xa0]\n" + "ldr x2, [x20, #0xa8]\n" + "smlal2 v7.4s, v23.8h, v2.8h\n" "smlal v15.4s, v25.4h, v3.4h\n" - "ldr x12, [x25, #0xe0]\n" - "smlal2 v18.4s, v25.8h, v3.8h\n" - "ldr d25, [x7, x10]\n" - "smlal v8.4s, v31.4h, v2.4h\n" - "ldr x7, [x25, #0xe8]\n" + "ldr x13, [x20, #0xb0]\n" + "ldr x9, [x20, #0xb8]\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" "smlal2 v5.4s, v31.8h, v2.8h\n" - "ldr d2, [x3, #0x38]\n" - "smlal v16.4s, v24.4h, v3.4h\n" - "ldr q6, [x2, #0x0]\n" - "smlal2 v21.4s, v24.8h, v3.8h\n" - "ldr q19, [x5, #0x0]\n" - "smlal v7.4s, v31.4h, v3.4h\n" - "ldr q20, [x2, #0x10]\n" - "add x2, x2, #0x20\n" - "smlal2 v17.4s, v31.8h, v3.8h\n" - "ldr q12, [x5, #0x10]\n" - "add x5, x5, #0x20\n" - "usubl v30.8h, v30.8b, v9.8b\n" + "ldr d25, [x27, x24]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v10.4s, v31.4h, v3.4h\n" + "ldr d2, [x23, #0x38]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "ldr x19, [x20, #0xc0]\n" + "ldr x28, [x20, #0xc8]\n" + "smlal2 v7.4s, v31.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v18.4s, v24.8h, v4.8h\n" - "ldr d24, [x26, x10]\n" - "usubl v27.8h, v27.8b, v9.8b\n" - "ldr x26, 
[x25, #0xf0]\n" - "smlal v8.4s, v30.4h, v3.4h\n" + "ldr x6, [x20, #0xd0]\n" + "ldr x27, [x20, #0xd8]\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" "smlal2 v5.4s, v30.8h, v3.8h\n" - "ldr d3, [x3, #0x40]\n" - "smlal v16.4s, v27.4h, v4.4h\n" - "smlal2 v21.4s, v27.8h, v4.8h\n" - "ldr d27, [x23, x10]\n" - "smlal v7.4s, v30.4h, v4.4h\n" - "ldr x23, [x25, #0xf8]\n" - "smlal2 v17.4s, v30.8h, v4.8h\n" - "smlal v8.4s, v26.4h, v4.4h\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "ldr d4, [x3, #0x48]\n" - "ssubl v0.8h, v0.8b, v14.8b\n" - "usubl v25.8h, v25.8b, v9.8b\n" - "ssubl v1.8h, v1.8b, v14.8b\n" - "smlal v15.4s, v29.4h, v0.4h\n" - "smlal2 v18.4s, v29.8h, v0.8h\n" - "smlal v16.4s, v28.4h, v0.4h\n" - "smlal2 v21.4s, v28.8h, v0.8h\n" - "smlal v7.4s, v22.4h, v0.4h\n" - "smlal2 v17.4s, v22.8h, v0.8h\n" - "smlal v8.4s, v25.4h, v0.4h\n" - "smlal2 v5.4s, v25.8h, v0.8h\n" - "ldr d0, [x3, #0x50]\n" - "smlal v15.4s, v28.4h, v1.4h\n" - "smlal2 v18.4s, v28.8h, v1.8h\n" - "ldr d28, [x22, x10]\n" - "smlal v16.4s, v23.4h, v1.4h\n" - "ldr x22, [x25, #0x100]\n" - "smlal2 v21.4s, v23.8h, v1.8h\n" - "smlal v7.4s, v25.4h, v1.4h\n" - "smlal2 v17.4s, v25.8h, v1.8h\n" + "ldr d24, [x5, x24]\n" "usubl v24.8h, v24.8b, v9.8b\n" - "ssubl v2.8h, v2.8b, v14.8b\n" - "usubl v27.8h, v27.8b, v9.8b\n" - "smlal v8.4s, v24.4h, v1.4h\n" - "smlal2 v5.4s, v24.8h, v1.8h\n" - "ldr d1, [x3, #0x58]\n" - "smlal v15.4s, v23.4h, v2.4h\n" - "smlal2 v18.4s, v23.8h, v2.8h\n" - "ldr d23, [x20, x10]\n" - "smlal v16.4s, v31.4h, v2.4h\n" - "ldr x20, [x25, #0x108]\n" - "smlal2 v21.4s, v31.8h, v2.8h\n" - "smlal v7.4s, v24.4h, v2.4h\n" - "smlal2 v17.4s, v24.8h, v2.8h\n" - "smlal v8.4s, v27.4h, v2.4h\n" - "smlal2 v5.4s, v27.8h, v2.8h\n" - "ldr d2, [x3, #0x60]\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal v10.4s, v30.4h, v4.4h\n" + "ldr d3, [x23, #0x40]\n" "ssubl v3.8h, v3.8b, v14.8b\n" - "usubl v23.8h, v23.8b, v9.8b\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v27.8h, v4.8h\n" + "ldr d27, [x11, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v7.4s, v30.8h, v4.8h\n" + "smlal v15.4s, v29.4h, v0.4h\n" + "ldr x11, [x20, #0xe0]\n" + "ldr x17, [x20, #0xe8]\n" + "smlal2 v16.4s, v29.8h, v0.8h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x23, #0x48]\n" "ssubl v4.8h, v4.8b, v14.8b\n" - "smlal v15.4s, v31.4h, v3.4h\n" - "smlal2 v18.4s, v31.8h, v3.8h\n" - "ldr d31, [x13, x10]\n" - "smlal v16.4s, v30.4h, v3.4h\n" - "ldr x13, [x25, #0x110]\n" - "smlal2 v21.4s, v30.8h, v3.8h\n" - "smlal v7.4s, v27.4h, v3.4h\n" - "smlal2 v17.4s, v27.8h, v3.8h\n" - "smlal v8.4s, v23.4h, v3.4h\n" - "smlal2 v5.4s, v23.8h, v3.8h\n" - "ldr d3, [x3, #0x68]\n" - "smlal v15.4s, v30.4h, v4.4h\n" - "smlal2 v18.4s, v30.8h, v4.8h\n" - "ldr d30, [x21, x10]\n" - "smlal v16.4s, v26.4h, v4.4h\n" - "ldr x21, [x25, #0x118]\n" - "smlal2 v21.4s, v26.8h, v4.8h\n" - "ldr d26, [x14, x10]\n" - "smlal v7.4s, v23.4h, v4.4h\n" - "smlal2 v17.4s, v23.8h, v4.8h\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal v10.4s, v22.4h, v0.4h\n" + "ldr x5, [x20, #0xf0]\n" + "ldr x25, [x20, #0xf8]\n" + "smlal v6.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v28.8h, v0.8h\n" + "ldr q12, [x10, #0x0]\n" + "ldr q19, [x1, #0x0]\n" + "smlal2 v7.4s, v22.8h, v0.8h\n" + "smlal v15.4s, v28.4h, v1.4h\n" + "ldr q20, [x10, #0x10]\n" + "ldr q29, [x1, #0x10]\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "ldr d28, [x26, x24]\n" + "ldr d0, [x23, #0x50]\n" + "smlal v17.4s, v23.4h, v1.4h\n" + "smlal v10.4s, v25.4h, v1.4h\n" "usubl v28.8h, v28.8b, v9.8b\n" + "ldr x26, [x20, #0x100]\n" + "smlal v6.4s, v24.4h, v1.4h\n" + "smlal2 
v8.4s, v23.8h, v1.8h\n" "ssubl v0.8h, v0.8b, v14.8b\n" + "tst x0, #0x7\n" + "smlal2 v7.4s, v25.8h, v1.8h\n" + "smlal v15.4s, v23.4h, v2.4h\n" + "add x10, x10, #0x20\n" + "add x1, x1, #0x20\n" + "smlal2 v16.4s, v23.8h, v2.8h\n" + "ldr d23, [x12, x24]\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "usubl v23.8h, v23.8b, v9.8b\n" + "smlal v17.4s, v31.4h, v2.4h\n" + "smlal v10.4s, v24.4h, v2.4h\n" + "ldr d1, [x23, #0x58]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "ldr x12, [x20, #0x108]\n" + "smlal2 v7.4s, v24.8h, v2.8h\n" + "smlal v15.4s, v31.4h, v3.4h\n" + "smlal2 v16.4s, v31.8h, v3.8h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "ldr d31, [x14, x24]\n" "usubl v31.8h, v31.8b, v9.8b\n" - "smlal v8.4s, v28.4h, v4.4h\n" - "smlal2 v5.4s, v28.8h, v4.8h\n" - "ldr d4, [x3, #0x70]\n" - "smlal v15.4s, v22.4h, v0.4h\n" - "smlal2 v18.4s, v22.8h, v0.8h\n" - "ldr d22, [x0, x10]\n" - "smlal v16.4s, v25.4h, v0.4h\n" - "smlal2 v21.4s, v25.8h, v0.8h\n" - "smlal v7.4s, v31.4h, v0.4h\n" - "smlal2 v17.4s, v31.8h, v0.8h\n" + "smlal v17.4s, v30.4h, v3.4h\n" + "smlal v10.4s, v27.4h, v3.4h\n" + "ldr d2, [x23, #0x60]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "ldr x14, [x20, #0x110]\n" + "smlal2 v7.4s, v27.8h, v3.8h\n" + "smlal v15.4s, v30.4h, v4.4h\n" + "smlal2 v16.4s, v30.8h, v4.8h\n" + "ldr d30, [x15, x24]\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" "usubl v30.8h, v30.8b, v9.8b\n" - "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "smlal v10.4s, v23.4h, v4.4h\n" + "ldr d3, [x23, #0x68]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "ldr d26, [x21, x24]\n" "usubl v26.8h, v26.8b, v9.8b\n" - "smlal v8.4s, v30.4h, v0.4h\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" - "ldr d0, [x3, #0x78]\n" + "smlal2 v7.4s, v23.8h, v4.8h\n" + "smlal v15.4s, v22.4h, v0.4h\n" + "ldr x21, [x20, #0x118]\n" + "smlal2 v16.4s, v22.8h, v0.8h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "ldr d4, [x23, #0x70]\n" + "ldr d22, [x9, x24]\n" + "smlal v17.4s, v25.4h, v0.4h\n" + "smlal v10.4s, v31.4h, v0.4h\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "usubl v22.8h, v22.8b, v9.8b\n" + "smlal2 v7.4s, v31.8h, v0.8h\n" "smlal v15.4s, v25.4h, v1.4h\n" - "smlal2 v18.4s, v25.8h, v1.8h\n" - "ldr d25, [x11, x10]\n" - "smlal v16.4s, v24.4h, v1.4h\n" - "smlal2 v21.4s, v24.8h, v1.8h\n" - "smlal v7.4s, v30.4h, v1.4h\n" - "smlal2 v17.4s, v30.8h, v1.8h\n" - "smlal v8.4s, v26.4h, v1.4h\n" - "smlal2 v5.4s, v26.8h, v1.8h\n" - "ldr d1, [x3, #0x80]\n" - "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal2 v16.4s, v25.8h, v1.8h\n" + "ldr d25, [x2, x24]\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" "usubl v25.8h, v25.8b, v9.8b\n" - "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v17.4s, v24.4h, v1.4h\n" + "smlal v10.4s, v30.4h, v1.4h\n" + "ldr d0, [x23, #0x78]\n" + "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "smlal2 v7.4s, v30.8h, v1.8h\n" "smlal v15.4s, v24.4h, v2.4h\n" - "smlal2 v18.4s, v24.8h, v2.8h\n" - "ldr d24, [x24, x10]\n" - "smlal v16.4s, v27.4h, v2.4h\n" - "smlal2 v21.4s, v27.8h, v2.8h\n" - "smlal v7.4s, v26.4h, v2.4h\n" - "smlal2 v17.4s, v26.8h, v2.8h\n" - "smlal v8.4s, v25.4h, v2.4h\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" - "ldr d2, [x3, #0x88]\n" - "smlal v15.4s, v27.4h, v3.4h\n" - "smlal2 v18.4s, v27.8h, v3.8h\n" - "ldr d27, [x15, x10]\n" - "smlal v16.4s, v23.4h, v3.4h\n" - "smlal2 v21.4s, v23.8h, v3.8h\n" - 
"smlal v7.4s, v25.4h, v3.4h\n" - "smlal2 v17.4s, v25.8h, v3.8h\n" + "smlal2 v16.4s, v24.8h, v2.8h\n" + "ldr d24, [x13, x24]\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" "usubl v24.8h, v24.8b, v9.8b\n" - "ssubl v4.8h, v4.8b, v14.8b\n" - "usubl v22.8h, v22.8b, v9.8b\n" - "smlal v8.4s, v24.4h, v3.4h\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" - "ldr d3, [x3, #0x90]\n" - "smlal v15.4s, v23.4h, v4.4h\n" - "smlal2 v18.4s, v23.8h, v4.8h\n" - "ldr d23, [x9, x10]\n" - "smlal v16.4s, v28.4h, v4.4h\n" - "smlal2 v21.4s, v28.8h, v4.8h\n" - "ldr d28, [x12, x10]\n" - "smlal v7.4s, v24.4h, v4.4h\n" - "smlal2 v17.4s, v24.8h, v4.8h\n" - "smlal v8.4s, v22.4h, v4.4h\n" - "smlal2 v5.4s, v22.8h, v4.8h\n" - "ldr d4, [x3, #0x98]\n" - "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v17.4s, v27.4h, v2.4h\n" + "smlal v10.4s, v26.4h, v2.4h\n" + "ldr d1, [x23, #0x80]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "smlal2 v7.4s, v26.8h, v2.8h\n" + "smlal v15.4s, v27.4h, v3.4h\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "ldr d27, [x19, x24]\n" "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v17.4s, v23.4h, v3.4h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "ldr d2, [x23, #0x88]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v8.4s, v23.8h, v3.8h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" + "smlal v15.4s, v23.4h, v4.4h\n" + "smlal2 v16.4s, v23.8h, v4.8h\n" + "ldr d23, [x28, x24]\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" "usubl v23.8h, v23.8b, v9.8b\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "ldr d3, [x23, #0x90]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v22.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "ldr d28, [x11, x24]\n" + "usubl v28.8h, v28.8b, v9.8b\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" "smlal v15.4s, v31.4h, v0.4h\n" - "smlal2 v18.4s, v31.8h, v0.8h\n" - "ldr d31, [x27, x10]\n" - "smlal v16.4s, v30.4h, v0.4h\n" - "smlal2 v21.4s, v30.8h, v0.8h\n" - "smlal v7.4s, v27.4h, v0.4h\n" - "smlal2 v17.4s, v27.8h, v0.8h\n" - "smlal v8.4s, v23.4h, v0.4h\n" - "smlal2 v5.4s, v23.8h, v0.8h\n" - "ldr d0, [x3, #0xa0]\n" - "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "ldr d31, [x6, x24]\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" "usubl v31.8h, v31.8b, v9.8b\n" - "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal v10.4s, v27.4h, v0.4h\n" + "ldr d4, [x23, #0x98]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v6.4s, v23.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "smlal2 v7.4s, v27.8h, v0.8h\n" "smlal v15.4s, v30.4h, v1.4h\n" - "smlal2 v18.4s, v30.8h, v1.8h\n" - "ldr d30, [x28, x10]\n" - "smlal v16.4s, v26.4h, v1.4h\n" - "smlal2 v21.4s, v26.8h, v1.8h\n" - "smlal v7.4s, v23.4h, v1.4h\n" - "smlal2 v17.4s, v23.8h, v1.8h\n" - "smlal v8.4s, v31.4h, v1.4h\n" - "smlal2 v5.4s, v31.8h, v1.8h\n" - "ldr d1, [x3, #0xa8]\n" - "smlal v15.4s, v26.4h, v2.4h\n" - "smlal2 v18.4s, v26.8h, v2.8h\n" - "ldr d26, [x7, x10]\n" - "smlal v16.4s, v25.4h, v2.4h\n" - "smlal2 v21.4s, v25.8h, v2.8h\n" - "smlal v7.4s, v31.4h, v2.4h\n" - "smlal2 v17.4s, v31.8h, v2.8h\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "ldr d30, [x27, x24]\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" "usubl v30.8h, v30.8b, v9.8b\n" - "ssubl v3.8h, v3.8b, v14.8b\n" - "usubl v28.8h, v28.8b, v9.8b\n" - "smlal v8.4s, v30.4h, v2.4h\n" - "smlal2 v5.4s, v30.8h, v2.8h\n" - "ldr d2, [x3, #0xb0]\n" - "smlal v15.4s, v25.4h, v3.4h\n" - "smlal2 v18.4s, v25.8h, v3.8h\n" - "ldr d25, [x26, x10]\n" - "smlal v16.4s, v24.4h, v3.4h\n" - "smlal2 v21.4s, v24.8h, v3.8h\n" 
- "smlal v7.4s, v30.4h, v3.4h\n" - "smlal2 v17.4s, v30.8h, v3.8h\n" - "smlal v8.4s, v28.4h, v3.4h\n" - "smlal2 v5.4s, v28.8h, v3.8h\n" - "ldr d3, [x3, #0xb8]\n" - "ssubl v4.8h, v4.8b, v14.8b\n" - "usubl v26.8h, v26.8b, v9.8b\n" + "smlal v17.4s, v26.4h, v1.4h\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "ldr d0, [x23, #0xa0]\n" "ssubl v0.8h, v0.8b, v14.8b\n" + "smlal v6.4s, v31.4h, v1.4h\n" + "smlal2 v8.4s, v26.8h, v1.8h\n" + "smlal2 v7.4s, v23.8h, v1.8h\n" + "smlal v15.4s, v26.4h, v2.4h\n" + "smlal2 v16.4s, v26.8h, v2.8h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "ldr d26, [x17, x24]\n" + "usubl v26.8h, v26.8b, v9.8b\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "ldr d1, [x23, #0xa8]\n" + "ssubl v1.8h, v1.8b, v14.8b\n" + "smlal v6.4s, v30.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "smlal2 v7.4s, v31.8h, v2.8h\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "ldr d25, [x5, x24]\n" + "usubl v25.8h, v25.8b, v9.8b\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "ldr d2, [x23, #0xb0]\n" + "ssubl v2.8h, v2.8b, v14.8b\n" + "smlal v6.4s, v28.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "smlal2 v7.4s, v30.8h, v3.8h\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v18.4s, v24.8h, v4.8h\n" - "ldr d24, [x23, x10]\n" - "smlal v16.4s, v22.4h, v4.4h\n" - "smlal2 v21.4s, v22.8h, v4.8h\n" - "smlal v7.4s, v28.4h, v4.4h\n" - "smlal2 v17.4s, v28.8h, v4.8h\n" - "smlal v8.4s, v26.4h, v4.4h\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" - "ldr d4, [x3, #0xc0]\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "ldr d24, [x25, x24]\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal v17.4s, v22.4h, v4.4h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "ldr d3, [x23, #0xb8]\n" + "ssubl v3.8h, v3.8b, v14.8b\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v7.4s, v28.8h, v4.8h\n" "smlal v15.4s, v27.4h, v0.4h\n" - "smlal2 v18.4s, v27.8h, v0.8h\n" - "ldr d27, [x22, x10]\n" - "smlal v16.4s, v23.4h, v0.4h\n" - "smlal2 v21.4s, v23.8h, v0.8h\n" + "smlal2 v16.4s, v27.8h, v0.8h\n" + "ldr d27, [x26, x24]\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v8.4s, v22.8h, v4.8h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d4, [x23, #0xc0]\n" + "ssubl v4.8h, v4.8b, v14.8b\n" + "smlal v17.4s, v23.4h, v0.4h\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "smlal v6.4s, v24.4h, v0.4h\n" + "smlal2 v7.4s, v25.8h, v0.8h\n" + "ldr d25, [x12, x24]\n" "usubl v25.8h, v25.8b, v9.8b\n" - "usubl v24.8h, v24.8b, v9.8b\n" - "ssubl v1.8h, v1.8b, v14.8b\n" - "smlal v7.4s, v25.4h, v0.4h\n" - "smlal2 v17.4s, v25.8h, v0.8h\n" - "ldr d25, [x20, x10]\n" - "smlal v8.4s, v24.4h, v0.4h\n" + "smlal2 v8.4s, v23.8h, v0.8h\n" "smlal2 v5.4s, v24.8h, v0.8h\n" "smlal v15.4s, v23.4h, v1.4h\n" - "smlal2 v18.4s, v23.8h, v1.8h\n" - "smlal v16.4s, v31.4h, v1.4h\n" - "smlal2 v21.4s, v31.8h, v1.8h\n" - "smlal v7.4s, v24.4h, v1.4h\n" - "smlal2 v17.4s, v24.8h, v1.8h\n" - "ldr d24, [x13, x10]\n" - "usubl v27.8h, v27.8b, v9.8b\n" - "ssubl v2.8h, v2.8b, v14.8b\n" - "usubl v25.8h, v25.8b, v9.8b\n" - "smlal v8.4s, v27.4h, v1.4h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal v6.4s, v27.4h, v1.4h\n" + "smlal2 v7.4s, v24.8h, v1.8h\n" + "ldr d24, [x14, x24]\n" + "smlal2 v16.4s, v23.8h, v1.8h\n" + "usubl v24.8h, v24.8b, v9.8b\n" + "smlal2 v8.4s, v31.8h, v1.8h\n" "smlal2 v5.4s, v27.8h, v1.8h\n" "smlal v15.4s, v31.4h, v2.4h\n" - "smlal2 v18.4s, v31.8h, v2.8h\n" - "smlal v16.4s, v30.4h, v2.4h\n" - "smlal2 v21.4s, v30.8h, v2.8h\n" - "smlal v7.4s, v27.4h, v2.4h\n" - 
"smlal2 v17.4s, v27.8h, v2.8h\n" - "ldr d27, [x21, x10]\n" - "add x10, x10, #0x8\n" - "smlal v8.4s, v25.4h, v2.4h\n" + "smlal v17.4s, v30.4h, v2.4h\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v7.4s, v27.8h, v2.8h\n" + "ldr d27, [x21, x24]\n" + "smlal2 v16.4s, v31.8h, v2.8h\n" + "usubl v27.8h, v27.8b, v9.8b\n" + "smlal2 v8.4s, v30.8h, v2.8h\n" "smlal2 v5.4s, v25.8h, v2.8h\n" - "ssubl v3.8h, v3.8b, v14.8b\n" - "usubl v24.8h, v24.8b, v9.8b\n" - "ssubl v4.8h, v4.8b, v14.8b\n" + "add x24, x24, #0x8\n" "smlal v15.4s, v30.4h, v3.4h\n" - "smlal2 v18.4s, v30.8h, v3.8h\n" - "smlal v16.4s, v28.4h, v3.4h\n" - "smlal2 v21.4s, v28.8h, v3.8h\n" - "smlal v7.4s, v25.4h, v3.4h\n" - "smlal2 v17.4s, v25.8h, v3.8h\n" - "smlal v8.4s, v24.4h, v3.4h\n" + "smlal v17.4s, v28.4h, v3.4h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v16.4s, v30.8h, v3.8h\n" + "smlal2 v8.4s, v28.8h, v3.8h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" "smlal2 v5.4s, v24.8h, v3.8h\n" "smlal v15.4s, v28.4h, v4.4h\n" - "smlal2 v18.4s, v28.8h, v4.8h\n" - "smlal v16.4s, v26.4h, v4.4h\n" - "smlal2 v21.4s, v26.8h, v4.8h\n" - "smlal v7.4s, v24.4h, v4.4h\n" - "smlal2 v17.4s, v24.8h, v4.8h\n" - "usubl v27.8h, v27.8b, v9.8b\n" - "sqrdmulh v15.4s, v15.4s, v6.4s\n" - "sqrdmulh v18.4s, v18.4s, v20.4s\n" - "smlal v8.4s, v27.4h, v4.4h\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "sqdmulh v15.4s, v15.4s, v12.4s\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "smlal v6.4s, v27.4h, v4.4h\n" + "sqdmulh v17.4s, v17.4s, v12.4s\n" + "smlal2 v16.4s, v28.8h, v4.8h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "sqdmulh v10.4s, v10.4s, v12.4s\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" "smlal2 v5.4s, v27.8h, v4.8h\n" - "and v28.16b, v15.16b, v19.16b\n" - "and v26.16b, v18.16b, v12.16b\n" - "sqrdmulh v16.4s, v16.4s, v6.4s\n" - "sshr v28.4s, v28.4s, #0x1f\n" + "sqdmulh v6.4s, v6.4s, v12.4s\n" + "and v23.16b, v15.16b, v19.16b\n" + "sqdmulh v16.4s, v16.4s, v20.4s\n" + "and v22.16b, v17.16b, v19.16b\n" + "sqdmulh v8.4s, v8.4s, v20.4s\n" + "and v21.16b, v10.16b, v19.16b\n" + "sqdmulh v7.4s, v7.4s, v20.4s\n" + "and v26.16b, v6.16b, v19.16b\n" + "sqdmulh v5.4s, v5.4s, v20.4s\n" + "sshr v23.4s, v23.4s, #0x1f\n" + "and v4.16b, v16.16b, v29.16b\n" + "sshr v22.4s, v22.4s, #0x1f\n" + "and v2.16b, v8.16b, v29.16b\n" + "sshr v21.4s, v21.4s, #0x1f\n" + "and v3.16b, v7.16b, v29.16b\n" "sshr v26.4s, v26.4s, #0x1f\n" - "sqrdmulh v21.4s, v21.4s, v20.4s\n" - "sqadd v15.4s, v15.4s, v28.4s\n" - "sqadd v18.4s, v18.4s, v26.4s\n" - "and v29.16b, v16.16b, v19.16b\n" - "and v4.16b, v21.16b, v12.16b\n" - "srshl v15.4s, v15.4s, v19.4s\n" - "srshl v18.4s, v18.4s, v12.4s\n" - "sshr v29.4s, v29.4s, #0x1f\n" + "and v25.16b, v5.16b, v29.16b\n" + "sqadd v15.4s, v15.4s, v23.4s\n" "sshr v4.4s, v4.4s, #0x1f\n" - "add v15.4s, v15.4s, v10.4s\n" - "add v18.4s, v18.4s, v10.4s\n" - "sqadd v16.4s, v16.4s, v29.4s\n" - "smin v15.4s, v15.4s, v13.4s\n" - "smin v18.4s, v18.4s, v13.4s\n" - "sqadd v21.4s, v21.4s, v4.4s\n" - "smax v15.4s, v15.4s, v11.4s\n" - "smax v18.4s, v18.4s, v11.4s\n" - "srshl v16.4s, v16.4s, v19.4s\n" - "srshl v21.4s, v21.4s, v12.4s\n" - "uzp1 v15.16b, v15.16b, v18.16b\n" - "sqrdmulh v7.4s, v7.4s, v6.4s\n" - "uzp1 v15.16b, v15.16b, v15.16b\n" - "str d15, [x17, x1]\n" - "add v16.4s, v16.4s, v10.4s\n" - "add v21.4s, v21.4s, v10.4s\n" - "and v25.16b, v7.16b, v19.16b\n" - "sqrdmulh v17.4s, v17.4s, v20.4s\n" - "smin v16.4s, v16.4s, v13.4s\n" - "smin v21.4s, v21.4s, v13.4s\n" + "sqadd v17.4s, v17.4s, v22.4s\n" + "sshr v2.4s, v2.4s, #0x1f\n" + "sqadd v10.4s, v10.4s, 
v21.4s\n" + "sshr v3.4s, v3.4s, #0x1f\n" + "sqadd v6.4s, v6.4s, v26.4s\n" "sshr v25.4s, v25.4s, #0x1f\n" - "smax v16.4s, v16.4s, v11.4s\n" - "smax v21.4s, v21.4s, v11.4s\n" - "sqadd v7.4s, v7.4s, v25.4s\n" - "and v31.16b, v17.16b, v12.16b\n" - "uzp1 v16.16b, v16.16b, v21.16b\n" - "sqrdmulh v8.4s, v8.4s, v6.4s\n" - "uzp1 v16.16b, v16.16b, v16.16b\n" - "str d16, [x16, x1]\n" - "srshl v7.4s, v7.4s, v19.4s\n" - "sshr v31.4s, v31.4s, #0x1f\n" - "and v24.16b, v8.16b, v19.16b\n" - "sqrdmulh v5.4s, v5.4s, v20.4s\n" - "sqadd v17.4s, v17.4s, v31.4s\n" - "add v7.4s, v7.4s, v10.4s\n" - "sshr v24.4s, v24.4s, #0x1f\n" - "and v1.16b, v5.16b, v12.16b\n" - "smin v7.4s, v7.4s, v13.4s\n" - "srshl v17.4s, v17.4s, v12.4s\n" - "sqadd v8.4s, v8.4s, v24.4s\n" - "smax v7.4s, v7.4s, v11.4s\n" - "sshr v1.4s, v1.4s, #0x1f\n" - "add v17.4s, v17.4s, v10.4s\n" - "srshl v8.4s, v8.4s, v19.4s\n" - "sqadd v5.4s, v5.4s, v1.4s\n" - "smin v17.4s, v17.4s, v13.4s\n" - "add v8.4s, v8.4s, v10.4s\n" - "smax v17.4s, v17.4s, v11.4s\n" - "srshl v5.4s, v5.4s, v12.4s\n" - "smin v8.4s, v8.4s, v13.4s\n" - "uzp1 v7.16b, v7.16b, v17.16b\n" - "add v5.4s, v5.4s, v10.4s\n" - "uzp1 v7.16b, v7.16b, v7.16b\n" - "str d7, [x6, x1]\n" - "smax v8.4s, v8.4s, v11.4s\n" - "smin v5.4s, v5.4s, v13.4s\n" - "smax v5.4s, v5.4s, v11.4s\n" - "uzp1 v8.16b, v8.16b, v5.16b\n" - "uzp1 v8.16b, v8.16b, v8.16b\n" - "str d8, [x8, x1]\n" - "add x1, x1, #0x8\n" + "srshl v15.4s, v15.4s, v19.4s\n" + "sqadd v16.4s, v16.4s, v4.4s\n" + "srshl v17.4s, v17.4s, v19.4s\n" + "sqadd v8.4s, v8.4s, v2.4s\n" + "srshl v10.4s, v10.4s, v19.4s\n" + "sqadd v7.4s, v7.4s, v3.4s\n" + "srshl v6.4s, v6.4s, v19.4s\n" + "sqadd v5.4s, v5.4s, v25.4s\n" + "srshl v16.4s, v16.4s, v29.4s\n" + "sqxtn v15.4h, v15.4s\n" + "srshl v8.4s, v8.4s, v29.4s\n" + "sqxtn v17.4h, v17.4s\n" + "srshl v7.4s, v7.4s, v29.4s\n" + "sqxtn v10.4h, v10.4s\n" + "srshl v5.4s, v5.4s, v29.4s\n" + "sqxtn v6.4h, v6.4s\n" + "sqxtn2 v15.8h, v16.4s\n" + "sqxtn2 v17.8h, v8.4s\n" + "sqxtn2 v10.8h, v7.4s\n" + "sqxtn2 v6.8h, v5.4s\n" + "sqadd v15.8h, v15.8h, v18.8h\n" + "sqadd v17.8h, v17.8h, v18.8h\n" + "sqadd v10.8h, v10.8h, v18.8h\n" + "sqadd v6.8h, v6.8h, v18.8h\n" + "smax v15.8h, v15.8h, v11.8h\n" + "smax v17.8h, v17.8h, v11.8h\n" + "smax v10.8h, v10.8h, v11.8h\n" + "smax v6.8h, v6.8h, v11.8h\n" + "smin v15.8h, v15.8h, v13.8h\n" + "smin v17.8h, v17.8h, v13.8h\n" + "smin v10.8h, v10.8h, v13.8h\n" + "smin v6.8h, v6.8h, v13.8h\n" + "uzp1 v15.16b, v15.16b, v15.16b\n" + "uzp1 v17.16b, v17.16b, v17.16b\n" + "str d15, [x16, x22]\n" + "uzp1 v10.16b, v10.16b, v10.16b\n" + "uzp1 v6.16b, v6.16b, v6.16b\n" + "str d17, [x8, x22]\n" + "str d10, [x4, x22]\n" + "str d6, [x7, x22]\n" + "add x22, x22, #0x8\n" "beq 124f\n" - "add x3, x3, #0xc8\n" + "add x23, x23, #0xc8\n" "3:" // Oddments - "ldr x12, [%x[params], %[offsetof_Params_bias]]\n" - "tbz x4, #2, 5f\n" - "ld1 { v15.4s }, [x12], #0x10\n" - "tbz x4, #1, 4f\n" - "ld1 { v18.d }[0], [x12], #0x8\n" - "tbz x4, #0, 7f\n" - "ld1 { v18.s }[2], [x12]\n" + "ldr x19, [%x[params], %[offsetof_Params_bias]]\n" + "tbz x0, #2, 5f\n" + "ld1 { v15.4s }, [x19], #0x10\n" + "tbz x0, #1, 4f\n" + "ld1 { v16.d }[0], [x19], #0x8\n" + "tbz x0, #0, 7f\n" + "ld1 { v16.s }[2], [x19]\n" "b 7f\n" "4:" // Oddments: Load bias: Bit 2: Bit 1: Unset - "tbz x4, #0, 7f\n" - "ld1 { v18.s }[0], [x12]\n" + "tbz x0, #0, 7f\n" + "ld1 { v16.s }[0], [x19]\n" "b 7f\n" "5:" // Oddments: Load bias: Bit 2: Unset - "tbz x4, #1, 6f\n" - "ld1 { v15.d }[0], [x12], #0x8\n" - "tbz x4, #0, 7f\n" - "ld1 { v15.s }[2], [x12]\n" + "tbz x0, #1, 6f\n" + 
"ld1 { v15.d }[0], [x19], #0x8\n" + "tbz x0, #0, 7f\n" + "ld1 { v15.s }[2], [x19]\n" "b 7f\n" "6:" // Oddments: Load bias: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 7f\n" - "ld1 { v15.s }[0], [x12]\n" + "tbz x0, #0, 7f\n" + "ld1 { v15.s }[0], [x19]\n" "7:" // Oddments: Load bias: Bit 2: End - "mov v16.16b, v15.16b\n" - "ldr d0, [x3, #0x0]\n" - "mov v21.16b, v18.16b\n" - "ldr d1, [x3, #0x8]\n" - "mov v7.16b, v15.16b\n" - "ldr d2, [x3, #0x10]\n" - "mov v17.16b, v18.16b\n" - "ldr d3, [x3, #0x18]\n" - "mov v8.16b, v15.16b\n" - "ldr d4, [x3, #0x20]\n" - "mov v5.16b, v18.16b\n" - "ldp x28, x27, [x25, #0x0]\n" + "ldr d0, [x23, #0x0]\n" + "ldr d1, [x23, #0x8]\n" + "mov v17.16b, v15.16b\n" + "mov v8.16b, v16.16b\n" + "ldr d2, [x23, #0x10]\n" + "ldr d3, [x23, #0x18]\n" + "mov v10.16b, v15.16b\n" + "mov v7.16b, v16.16b\n" + "ldr d4, [x23, #0x20]\n" + "ldp x28, x6, [x20, #0x0]\n" + "mov v6.16b, v15.16b\n" + "mov v5.16b, v16.16b\n" + "ldp x26, x25, [x20, #0x10]\n" + "ldp x5, x2, [x20, #0x20]\n" "ssubl v0.8h, v0.8b, v14.8b\n" - "ldp x26, x13, [x25, #0x10]\n" "ssubl v1.8h, v1.8b, v14.8b\n" + "ldp x27, x21, [x20, #0x30]\n" + "ldp x12, x19, [x20, #0x40]\n" "ssubl v2.8h, v2.8b, v14.8b\n" - "ldp x24, x23, [x25, #0x20]\n" "ssubl v3.8h, v3.8b, v14.8b\n" "ssubl v4.8h, v4.8b, v14.8b\n" - "ldp x22, x21, [x25, #0x30]\n" - "ldp x20, x0, [x25, #0x40]\n" - "add x28, x28, x10\n" - "add x27, x27, x10\n" - "add x26, x26, x10\n" - "add x13, x13, x10\n" - "add x24, x24, x10\n" - "add x23, x23, x10\n" - "add x22, x22, x10\n" - "add x21, x21, x10\n" - "add x20, x20, x10\n" - "add x0, x0, x10\n" - "tbz x4, #2, 9f\n" + "add x28, x28, x24\n" + "add x6, x6, x24\n" + "add x26, x26, x24\n" + "add x25, x25, x24\n" + "add x5, x5, x24\n" + "add x2, x2, x24\n" + "add x27, x27, x24\n" + "add x21, x21, x24\n" + "add x12, x12, x24\n" + "add x19, x19, x24\n" + "tbz x0, #2, 9f\n" "ld1 { v31.s }[0], [x28], #0x4\n" - "ld1 { v30.s }[0], [x27], #0x4\n" + "ld1 { v30.s }[0], [x6], #0x4\n" "ld1 { v29.s }[0], [x26], #0x4\n" - "ld1 { v28.s }[0], [x13], #0x4\n" - "ld1 { v27.s }[0], [x24], #0x4\n" - "ld1 { v23.s }[0], [x23], #0x4\n" - "ld1 { v25.s }[0], [x22], #0x4\n" + "ld1 { v28.s }[0], [x25], #0x4\n" + "ld1 { v27.s }[0], [x5], #0x4\n" + "ld1 { v23.s }[0], [x2], #0x4\n" + "ld1 { v25.s }[0], [x27], #0x4\n" "ld1 { v24.s }[0], [x21], #0x4\n" - "ld1 { v26.s }[0], [x20], #0x4\n" - "ld1 { v22.s }[0], [x0], #0x4\n" - "tbz x4, #1, 8f\n" + "ld1 { v26.s }[0], [x12], #0x4\n" + "ld1 { v22.s }[0], [x19], #0x4\n" + "tbz x0, #1, 8f\n" "ld1 { v31.h }[2], [x28], #0x2\n" - "ld1 { v30.h }[2], [x27], #0x2\n" + "ld1 { v30.h }[2], [x6], #0x2\n" "ld1 { v29.h }[2], [x26], #0x2\n" - "ld1 { v28.h }[2], [x13], #0x2\n" - "ld1 { v27.h }[2], [x24], #0x2\n" - "ld1 { v23.h }[2], [x23], #0x2\n" - "ld1 { v25.h }[2], [x22], #0x2\n" + "ld1 { v28.h }[2], [x25], #0x2\n" + "ld1 { v27.h }[2], [x5], #0x2\n" + "ld1 { v23.h }[2], [x2], #0x2\n" + "ld1 { v25.h }[2], [x27], #0x2\n" "ld1 { v24.h }[2], [x21], #0x2\n" - "ld1 { v26.h }[2], [x20], #0x2\n" - "ld1 { v22.h }[2], [x0], #0x2\n" - "tbz x4, #0, 11f\n" + "ld1 { v26.h }[2], [x12], #0x2\n" + "ld1 { v22.h }[2], [x19], #0x2\n" + "tbz x0, #0, 11f\n" "ld1 { v31.b }[6], [x28]\n" - "ld1 { v30.b }[6], [x27]\n" + "ld1 { v30.b }[6], [x6]\n" "ld1 { v29.b }[6], [x26]\n" - "ld1 { v28.b }[6], [x13]\n" - "ld1 { v27.b }[6], [x24]\n" - "ld1 { v23.b }[6], [x23]\n" - "ld1 { v25.b }[6], [x22]\n" + "ld1 { v28.b }[6], [x25]\n" + "ld1 { v27.b }[6], [x5]\n" + "ld1 { v23.b }[6], [x2]\n" + "ld1 { v25.b }[6], [x27]\n" "ld1 { v24.b }[6], [x21]\n" - "ld1 { v26.b }[6], 
[x20]\n" - "ld1 { v22.b }[6], [x0]\n" + "ld1 { v26.b }[6], [x12]\n" + "ld1 { v22.b }[6], [x19]\n" "b 11f\n" "8:" // Oddments: Initial loads: Bit 2: Bit 1: Unset - "tbz x4, #0, 11f\n" + "tbz x0, #0, 11f\n" "ld1 { v31.b }[4], [x28]\n" - "ld1 { v30.b }[4], [x27]\n" + "ld1 { v30.b }[4], [x6]\n" "ld1 { v29.b }[4], [x26]\n" - "ld1 { v28.b }[4], [x13]\n" - "ld1 { v27.b }[4], [x24]\n" - "ld1 { v23.b }[4], [x23]\n" - "ld1 { v25.b }[4], [x22]\n" + "ld1 { v28.b }[4], [x25]\n" + "ld1 { v27.b }[4], [x5]\n" + "ld1 { v23.b }[4], [x2]\n" + "ld1 { v25.b }[4], [x27]\n" "ld1 { v24.b }[4], [x21]\n" - "ld1 { v26.b }[4], [x20]\n" - "ld1 { v22.b }[4], [x0]\n" + "ld1 { v26.b }[4], [x12]\n" + "ld1 { v22.b }[4], [x19]\n" "b 11f\n" "9:" // Oddments: Initial loads: Bit 2: Unset - "tbz x4, #1, 10f\n" + "tbz x0, #1, 10f\n" "ld1 { v31.h }[0], [x28], #0x2\n" - "ld1 { v30.h }[0], [x27], #0x2\n" + "ld1 { v30.h }[0], [x6], #0x2\n" "ld1 { v29.h }[0], [x26], #0x2\n" - "ld1 { v28.h }[0], [x13], #0x2\n" - "ld1 { v27.h }[0], [x24], #0x2\n" - "ld1 { v23.h }[0], [x23], #0x2\n" - "ld1 { v25.h }[0], [x22], #0x2\n" + "ld1 { v28.h }[0], [x25], #0x2\n" + "ld1 { v27.h }[0], [x5], #0x2\n" + "ld1 { v23.h }[0], [x2], #0x2\n" + "ld1 { v25.h }[0], [x27], #0x2\n" "ld1 { v24.h }[0], [x21], #0x2\n" - "ld1 { v26.h }[0], [x20], #0x2\n" - "ld1 { v22.h }[0], [x0], #0x2\n" - "tbz x4, #0, 11f\n" + "ld1 { v26.h }[0], [x12], #0x2\n" + "ld1 { v22.h }[0], [x19], #0x2\n" + "tbz x0, #0, 11f\n" "ld1 { v31.b }[2], [x28]\n" - "ld1 { v30.b }[2], [x27]\n" + "ld1 { v30.b }[2], [x6]\n" "ld1 { v29.b }[2], [x26]\n" - "ld1 { v28.b }[2], [x13]\n" - "ld1 { v27.b }[2], [x24]\n" - "ld1 { v23.b }[2], [x23]\n" - "ld1 { v25.b }[2], [x22]\n" + "ld1 { v28.b }[2], [x25]\n" + "ld1 { v27.b }[2], [x5]\n" + "ld1 { v23.b }[2], [x2]\n" + "ld1 { v25.b }[2], [x27]\n" "ld1 { v24.b }[2], [x21]\n" - "ld1 { v26.b }[2], [x20]\n" - "ld1 { v22.b }[2], [x0]\n" + "ld1 { v26.b }[2], [x12]\n" + "ld1 { v22.b }[2], [x19]\n" "b 11f\n" "10:" // Oddments: Initial loads: Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 11f\n" + "tbz x0, #0, 11f\n" "ld1 { v31.b }[0], [x28]\n" - "ld1 { v30.b }[0], [x27]\n" + "ld1 { v30.b }[0], [x6]\n" "ld1 { v29.b }[0], [x26]\n" - "ld1 { v28.b }[0], [x13]\n" - "ld1 { v27.b }[0], [x24]\n" - "ld1 { v23.b }[0], [x23]\n" - "ld1 { v25.b }[0], [x22]\n" + "ld1 { v28.b }[0], [x25]\n" + "ld1 { v27.b }[0], [x5]\n" + "ld1 { v23.b }[0], [x2]\n" + "ld1 { v25.b }[0], [x27]\n" "ld1 { v24.b }[0], [x21]\n" - "ld1 { v26.b }[0], [x20]\n" - "ld1 { v22.b }[0], [x0]\n" + "ld1 { v26.b }[0], [x12]\n" + "ld1 { v22.b }[0], [x19]\n" "11:" // Oddments: Initial loads: Bit 2: End "usubl v31.8h, v31.8b, v9.8b\n" - "ldr x20, [x25, #0x50]\n" - "add x20, x20, x10\n" "usubl v30.8h, v30.8b, v9.8b\n" + "smlal v15.4s, v31.4h, v0.4h\n" + "ldr x19, [x20, #0x50]\n" "usubl v29.8h, v29.8b, v9.8b\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "smlal v10.4s, v29.4h, v0.4h\n" "usubl v28.8h, v28.8b, v9.8b\n" + "add x19, x19, x24\n" + "smlal2 v7.4s, v29.8h, v0.8h\n" "usubl v27.8h, v27.8b, v9.8b\n" + "smlal v6.4s, v28.4h, v0.4h\n" + "smlal2 v5.4s, v28.8h, v0.8h\n" + "smlal v15.4s, v30.4h, v1.4h\n" "usubl v23.8h, v23.8b, v9.8b\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "smlal v17.4s, v27.4h, v1.4h\n" "usubl v25.8h, v25.8b, v9.8b\n" + "smlal2 v8.4s, v27.8h, v1.8h\n" + "smlal v10.4s, v28.4h, v1.4h\n" "usubl v24.8h, v24.8b, v9.8b\n" + "smlal2 v7.4s, v28.8h, v1.8h\n" "usubl v26.8h, v26.8b, v9.8b\n" + "smlal v6.4s, v23.4h, v1.4h\n" "usubl v22.8h, v22.8b, v9.8b\n" - "smlal 
v15.4s, v31.4h, v0.4h\n" - "smlal2 v18.4s, v31.8h, v0.8h\n" - "smlal v16.4s, v30.4h, v0.4h\n" - "smlal2 v21.4s, v30.8h, v0.8h\n" - "smlal v7.4s, v29.4h, v0.4h\n" - "smlal2 v17.4s, v29.8h, v0.8h\n" - "smlal v8.4s, v28.4h, v0.4h\n" - "smlal2 v5.4s, v28.8h, v0.8h\n" - "smlal v15.4s, v30.4h, v1.4h\n" - "smlal2 v18.4s, v30.8h, v1.8h\n" - "smlal v16.4s, v27.4h, v1.4h\n" - "smlal2 v21.4s, v27.8h, v1.8h\n" - "smlal v7.4s, v28.4h, v1.4h\n" - "smlal2 v17.4s, v28.8h, v1.8h\n" - "smlal v8.4s, v23.4h, v1.4h\n" "smlal2 v5.4s, v23.8h, v1.8h\n" "smlal v15.4s, v27.4h, v2.4h\n" - "smlal2 v18.4s, v27.8h, v2.8h\n" - "smlal v16.4s, v25.4h, v2.4h\n" - "smlal2 v21.4s, v25.8h, v2.8h\n" - "smlal v7.4s, v23.4h, v2.4h\n" - "smlal2 v17.4s, v23.8h, v2.8h\n" - "tbz x4, #2, 13f\n" - "ld1 { v31.s }[0], [x20], #0x4\n" - "tbz x4, #1, 12f\n" - "ld1 { v31.h }[2], [x20], #0x2\n" - "tbz x4, #0, 15f\n" - "ld1 { v31.b }[6], [x20]\n" + "smlal2 v16.4s, v27.8h, v2.8h\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "smlal v10.4s, v23.4h, v2.4h\n" + "smlal2 v7.4s, v23.8h, v2.8h\n" + "tbz x0, #2, 13f\n" + "ld1 { v31.s }[0], [x19], #0x4\n" + "tbz x0, #1, 12f\n" + "ld1 { v31.h }[2], [x19], #0x2\n" + "tbz x0, #0, 15f\n" + "ld1 { v31.b }[6], [x19]\n" "b 15f\n" "12:" // Oddments: Load (1, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 15f\n" - "ld1 { v31.b }[4], [x20]\n" + "tbz x0, #0, 15f\n" + "ld1 { v31.b }[4], [x19]\n" "b 15f\n" "13:" // Oddments: Load (1, 3): Bit 2: Unset - "tbz x4, #1, 14f\n" - "ld1 { v31.h }[0], [x20], #0x2\n" - "tbz x4, #0, 15f\n" - "ld1 { v31.b }[2], [x20]\n" + "tbz x0, #1, 14f\n" + "ld1 { v31.h }[0], [x19], #0x2\n" + "tbz x0, #0, 15f\n" + "ld1 { v31.b }[2], [x19]\n" "b 15f\n" "14:" // Oddments: Load (1, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 15f\n" - "ld1 { v31.b }[0], [x20]\n" + "tbz x0, #0, 15f\n" + "ld1 { v31.b }[0], [x19]\n" "15:" // Oddments: Load (1, 3): Bit 2: End "usubl v31.8h, v31.8b, v9.8b\n" - "ldr x28, [x25, #0x58]\n" - "smlal v15.4s, v25.4h, v3.4h\n" - "add x28, x28, x10\n" - "smlal v8.4s, v31.4h, v2.4h\n" + "ldr x15, [x20, #0x58]\n" + "smlal v6.4s, v31.4h, v2.4h\n" "smlal2 v5.4s, v31.8h, v2.8h\n" - "smlal2 v18.4s, v25.8h, v3.8h\n" - "smlal v16.4s, v24.4h, v3.4h\n" - "smlal2 v21.4s, v24.8h, v3.8h\n" - "smlal v7.4s, v31.4h, v3.4h\n" - "smlal2 v17.4s, v31.8h, v3.8h\n" - "tbz x4, #2, 17f\n" - "ld1 { v30.s }[0], [x28], #0x4\n" - "tbz x4, #1, 16f\n" - "ld1 { v30.h }[2], [x28], #0x2\n" - "tbz x4, #0, 19f\n" - "ld1 { v30.b }[6], [x28]\n" + "smlal v15.4s, v25.4h, v3.4h\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "add x15, x15, x24\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "smlal v10.4s, v31.4h, v3.4h\n" + "smlal2 v7.4s, v31.8h, v3.8h\n" + "tbz x0, #2, 17f\n" + "ld1 { v30.s }[0], [x15], #0x4\n" + "tbz x0, #1, 16f\n" + "ld1 { v30.h }[2], [x15], #0x2\n" + "tbz x0, #0, 19f\n" + "ld1 { v30.b }[6], [x15]\n" "b 19f\n" "16:" // Oddments: Load (1, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 19f\n" - "ld1 { v30.b }[4], [x28]\n" + "tbz x0, #0, 19f\n" + "ld1 { v30.b }[4], [x15]\n" "b 19f\n" "17:" // Oddments: Load (1, 4): Bit 2: Unset - "tbz x4, #1, 18f\n" - "ld1 { v30.h }[0], [x28], #0x2\n" - "tbz x4, #0, 19f\n" - "ld1 { v30.b }[2], [x28]\n" + "tbz x0, #1, 18f\n" + "ld1 { v30.h }[0], [x15], #0x2\n" + "tbz x0, #0, 19f\n" + "ld1 { v30.b }[2], [x15]\n" "b 19f\n" "18:" // Oddments: Load (1, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 19f\n" - "ld1 { v30.b }[0], [x28]\n" + "tbz x0, #0, 19f\n" + "ld1 { v30.b }[0], [x15]\n" "19:" // Oddments: Load (1, 4): Bit 2: End "usubl 
v30.8h, v30.8b, v9.8b\n" - "ldr x0, [x25, #0x60]\n" - "smlal v15.4s, v24.4h, v4.4h\n" - "add x0, x0, x10\n" - "smlal v8.4s, v30.4h, v3.4h\n" + "ldr x19, [x20, #0x60]\n" + "smlal v6.4s, v30.4h, v3.4h\n" "smlal2 v5.4s, v30.8h, v3.8h\n" - "smlal2 v18.4s, v24.8h, v4.8h\n" - "tbz x4, #2, 21f\n" - "ld1 { v27.s }[0], [x0], #0x4\n" - "tbz x4, #1, 20f\n" - "ld1 { v27.h }[2], [x0], #0x2\n" - "tbz x4, #0, 23f\n" - "ld1 { v27.b }[6], [x0]\n" + "smlal v15.4s, v24.4h, v4.4h\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "add x19, x19, x24\n" + "tbz x0, #2, 21f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x0, #1, 20f\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x0, #0, 23f\n" + "ld1 { v27.b }[6], [x19]\n" "b 23f\n" "20:" // Oddments: Load (0, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 23f\n" - "ld1 { v27.b }[4], [x0]\n" + "tbz x0, #0, 23f\n" + "ld1 { v27.b }[4], [x19]\n" "b 23f\n" "21:" // Oddments: Load (0, 5): Bit 2: Unset - "tbz x4, #1, 22f\n" - "ld1 { v27.h }[0], [x0], #0x2\n" - "tbz x4, #0, 23f\n" - "ld1 { v27.b }[2], [x0]\n" + "tbz x0, #1, 22f\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x0, #0, 23f\n" + "ld1 { v27.b }[2], [x19]\n" "b 23f\n" "22:" // Oddments: Load (0, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 23f\n" - "ld1 { v27.b }[0], [x0]\n" + "tbz x0, #0, 23f\n" + "ld1 { v27.b }[0], [x19]\n" "23:" // Oddments: Load (0, 5): Bit 2: End "usubl v27.8h, v27.8b, v9.8b\n" - "ldr d0, [x3, #0x28]\n" - "smlal v7.4s, v30.4h, v4.4h\n" - "ldr x7, [x25, #0x68]\n" - "add x7, x7, x10\n" - "smlal v16.4s, v27.4h, v4.4h\n" - "smlal2 v21.4s, v27.8h, v4.8h\n" - "smlal2 v17.4s, v30.8h, v4.8h\n" - "smlal v8.4s, v26.4h, v4.4h\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" + "ldr d0, [x23, #0x28]\n" + "smlal v17.4s, v27.4h, v4.4h\n" + "smlal2 v8.4s, v27.8h, v4.8h\n" + "smlal v10.4s, v30.4h, v4.4h\n" + "smlal2 v7.4s, v30.8h, v4.8h\n" "ssubl v0.8h, v0.8b, v14.8b\n" + "ldr x27, [x20, #0x68]\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "add x27, x27, x24\n" "smlal v15.4s, v29.4h, v0.4h\n" - "smlal2 v18.4s, v29.8h, v0.8h\n" - "smlal v16.4s, v28.4h, v0.4h\n" - "smlal2 v21.4s, v28.8h, v0.8h\n" - "smlal v7.4s, v22.4h, v0.4h\n" - "smlal2 v17.4s, v22.8h, v0.8h\n" - "tbz x4, #2, 25f\n" - "ld1 { v25.s }[0], [x7], #0x4\n" - "tbz x4, #1, 24f\n" - "ld1 { v25.h }[2], [x7], #0x2\n" - "tbz x4, #0, 27f\n" - "ld1 { v25.b }[6], [x7]\n" + "smlal2 v16.4s, v29.8h, v0.8h\n" + "smlal v17.4s, v28.4h, v0.4h\n" + "smlal2 v8.4s, v28.8h, v0.8h\n" + "smlal v10.4s, v22.4h, v0.4h\n" + "smlal2 v7.4s, v22.8h, v0.8h\n" + "tbz x0, #2, 25f\n" + "ld1 { v25.s }[0], [x27], #0x4\n" + "tbz x0, #1, 24f\n" + "ld1 { v25.h }[2], [x27], #0x2\n" + "tbz x0, #0, 27f\n" + "ld1 { v25.b }[6], [x27]\n" "b 27f\n" "24:" // Oddments: Load (2, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 27f\n" - "ld1 { v25.b }[4], [x7]\n" + "tbz x0, #0, 27f\n" + "ld1 { v25.b }[4], [x27]\n" "b 27f\n" "25:" // Oddments: Load (2, 1): Bit 2: Unset - "tbz x4, #1, 26f\n" - "ld1 { v25.h }[0], [x7], #0x2\n" - "tbz x4, #0, 27f\n" - "ld1 { v25.b }[2], [x7]\n" + "tbz x0, #1, 26f\n" + "ld1 { v25.h }[0], [x27], #0x2\n" + "tbz x0, #0, 27f\n" + "ld1 { v25.b }[2], [x27]\n" "b 27f\n" "26:" // Oddments: Load (2, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 27f\n" - "ld1 { v25.b }[0], [x7]\n" + "tbz x0, #0, 27f\n" + "ld1 { v25.b }[0], [x27]\n" "27:" // Oddments: Load (2, 1): Bit 2: End + "ldr d1, [x23, #0x30]\n" "usubl v25.8h, v25.8b, v9.8b\n" - "ldr d1, [x3, #0x30]\n" - "smlal v8.4s, v25.4h, v0.4h\n" - "ldr x26, [x25, #0x70]\n" - "add x26, x26, x10\n" - "smlal2 v5.4s, v25.8h, v0.8h\n" "ssubl v1.8h, 
v1.8b, v14.8b\n" + "ldr x5, [x20, #0x70]\n" + "smlal v6.4s, v25.4h, v0.4h\n" + "smlal2 v5.4s, v25.8h, v0.8h\n" + "add x5, x5, x24\n" "smlal v15.4s, v28.4h, v1.4h\n" - "smlal2 v18.4s, v28.8h, v1.8h\n" - "smlal v16.4s, v23.4h, v1.4h\n" - "smlal2 v21.4s, v23.8h, v1.8h\n" - "smlal v7.4s, v25.4h, v1.4h\n" - "smlal2 v17.4s, v25.8h, v1.8h\n" - "tbz x4, #2, 29f\n" - "ld1 { v24.s }[0], [x26], #0x4\n" - "tbz x4, #1, 28f\n" - "ld1 { v24.h }[2], [x26], #0x2\n" - "tbz x4, #0, 31f\n" - "ld1 { v24.b }[6], [x26]\n" + "smlal2 v16.4s, v28.8h, v1.8h\n" + "smlal v17.4s, v23.4h, v1.4h\n" + "smlal2 v8.4s, v23.8h, v1.8h\n" + "smlal v10.4s, v25.4h, v1.4h\n" + "smlal2 v7.4s, v25.8h, v1.8h\n" + "tbz x0, #2, 29f\n" + "ld1 { v24.s }[0], [x5], #0x4\n" + "tbz x0, #1, 28f\n" + "ld1 { v24.h }[2], [x5], #0x2\n" + "tbz x0, #0, 31f\n" + "ld1 { v24.b }[6], [x5]\n" "b 31f\n" "28:" // Oddments: Load (2, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 31f\n" - "ld1 { v24.b }[4], [x26]\n" + "tbz x0, #0, 31f\n" + "ld1 { v24.b }[4], [x5]\n" "b 31f\n" "29:" // Oddments: Load (2, 2): Bit 2: Unset - "tbz x4, #1, 30f\n" - "ld1 { v24.h }[0], [x26], #0x2\n" - "tbz x4, #0, 31f\n" - "ld1 { v24.b }[2], [x26]\n" + "tbz x0, #1, 30f\n" + "ld1 { v24.h }[0], [x5], #0x2\n" + "tbz x0, #0, 31f\n" + "ld1 { v24.b }[2], [x5]\n" "b 31f\n" "30:" // Oddments: Load (2, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 31f\n" - "ld1 { v24.b }[0], [x26]\n" + "tbz x0, #0, 31f\n" + "ld1 { v24.b }[0], [x5]\n" "31:" // Oddments: Load (2, 2): Bit 2: End + "ldr d2, [x23, #0x38]\n" "usubl v24.8h, v24.8b, v9.8b\n" - "ldr d2, [x3, #0x38]\n" - "smlal v8.4s, v24.4h, v1.4h\n" - "ldr x23, [x25, #0x78]\n" - "add x23, x23, x10\n" - "smlal2 v5.4s, v24.8h, v1.8h\n" "ssubl v2.8h, v2.8b, v14.8b\n" + "ldr x11, [x20, #0x78]\n" + "smlal v6.4s, v24.4h, v1.4h\n" + "smlal2 v5.4s, v24.8h, v1.8h\n" + "add x11, x11, x24\n" "smlal v15.4s, v23.4h, v2.4h\n" - "smlal2 v18.4s, v23.8h, v2.8h\n" - "smlal v16.4s, v31.4h, v2.4h\n" - "smlal2 v21.4s, v31.8h, v2.8h\n" - "smlal v7.4s, v24.4h, v2.4h\n" - "smlal2 v17.4s, v24.8h, v2.8h\n" - "tbz x4, #2, 33f\n" - "ld1 { v27.s }[0], [x23], #0x4\n" - "tbz x4, #1, 32f\n" - "ld1 { v27.h }[2], [x23], #0x2\n" - "tbz x4, #0, 35f\n" - "ld1 { v27.b }[6], [x23]\n" + "smlal2 v16.4s, v23.8h, v2.8h\n" + "smlal v17.4s, v31.4h, v2.4h\n" + "smlal2 v8.4s, v31.8h, v2.8h\n" + "smlal v10.4s, v24.4h, v2.4h\n" + "smlal2 v7.4s, v24.8h, v2.8h\n" + "tbz x0, #2, 33f\n" + "ld1 { v27.s }[0], [x11], #0x4\n" + "tbz x0, #1, 32f\n" + "ld1 { v27.h }[2], [x11], #0x2\n" + "tbz x0, #0, 35f\n" + "ld1 { v27.b }[6], [x11]\n" "b 35f\n" "32:" // Oddments: Load (2, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 35f\n" - "ld1 { v27.b }[4], [x23]\n" + "tbz x0, #0, 35f\n" + "ld1 { v27.b }[4], [x11]\n" "b 35f\n" "33:" // Oddments: Load (2, 3): Bit 2: Unset - "tbz x4, #1, 34f\n" - "ld1 { v27.h }[0], [x23], #0x2\n" - "tbz x4, #0, 35f\n" - "ld1 { v27.b }[2], [x23]\n" + "tbz x0, #1, 34f\n" + "ld1 { v27.h }[0], [x11], #0x2\n" + "tbz x0, #0, 35f\n" + "ld1 { v27.b }[2], [x11]\n" "b 35f\n" "34:" // Oddments: Load (2, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 35f\n" - "ld1 { v27.b }[0], [x23]\n" + "tbz x0, #0, 35f\n" + "ld1 { v27.b }[0], [x11]\n" "35:" // Oddments: Load (2, 3): Bit 2: End + "ldr d3, [x23, #0x40]\n" "usubl v27.8h, v27.8b, v9.8b\n" - "ldr d3, [x3, #0x40]\n" - "smlal v8.4s, v27.4h, v2.4h\n" - "ldr x20, [x25, #0x80]\n" - "add x20, x20, x10\n" - "smlal2 v5.4s, v27.8h, v2.8h\n" "ssubl v3.8h, v3.8b, v14.8b\n" + "ldr x12, [x20, #0x80]\n" + "smlal v6.4s, v27.4h, v2.4h\n" + "smlal2 v5.4s, v27.8h, v2.8h\n" + "add 
x12, x12, x24\n" "smlal v15.4s, v31.4h, v3.4h\n" - "smlal2 v18.4s, v31.8h, v3.8h\n" - "smlal v16.4s, v30.4h, v3.4h\n" - "smlal2 v21.4s, v30.8h, v3.8h\n" - "smlal v7.4s, v27.4h, v3.4h\n" - "smlal2 v17.4s, v27.8h, v3.8h\n" - "tbz x4, #2, 37f\n" - "ld1 { v23.s }[0], [x20], #0x4\n" - "tbz x4, #1, 36f\n" - "ld1 { v23.h }[2], [x20], #0x2\n" - "tbz x4, #0, 39f\n" - "ld1 { v23.b }[6], [x20]\n" + "smlal2 v16.4s, v31.8h, v3.8h\n" + "smlal v17.4s, v30.4h, v3.4h\n" + "smlal2 v8.4s, v30.8h, v3.8h\n" + "smlal v10.4s, v27.4h, v3.4h\n" + "smlal2 v7.4s, v27.8h, v3.8h\n" + "tbz x0, #2, 37f\n" + "ld1 { v23.s }[0], [x12], #0x4\n" + "tbz x0, #1, 36f\n" + "ld1 { v23.h }[2], [x12], #0x2\n" + "tbz x0, #0, 39f\n" + "ld1 { v23.b }[6], [x12]\n" "b 39f\n" "36:" // Oddments: Load (2, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 39f\n" - "ld1 { v23.b }[4], [x20]\n" + "tbz x0, #0, 39f\n" + "ld1 { v23.b }[4], [x12]\n" "b 39f\n" "37:" // Oddments: Load (2, 4): Bit 2: Unset - "tbz x4, #1, 38f\n" - "ld1 { v23.h }[0], [x20], #0x2\n" - "tbz x4, #0, 39f\n" - "ld1 { v23.b }[2], [x20]\n" + "tbz x0, #1, 38f\n" + "ld1 { v23.h }[0], [x12], #0x2\n" + "tbz x0, #0, 39f\n" + "ld1 { v23.b }[2], [x12]\n" "b 39f\n" "38:" // Oddments: Load (2, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 39f\n" - "ld1 { v23.b }[0], [x20]\n" + "tbz x0, #0, 39f\n" + "ld1 { v23.b }[0], [x12]\n" "39:" // Oddments: Load (2, 4): Bit 2: End + "ldr d4, [x23, #0x48]\n" "usubl v23.8h, v23.8b, v9.8b\n" - "ldr d4, [x3, #0x48]\n" - "smlal v8.4s, v23.4h, v3.4h\n" - "ldr x22, [x25, #0x88]\n" - "add x22, x22, x10\n" - "smlal2 v5.4s, v23.8h, v3.8h\n" "ssubl v4.8h, v4.8b, v14.8b\n" + "ldr x26, [x20, #0x88]\n" + "smlal v6.4s, v23.4h, v3.4h\n" + "smlal2 v5.4s, v23.8h, v3.8h\n" + "add x26, x26, x24\n" "smlal v15.4s, v30.4h, v4.4h\n" - "smlal2 v18.4s, v30.8h, v4.8h\n" - "smlal v16.4s, v26.4h, v4.4h\n" - "smlal2 v21.4s, v26.8h, v4.8h\n" - "smlal v7.4s, v23.4h, v4.4h\n" - "smlal2 v17.4s, v23.8h, v4.8h\n" - "tbz x4, #2, 41f\n" - "ld1 { v28.s }[0], [x22], #0x4\n" - "tbz x4, #1, 40f\n" - "ld1 { v28.h }[2], [x22], #0x2\n" - "tbz x4, #0, 43f\n" - "ld1 { v28.b }[6], [x22]\n" + "smlal2 v16.4s, v30.8h, v4.8h\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "smlal v10.4s, v23.4h, v4.4h\n" + "smlal2 v7.4s, v23.8h, v4.8h\n" + "tbz x0, #2, 41f\n" + "ld1 { v28.s }[0], [x26], #0x4\n" + "tbz x0, #1, 40f\n" + "ld1 { v28.h }[2], [x26], #0x2\n" + "tbz x0, #0, 43f\n" + "ld1 { v28.b }[6], [x26]\n" "b 43f\n" "40:" // Oddments: Load (2, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 43f\n" - "ld1 { v28.b }[4], [x22]\n" + "tbz x0, #0, 43f\n" + "ld1 { v28.b }[4], [x26]\n" "b 43f\n" "41:" // Oddments: Load (2, 5): Bit 2: Unset - "tbz x4, #1, 42f\n" - "ld1 { v28.h }[0], [x22], #0x2\n" - "tbz x4, #0, 43f\n" - "ld1 { v28.b }[2], [x22]\n" + "tbz x0, #1, 42f\n" + "ld1 { v28.h }[0], [x26], #0x2\n" + "tbz x0, #0, 43f\n" + "ld1 { v28.b }[2], [x26]\n" "b 43f\n" "42:" // Oddments: Load (2, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 43f\n" - "ld1 { v28.b }[0], [x22]\n" + "tbz x0, #0, 43f\n" + "ld1 { v28.b }[0], [x26]\n" "43:" // Oddments: Load (2, 5): Bit 2: End + "ldr d0, [x23, #0x50]\n" "usubl v28.8h, v28.8b, v9.8b\n" - "ldr d0, [x3, #0x50]\n" - "smlal v8.4s, v28.4h, v4.4h\n" - "ldr x13, [x25, #0x90]\n" - "add x13, x13, x10\n" - "smlal2 v5.4s, v28.8h, v4.8h\n" "ssubl v0.8h, v0.8b, v14.8b\n" + "ldr x14, [x20, #0x90]\n" + "smlal v6.4s, v28.4h, v4.4h\n" + "smlal2 v5.4s, v28.8h, v4.8h\n" + "add x14, x14, x24\n" "smlal v15.4s, v22.4h, v0.4h\n" - "smlal2 v18.4s, v22.8h, v0.8h\n" - "smlal v16.4s, v25.4h, 
v0.4h\n" - "smlal2 v21.4s, v25.8h, v0.8h\n" - "tbz x4, #2, 45f\n" - "ld1 { v31.s }[0], [x13], #0x4\n" - "tbz x4, #1, 44f\n" - "ld1 { v31.h }[2], [x13], #0x2\n" - "tbz x4, #0, 47f\n" - "ld1 { v31.b }[6], [x13]\n" + "smlal2 v16.4s, v22.8h, v0.8h\n" + "smlal v17.4s, v25.4h, v0.4h\n" + "smlal2 v8.4s, v25.8h, v0.8h\n" + "tbz x0, #2, 45f\n" + "ld1 { v31.s }[0], [x14], #0x4\n" + "tbz x0, #1, 44f\n" + "ld1 { v31.h }[2], [x14], #0x2\n" + "tbz x0, #0, 47f\n" + "ld1 { v31.b }[6], [x14]\n" "b 47f\n" "44:" // Oddments: Load (3, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 47f\n" - "ld1 { v31.b }[4], [x13]\n" + "tbz x0, #0, 47f\n" + "ld1 { v31.b }[4], [x14]\n" "b 47f\n" "45:" // Oddments: Load (3, 0): Bit 2: Unset - "tbz x4, #1, 46f\n" - "ld1 { v31.h }[0], [x13], #0x2\n" - "tbz x4, #0, 47f\n" - "ld1 { v31.b }[2], [x13]\n" + "tbz x0, #1, 46f\n" + "ld1 { v31.h }[0], [x14], #0x2\n" + "tbz x0, #0, 47f\n" + "ld1 { v31.b }[2], [x14]\n" "b 47f\n" "46:" // Oddments: Load (3, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 47f\n" - "ld1 { v31.b }[0], [x13]\n" + "tbz x0, #0, 47f\n" + "ld1 { v31.b }[0], [x14]\n" "47:" // Oddments: Load (3, 0): Bit 2: End "usubl v31.8h, v31.8b, v9.8b\n" - "ldr x21, [x25, #0x98]\n" - "smlal v7.4s, v31.4h, v0.4h\n" - "add x21, x21, x10\n" - "smlal2 v17.4s, v31.8h, v0.8h\n" - "tbz x4, #2, 49f\n" - "ld1 { v30.s }[0], [x21], #0x4\n" - "tbz x4, #1, 48f\n" - "ld1 { v30.h }[2], [x21], #0x2\n" - "tbz x4, #0, 51f\n" - "ld1 { v30.b }[6], [x21]\n" + "ldr x15, [x20, #0x98]\n" + "smlal v10.4s, v31.4h, v0.4h\n" + "smlal2 v7.4s, v31.8h, v0.8h\n" + "add x15, x15, x24\n" + "tbz x0, #2, 49f\n" + "ld1 { v30.s }[0], [x15], #0x4\n" + "tbz x0, #1, 48f\n" + "ld1 { v30.h }[2], [x15], #0x2\n" + "tbz x0, #0, 51f\n" + "ld1 { v30.b }[6], [x15]\n" "b 51f\n" "48:" // Oddments: Load (3, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 51f\n" - "ld1 { v30.b }[4], [x21]\n" + "tbz x0, #0, 51f\n" + "ld1 { v30.b }[4], [x15]\n" "b 51f\n" "49:" // Oddments: Load (3, 1): Bit 2: Unset - "tbz x4, #1, 50f\n" - "ld1 { v30.h }[0], [x21], #0x2\n" - "tbz x4, #0, 51f\n" - "ld1 { v30.b }[2], [x21]\n" + "tbz x0, #1, 50f\n" + "ld1 { v30.h }[0], [x15], #0x2\n" + "tbz x0, #0, 51f\n" + "ld1 { v30.b }[2], [x15]\n" "b 51f\n" "50:" // Oddments: Load (3, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 51f\n" - "ld1 { v30.b }[0], [x21]\n" + "tbz x0, #0, 51f\n" + "ld1 { v30.b }[0], [x15]\n" "51:" // Oddments: Load (3, 1): Bit 2: End + "ldr d1, [x23, #0x58]\n" "usubl v30.8h, v30.8b, v9.8b\n" - "ldr d1, [x3, #0x58]\n" - "smlal v8.4s, v30.4h, v0.4h\n" - "ldr x14, [x25, #0xa0]\n" - "add x14, x14, x10\n" - "smlal2 v5.4s, v30.8h, v0.8h\n" "ssubl v1.8h, v1.8b, v14.8b\n" + "ldr x21, [x20, #0xa0]\n" + "smlal v6.4s, v30.4h, v0.4h\n" + "smlal2 v5.4s, v30.8h, v0.8h\n" + "add x21, x21, x24\n" "smlal v15.4s, v25.4h, v1.4h\n" - "smlal2 v18.4s, v25.8h, v1.8h\n" - "smlal v16.4s, v24.4h, v1.4h\n" - "smlal2 v21.4s, v24.8h, v1.8h\n" - "smlal v7.4s, v30.4h, v1.4h\n" - "smlal2 v17.4s, v30.8h, v1.8h\n" - "tbz x4, #2, 53f\n" - "ld1 { v26.s }[0], [x14], #0x4\n" - "tbz x4, #1, 52f\n" - "ld1 { v26.h }[2], [x14], #0x2\n" - "tbz x4, #0, 55f\n" - "ld1 { v26.b }[6], [x14]\n" + "smlal2 v16.4s, v25.8h, v1.8h\n" + "smlal v17.4s, v24.4h, v1.4h\n" + "smlal2 v8.4s, v24.8h, v1.8h\n" + "smlal v10.4s, v30.4h, v1.4h\n" + "smlal2 v7.4s, v30.8h, v1.8h\n" + "tbz x0, #2, 53f\n" + "ld1 { v26.s }[0], [x21], #0x4\n" + "tbz x0, #1, 52f\n" + "ld1 { v26.h }[2], [x21], #0x2\n" + "tbz x0, #0, 55f\n" + "ld1 { v26.b }[6], [x21]\n" "b 55f\n" "52:" // Oddments: Load (3, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 55f\n" 
- "ld1 { v26.b }[4], [x14]\n" + "tbz x0, #0, 55f\n" + "ld1 { v26.b }[4], [x21]\n" "b 55f\n" "53:" // Oddments: Load (3, 2): Bit 2: Unset - "tbz x4, #1, 54f\n" - "ld1 { v26.h }[0], [x14], #0x2\n" - "tbz x4, #0, 55f\n" - "ld1 { v26.b }[2], [x14]\n" + "tbz x0, #1, 54f\n" + "ld1 { v26.h }[0], [x21], #0x2\n" + "tbz x0, #0, 55f\n" + "ld1 { v26.b }[2], [x21]\n" "b 55f\n" "54:" // Oddments: Load (3, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 55f\n" - "ld1 { v26.b }[0], [x14]\n" + "tbz x0, #0, 55f\n" + "ld1 { v26.b }[0], [x21]\n" "55:" // Oddments: Load (3, 2): Bit 2: End + "ldr d2, [x23, #0x60]\n" "usubl v26.8h, v26.8b, v9.8b\n" - "ldr d2, [x3, #0x60]\n" - "smlal v8.4s, v26.4h, v1.4h\n" - "ldr x11, [x25, #0xa8]\n" - "add x11, x11, x10\n" - "smlal2 v5.4s, v26.8h, v1.8h\n" "ssubl v2.8h, v2.8b, v14.8b\n" + "ldr x2, [x20, #0xa8]\n" + "smlal v6.4s, v26.4h, v1.4h\n" + "smlal2 v5.4s, v26.8h, v1.8h\n" + "add x2, x2, x24\n" "smlal v15.4s, v24.4h, v2.4h\n" - "smlal2 v18.4s, v24.8h, v2.8h\n" - "smlal v16.4s, v27.4h, v2.4h\n" - "smlal2 v21.4s, v27.8h, v2.8h\n" - "smlal v7.4s, v26.4h, v2.4h\n" - "smlal2 v17.4s, v26.8h, v2.8h\n" - "tbz x4, #2, 57f\n" - "ld1 { v25.s }[0], [x11], #0x4\n" - "tbz x4, #1, 56f\n" - "ld1 { v25.h }[2], [x11], #0x2\n" - "tbz x4, #0, 59f\n" - "ld1 { v25.b }[6], [x11]\n" + "smlal2 v16.4s, v24.8h, v2.8h\n" + "smlal v17.4s, v27.4h, v2.4h\n" + "smlal2 v8.4s, v27.8h, v2.8h\n" + "smlal v10.4s, v26.4h, v2.4h\n" + "smlal2 v7.4s, v26.8h, v2.8h\n" + "tbz x0, #2, 57f\n" + "ld1 { v25.s }[0], [x2], #0x4\n" + "tbz x0, #1, 56f\n" + "ld1 { v25.h }[2], [x2], #0x2\n" + "tbz x0, #0, 59f\n" + "ld1 { v25.b }[6], [x2]\n" "b 59f\n" "56:" // Oddments: Load (3, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 59f\n" - "ld1 { v25.b }[4], [x11]\n" + "tbz x0, #0, 59f\n" + "ld1 { v25.b }[4], [x2]\n" "b 59f\n" "57:" // Oddments: Load (3, 3): Bit 2: Unset - "tbz x4, #1, 58f\n" - "ld1 { v25.h }[0], [x11], #0x2\n" - "tbz x4, #0, 59f\n" - "ld1 { v25.b }[2], [x11]\n" + "tbz x0, #1, 58f\n" + "ld1 { v25.h }[0], [x2], #0x2\n" + "tbz x0, #0, 59f\n" + "ld1 { v25.b }[2], [x2]\n" "b 59f\n" "58:" // Oddments: Load (3, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 59f\n" - "ld1 { v25.b }[0], [x11]\n" + "tbz x0, #0, 59f\n" + "ld1 { v25.b }[0], [x2]\n" "59:" // Oddments: Load (3, 3): Bit 2: End + "ldr d3, [x23, #0x68]\n" "usubl v25.8h, v25.8b, v9.8b\n" - "ldr d3, [x3, #0x68]\n" - "smlal v8.4s, v25.4h, v2.4h\n" - "ldr x24, [x25, #0xb0]\n" - "add x24, x24, x10\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" "ssubl v3.8h, v3.8b, v14.8b\n" + "ldr x13, [x20, #0xb0]\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "add x13, x13, x24\n" "smlal v15.4s, v27.4h, v3.4h\n" - "smlal2 v18.4s, v27.8h, v3.8h\n" - "smlal v16.4s, v23.4h, v3.4h\n" - "smlal2 v21.4s, v23.8h, v3.8h\n" - "smlal v7.4s, v25.4h, v3.4h\n" - "smlal2 v17.4s, v25.8h, v3.8h\n" - "tbz x4, #2, 61f\n" - "ld1 { v24.s }[0], [x24], #0x4\n" - "tbz x4, #1, 60f\n" - "ld1 { v24.h }[2], [x24], #0x2\n" - "tbz x4, #0, 63f\n" - "ld1 { v24.b }[6], [x24]\n" + "smlal2 v16.4s, v27.8h, v3.8h\n" + "smlal v17.4s, v23.4h, v3.4h\n" + "smlal2 v8.4s, v23.8h, v3.8h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" + "tbz x0, #2, 61f\n" + "ld1 { v24.s }[0], [x13], #0x4\n" + "tbz x0, #1, 60f\n" + "ld1 { v24.h }[2], [x13], #0x2\n" + "tbz x0, #0, 63f\n" + "ld1 { v24.b }[6], [x13]\n" "b 63f\n" "60:" // Oddments: Load (3, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 63f\n" - "ld1 { v24.b }[4], [x24]\n" + "tbz x0, #0, 63f\n" + "ld1 { v24.b }[4], [x13]\n" "b 63f\n" "61:" // Oddments: Load (3, 
4): Bit 2: Unset - "tbz x4, #1, 62f\n" - "ld1 { v24.h }[0], [x24], #0x2\n" - "tbz x4, #0, 63f\n" - "ld1 { v24.b }[2], [x24]\n" + "tbz x0, #1, 62f\n" + "ld1 { v24.h }[0], [x13], #0x2\n" + "tbz x0, #0, 63f\n" + "ld1 { v24.b }[2], [x13]\n" "b 63f\n" "62:" // Oddments: Load (3, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 63f\n" - "ld1 { v24.b }[0], [x24]\n" + "tbz x0, #0, 63f\n" + "ld1 { v24.b }[0], [x13]\n" "63:" // Oddments: Load (3, 4): Bit 2: End + "ldr d4, [x23, #0x70]\n" "usubl v24.8h, v24.8b, v9.8b\n" - "ldr d4, [x3, #0x70]\n" - "smlal v8.4s, v24.4h, v3.4h\n" - "ldr x0, [x25, #0xb8]\n" - "add x0, x0, x10\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" "ssubl v4.8h, v4.8b, v14.8b\n" + "ldr x9, [x20, #0xb8]\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "add x9, x9, x24\n" "smlal v15.4s, v23.4h, v4.4h\n" - "smlal2 v18.4s, v23.8h, v4.8h\n" - "smlal v16.4s, v28.4h, v4.4h\n" - "smlal2 v21.4s, v28.8h, v4.8h\n" - "smlal v7.4s, v24.4h, v4.4h\n" - "smlal2 v17.4s, v24.8h, v4.8h\n" - "tbz x4, #2, 65f\n" - "ld1 { v22.s }[0], [x0], #0x4\n" - "tbz x4, #1, 64f\n" - "ld1 { v22.h }[2], [x0], #0x2\n" - "tbz x4, #0, 67f\n" - "ld1 { v22.b }[6], [x0]\n" + "smlal2 v16.4s, v23.8h, v4.8h\n" + "smlal v17.4s, v28.4h, v4.4h\n" + "smlal2 v8.4s, v28.8h, v4.8h\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" + "tbz x0, #2, 65f\n" + "ld1 { v22.s }[0], [x9], #0x4\n" + "tbz x0, #1, 64f\n" + "ld1 { v22.h }[2], [x9], #0x2\n" + "tbz x0, #0, 67f\n" + "ld1 { v22.b }[6], [x9]\n" "b 67f\n" "64:" // Oddments: Load (3, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 67f\n" - "ld1 { v22.b }[4], [x0]\n" + "tbz x0, #0, 67f\n" + "ld1 { v22.b }[4], [x9]\n" "b 67f\n" "65:" // Oddments: Load (3, 5): Bit 2: Unset - "tbz x4, #1, 66f\n" - "ld1 { v22.h }[0], [x0], #0x2\n" - "tbz x4, #0, 67f\n" - "ld1 { v22.b }[2], [x0]\n" + "tbz x0, #1, 66f\n" + "ld1 { v22.h }[0], [x9], #0x2\n" + "tbz x0, #0, 67f\n" + "ld1 { v22.b }[2], [x9]\n" "b 67f\n" "66:" // Oddments: Load (3, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 67f\n" - "ld1 { v22.b }[0], [x0]\n" + "tbz x0, #0, 67f\n" + "ld1 { v22.b }[0], [x9]\n" "67:" // Oddments: Load (3, 5): Bit 2: End + "ldr d0, [x23, #0x78]\n" "usubl v22.8h, v22.8b, v9.8b\n" - "ldr d0, [x3, #0x78]\n" - "smlal v8.4s, v22.4h, v4.4h\n" - "ldr x15, [x25, #0xc0]\n" - "add x15, x15, x10\n" - "smlal2 v5.4s, v22.8h, v4.8h\n" "ssubl v0.8h, v0.8b, v14.8b\n" + "ldr x19, [x20, #0xc0]\n" + "smlal v6.4s, v22.4h, v4.4h\n" + "smlal2 v5.4s, v22.8h, v4.8h\n" + "add x19, x19, x24\n" "smlal v15.4s, v31.4h, v0.4h\n" - "smlal2 v18.4s, v31.8h, v0.8h\n" - "smlal v16.4s, v30.4h, v0.4h\n" - "smlal2 v21.4s, v30.8h, v0.8h\n" - "tbz x4, #2, 69f\n" - "ld1 { v27.s }[0], [x15], #0x4\n" - "tbz x4, #1, 68f\n" - "ld1 { v27.h }[2], [x15], #0x2\n" - "tbz x4, #0, 71f\n" - "ld1 { v27.b }[6], [x15]\n" + "smlal2 v16.4s, v31.8h, v0.8h\n" + "smlal v17.4s, v30.4h, v0.4h\n" + "smlal2 v8.4s, v30.8h, v0.8h\n" + "tbz x0, #2, 69f\n" + "ld1 { v27.s }[0], [x19], #0x4\n" + "tbz x0, #1, 68f\n" + "ld1 { v27.h }[2], [x19], #0x2\n" + "tbz x0, #0, 71f\n" + "ld1 { v27.b }[6], [x19]\n" "b 71f\n" "68:" // Oddments: Load (4, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 71f\n" - "ld1 { v27.b }[4], [x15]\n" + "tbz x0, #0, 71f\n" + "ld1 { v27.b }[4], [x19]\n" "b 71f\n" "69:" // Oddments: Load (4, 0): Bit 2: Unset - "tbz x4, #1, 70f\n" - "ld1 { v27.h }[0], [x15], #0x2\n" - "tbz x4, #0, 71f\n" - "ld1 { v27.b }[2], [x15]\n" + "tbz x0, #1, 70f\n" + "ld1 { v27.h }[0], [x19], #0x2\n" + "tbz x0, #0, 71f\n" + "ld1 { v27.b }[2], [x19]\n" "b 71f\n" "70:" // Oddments: 
Load (4, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 71f\n" - "ld1 { v27.b }[0], [x15]\n" + "tbz x0, #0, 71f\n" + "ld1 { v27.b }[0], [x19]\n" "71:" // Oddments: Load (4, 0): Bit 2: End "usubl v27.8h, v27.8b, v9.8b\n" - "ldr x9, [x25, #0xc8]\n" - "smlal v7.4s, v27.4h, v0.4h\n" - "add x9, x9, x10\n" - "smlal2 v17.4s, v27.8h, v0.8h\n" - "tbz x4, #2, 73f\n" - "ld1 { v23.s }[0], [x9], #0x4\n" - "tbz x4, #1, 72f\n" - "ld1 { v23.h }[2], [x9], #0x2\n" - "tbz x4, #0, 75f\n" - "ld1 { v23.b }[6], [x9]\n" + "ldr x28, [x20, #0xc8]\n" + "smlal v10.4s, v27.4h, v0.4h\n" + "smlal2 v7.4s, v27.8h, v0.8h\n" + "add x28, x28, x24\n" + "tbz x0, #2, 73f\n" + "ld1 { v23.s }[0], [x28], #0x4\n" + "tbz x0, #1, 72f\n" + "ld1 { v23.h }[2], [x28], #0x2\n" + "tbz x0, #0, 75f\n" + "ld1 { v23.b }[6], [x28]\n" "b 75f\n" "72:" // Oddments: Load (4, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 75f\n" - "ld1 { v23.b }[4], [x9]\n" + "tbz x0, #0, 75f\n" + "ld1 { v23.b }[4], [x28]\n" "b 75f\n" "73:" // Oddments: Load (4, 1): Bit 2: Unset - "tbz x4, #1, 74f\n" - "ld1 { v23.h }[0], [x9], #0x2\n" - "tbz x4, #0, 75f\n" - "ld1 { v23.b }[2], [x9]\n" + "tbz x0, #1, 74f\n" + "ld1 { v23.h }[0], [x28], #0x2\n" + "tbz x0, #0, 75f\n" + "ld1 { v23.b }[2], [x28]\n" "b 75f\n" "74:" // Oddments: Load (4, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 75f\n" - "ld1 { v23.b }[0], [x9]\n" + "tbz x0, #0, 75f\n" + "ld1 { v23.b }[0], [x28]\n" "75:" // Oddments: Load (4, 1): Bit 2: End + "ldr d1, [x23, #0x80]\n" "usubl v23.8h, v23.8b, v9.8b\n" - "ldr d1, [x3, #0x80]\n" - "smlal v8.4s, v23.4h, v0.4h\n" - "ldr x27, [x25, #0xd0]\n" - "add x27, x27, x10\n" - "smlal2 v5.4s, v23.8h, v0.8h\n" "ssubl v1.8h, v1.8b, v14.8b\n" + "ldr x6, [x20, #0xd0]\n" + "smlal v6.4s, v23.4h, v0.4h\n" + "smlal2 v5.4s, v23.8h, v0.8h\n" + "add x6, x6, x24\n" "smlal v15.4s, v30.4h, v1.4h\n" - "smlal2 v18.4s, v30.8h, v1.8h\n" - "smlal v16.4s, v26.4h, v1.4h\n" - "smlal2 v21.4s, v26.8h, v1.8h\n" - "smlal v7.4s, v23.4h, v1.4h\n" - "smlal2 v17.4s, v23.8h, v1.8h\n" - "tbz x4, #2, 77f\n" - "ld1 { v31.s }[0], [x27], #0x4\n" - "tbz x4, #1, 76f\n" - "ld1 { v31.h }[2], [x27], #0x2\n" - "tbz x4, #0, 79f\n" - "ld1 { v31.b }[6], [x27]\n" + "smlal2 v16.4s, v30.8h, v1.8h\n" + "smlal v17.4s, v26.4h, v1.4h\n" + "smlal2 v8.4s, v26.8h, v1.8h\n" + "smlal v10.4s, v23.4h, v1.4h\n" + "smlal2 v7.4s, v23.8h, v1.8h\n" + "tbz x0, #2, 77f\n" + "ld1 { v31.s }[0], [x6], #0x4\n" + "tbz x0, #1, 76f\n" + "ld1 { v31.h }[2], [x6], #0x2\n" + "tbz x0, #0, 79f\n" + "ld1 { v31.b }[6], [x6]\n" "b 79f\n" "76:" // Oddments: Load (4, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 79f\n" - "ld1 { v31.b }[4], [x27]\n" + "tbz x0, #0, 79f\n" + "ld1 { v31.b }[4], [x6]\n" "b 79f\n" "77:" // Oddments: Load (4, 2): Bit 2: Unset - "tbz x4, #1, 78f\n" - "ld1 { v31.h }[0], [x27], #0x2\n" - "tbz x4, #0, 79f\n" - "ld1 { v31.b }[2], [x27]\n" + "tbz x0, #1, 78f\n" + "ld1 { v31.h }[0], [x6], #0x2\n" + "tbz x0, #0, 79f\n" + "ld1 { v31.b }[2], [x6]\n" "b 79f\n" "78:" // Oddments: Load (4, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 79f\n" - "ld1 { v31.b }[0], [x27]\n" + "tbz x0, #0, 79f\n" + "ld1 { v31.b }[0], [x6]\n" "79:" // Oddments: Load (4, 2): Bit 2: End + "ldr d2, [x23, #0x88]\n" "usubl v31.8h, v31.8b, v9.8b\n" - "ldr d2, [x3, #0x88]\n" - "smlal v8.4s, v31.4h, v1.4h\n" - "ldr x28, [x25, #0xd8]\n" - "add x28, x28, x10\n" - "smlal2 v5.4s, v31.8h, v1.8h\n" "ssubl v2.8h, v2.8b, v14.8b\n" + "ldr x27, [x20, #0xd8]\n" + "smlal v6.4s, v31.4h, v1.4h\n" + "smlal2 v5.4s, v31.8h, v1.8h\n" + "add x27, x27, x24\n" "smlal v15.4s, v26.4h, v2.4h\n" - "smlal2 
v18.4s, v26.8h, v2.8h\n" - "smlal v16.4s, v25.4h, v2.4h\n" - "smlal2 v21.4s, v25.8h, v2.8h\n" - "smlal v7.4s, v31.4h, v2.4h\n" - "smlal2 v17.4s, v31.8h, v2.8h\n" - "tbz x4, #2, 81f\n" - "ld1 { v30.s }[0], [x28], #0x4\n" - "tbz x4, #1, 80f\n" - "ld1 { v30.h }[2], [x28], #0x2\n" - "tbz x4, #0, 83f\n" - "ld1 { v30.b }[6], [x28]\n" + "smlal2 v16.4s, v26.8h, v2.8h\n" + "smlal v17.4s, v25.4h, v2.4h\n" + "smlal2 v8.4s, v25.8h, v2.8h\n" + "smlal v10.4s, v31.4h, v2.4h\n" + "smlal2 v7.4s, v31.8h, v2.8h\n" + "tbz x0, #2, 81f\n" + "ld1 { v30.s }[0], [x27], #0x4\n" + "tbz x0, #1, 80f\n" + "ld1 { v30.h }[2], [x27], #0x2\n" + "tbz x0, #0, 83f\n" + "ld1 { v30.b }[6], [x27]\n" "b 83f\n" "80:" // Oddments: Load (4, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 83f\n" - "ld1 { v30.b }[4], [x28]\n" + "tbz x0, #0, 83f\n" + "ld1 { v30.b }[4], [x27]\n" "b 83f\n" "81:" // Oddments: Load (4, 3): Bit 2: Unset - "tbz x4, #1, 82f\n" - "ld1 { v30.h }[0], [x28], #0x2\n" - "tbz x4, #0, 83f\n" - "ld1 { v30.b }[2], [x28]\n" + "tbz x0, #1, 82f\n" + "ld1 { v30.h }[0], [x27], #0x2\n" + "tbz x0, #0, 83f\n" + "ld1 { v30.b }[2], [x27]\n" "b 83f\n" "82:" // Oddments: Load (4, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 83f\n" - "ld1 { v30.b }[0], [x28]\n" + "tbz x0, #0, 83f\n" + "ld1 { v30.b }[0], [x27]\n" "83:" // Oddments: Load (4, 3): Bit 2: End + "ldr d3, [x23, #0x90]\n" "usubl v30.8h, v30.8b, v9.8b\n" - "ldr d3, [x3, #0x90]\n" - "smlal v8.4s, v30.4h, v2.4h\n" - "ldr x12, [x25, #0xe0]\n" - "add x12, x12, x10\n" - "smlal2 v5.4s, v30.8h, v2.8h\n" "ssubl v3.8h, v3.8b, v14.8b\n" + "ldr x11, [x20, #0xe0]\n" + "smlal v6.4s, v30.4h, v2.4h\n" + "smlal2 v5.4s, v30.8h, v2.8h\n" + "add x11, x11, x24\n" "smlal v15.4s, v25.4h, v3.4h\n" - "smlal2 v18.4s, v25.8h, v3.8h\n" - "smlal v16.4s, v24.4h, v3.4h\n" - "smlal2 v21.4s, v24.8h, v3.8h\n" - "smlal v7.4s, v30.4h, v3.4h\n" - "smlal2 v17.4s, v30.8h, v3.8h\n" - "tbz x4, #2, 85f\n" - "ld1 { v28.s }[0], [x12], #0x4\n" - "tbz x4, #1, 84f\n" - "ld1 { v28.h }[2], [x12], #0x2\n" - "tbz x4, #0, 87f\n" - "ld1 { v28.b }[6], [x12]\n" + "smlal2 v16.4s, v25.8h, v3.8h\n" + "smlal v17.4s, v24.4h, v3.4h\n" + "smlal2 v8.4s, v24.8h, v3.8h\n" + "smlal v10.4s, v30.4h, v3.4h\n" + "smlal2 v7.4s, v30.8h, v3.8h\n" + "tbz x0, #2, 85f\n" + "ld1 { v28.s }[0], [x11], #0x4\n" + "tbz x0, #1, 84f\n" + "ld1 { v28.h }[2], [x11], #0x2\n" + "tbz x0, #0, 87f\n" + "ld1 { v28.b }[6], [x11]\n" "b 87f\n" "84:" // Oddments: Load (4, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 87f\n" - "ld1 { v28.b }[4], [x12]\n" + "tbz x0, #0, 87f\n" + "ld1 { v28.b }[4], [x11]\n" "b 87f\n" "85:" // Oddments: Load (4, 4): Bit 2: Unset - "tbz x4, #1, 86f\n" - "ld1 { v28.h }[0], [x12], #0x2\n" - "tbz x4, #0, 87f\n" - "ld1 { v28.b }[2], [x12]\n" + "tbz x0, #1, 86f\n" + "ld1 { v28.h }[0], [x11], #0x2\n" + "tbz x0, #0, 87f\n" + "ld1 { v28.b }[2], [x11]\n" "b 87f\n" "86:" // Oddments: Load (4, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 87f\n" - "ld1 { v28.b }[0], [x12]\n" + "tbz x0, #0, 87f\n" + "ld1 { v28.b }[0], [x11]\n" "87:" // Oddments: Load (4, 4): Bit 2: End + "ldr d4, [x23, #0x98]\n" "usubl v28.8h, v28.8b, v9.8b\n" - "ldr d4, [x3, #0x98]\n" - "smlal v8.4s, v28.4h, v3.4h\n" - "ldr x7, [x25, #0xe8]\n" - "add x7, x7, x10\n" - "smlal2 v5.4s, v28.8h, v3.8h\n" "ssubl v4.8h, v4.8b, v14.8b\n" + "ldr x17, [x20, #0xe8]\n" + "smlal v6.4s, v28.4h, v3.4h\n" + "smlal2 v5.4s, v28.8h, v3.8h\n" + "add x17, x17, x24\n" "smlal v15.4s, v24.4h, v4.4h\n" - "smlal2 v18.4s, v24.8h, v4.8h\n" - "smlal v16.4s, v22.4h, v4.4h\n" - "smlal2 v21.4s, v22.8h, v4.8h\n" - "smlal v7.4s, 
v28.4h, v4.4h\n" - "smlal2 v17.4s, v28.8h, v4.8h\n" - "tbz x4, #2, 89f\n" - "ld1 { v26.s }[0], [x7], #0x4\n" - "tbz x4, #1, 88f\n" - "ld1 { v26.h }[2], [x7], #0x2\n" - "tbz x4, #0, 91f\n" - "ld1 { v26.b }[6], [x7]\n" + "smlal2 v16.4s, v24.8h, v4.8h\n" + "smlal v17.4s, v22.4h, v4.4h\n" + "smlal2 v8.4s, v22.8h, v4.8h\n" + "smlal v10.4s, v28.4h, v4.4h\n" + "smlal2 v7.4s, v28.8h, v4.8h\n" + "tbz x0, #2, 89f\n" + "ld1 { v26.s }[0], [x17], #0x4\n" + "tbz x0, #1, 88f\n" + "ld1 { v26.h }[2], [x17], #0x2\n" + "tbz x0, #0, 91f\n" + "ld1 { v26.b }[6], [x17]\n" "b 91f\n" "88:" // Oddments: Load (4, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 91f\n" - "ld1 { v26.b }[4], [x7]\n" + "tbz x0, #0, 91f\n" + "ld1 { v26.b }[4], [x17]\n" "b 91f\n" "89:" // Oddments: Load (4, 5): Bit 2: Unset - "tbz x4, #1, 90f\n" - "ld1 { v26.h }[0], [x7], #0x2\n" - "tbz x4, #0, 91f\n" - "ld1 { v26.b }[2], [x7]\n" + "tbz x0, #1, 90f\n" + "ld1 { v26.h }[0], [x17], #0x2\n" + "tbz x0, #0, 91f\n" + "ld1 { v26.b }[2], [x17]\n" "b 91f\n" "90:" // Oddments: Load (4, 5): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 91f\n" - "ld1 { v26.b }[0], [x7]\n" + "tbz x0, #0, 91f\n" + "ld1 { v26.b }[0], [x17]\n" "91:" // Oddments: Load (4, 5): Bit 2: End + "ldr d0, [x23, #0xa0]\n" "usubl v26.8h, v26.8b, v9.8b\n" - "ldr d0, [x3, #0xa0]\n" - "smlal v8.4s, v26.4h, v4.4h\n" - "ldr x26, [x25, #0xf0]\n" - "add x26, x26, x10\n" - "smlal2 v5.4s, v26.8h, v4.8h\n" "ssubl v0.8h, v0.8b, v14.8b\n" + "ldr x5, [x20, #0xf0]\n" + "smlal v6.4s, v26.4h, v4.4h\n" + "smlal2 v5.4s, v26.8h, v4.8h\n" + "add x5, x5, x24\n" "smlal v15.4s, v27.4h, v0.4h\n" - "smlal2 v18.4s, v27.8h, v0.8h\n" - "smlal v16.4s, v23.4h, v0.4h\n" - "smlal2 v21.4s, v23.8h, v0.8h\n" - "tbz x4, #2, 93f\n" - "ld1 { v25.s }[0], [x26], #0x4\n" - "tbz x4, #1, 92f\n" - "ld1 { v25.h }[2], [x26], #0x2\n" - "tbz x4, #0, 95f\n" - "ld1 { v25.b }[6], [x26]\n" + "smlal2 v16.4s, v27.8h, v0.8h\n" + "smlal v17.4s, v23.4h, v0.4h\n" + "smlal2 v8.4s, v23.8h, v0.8h\n" + "tbz x0, #2, 93f\n" + "ld1 { v25.s }[0], [x5], #0x4\n" + "tbz x0, #1, 92f\n" + "ld1 { v25.h }[2], [x5], #0x2\n" + "tbz x0, #0, 95f\n" + "ld1 { v25.b }[6], [x5]\n" "b 95f\n" "92:" // Oddments: Load (5, 0): Bit 2: Bit 1: Unset - "tbz x4, #0, 95f\n" - "ld1 { v25.b }[4], [x26]\n" + "tbz x0, #0, 95f\n" + "ld1 { v25.b }[4], [x5]\n" "b 95f\n" "93:" // Oddments: Load (5, 0): Bit 2: Unset - "tbz x4, #1, 94f\n" - "ld1 { v25.h }[0], [x26], #0x2\n" - "tbz x4, #0, 95f\n" - "ld1 { v25.b }[2], [x26]\n" + "tbz x0, #1, 94f\n" + "ld1 { v25.h }[0], [x5], #0x2\n" + "tbz x0, #0, 95f\n" + "ld1 { v25.b }[2], [x5]\n" "b 95f\n" "94:" // Oddments: Load (5, 0): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 95f\n" - "ld1 { v25.b }[0], [x26]\n" + "tbz x0, #0, 95f\n" + "ld1 { v25.b }[0], [x5]\n" "95:" // Oddments: Load (5, 0): Bit 2: End "usubl v25.8h, v25.8b, v9.8b\n" - "ldr x23, [x25, #0xf8]\n" - "smlal v7.4s, v25.4h, v0.4h\n" - "add x23, x23, x10\n" - "smlal2 v17.4s, v25.8h, v0.8h\n" - "tbz x4, #2, 97f\n" - "ld1 { v24.s }[0], [x23], #0x4\n" - "tbz x4, #1, 96f\n" - "ld1 { v24.h }[2], [x23], #0x2\n" - "tbz x4, #0, 99f\n" - "ld1 { v24.b }[6], [x23]\n" + "ldr x25, [x20, #0xf8]\n" + "smlal v10.4s, v25.4h, v0.4h\n" + "smlal2 v7.4s, v25.8h, v0.8h\n" + "add x25, x25, x24\n" + "tbz x0, #2, 97f\n" + "ld1 { v24.s }[0], [x25], #0x4\n" + "tbz x0, #1, 96f\n" + "ld1 { v24.h }[2], [x25], #0x2\n" + "tbz x0, #0, 99f\n" + "ld1 { v24.b }[6], [x25]\n" "b 99f\n" "96:" // Oddments: Load (5, 1): Bit 2: Bit 1: Unset - "tbz x4, #0, 99f\n" - "ld1 { v24.b }[4], [x23]\n" + "tbz x0, #0, 99f\n" + "ld1 { v24.b }[4], 
[x25]\n" "b 99f\n" "97:" // Oddments: Load (5, 1): Bit 2: Unset - "tbz x4, #1, 98f\n" - "ld1 { v24.h }[0], [x23], #0x2\n" - "tbz x4, #0, 99f\n" - "ld1 { v24.b }[2], [x23]\n" + "tbz x0, #1, 98f\n" + "ld1 { v24.h }[0], [x25], #0x2\n" + "tbz x0, #0, 99f\n" + "ld1 { v24.b }[2], [x25]\n" "b 99f\n" "98:" // Oddments: Load (5, 1): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 99f\n" - "ld1 { v24.b }[0], [x23]\n" + "tbz x0, #0, 99f\n" + "ld1 { v24.b }[0], [x25]\n" "99:" // Oddments: Load (5, 1): Bit 2: End + "ldr d1, [x23, #0xa8]\n" "usubl v24.8h, v24.8b, v9.8b\n" - "ldr d1, [x3, #0xa8]\n" - "smlal v8.4s, v24.4h, v0.4h\n" - "ldr x22, [x25, #0x100]\n" - "add x22, x22, x10\n" - "smlal2 v5.4s, v24.8h, v0.8h\n" "ssubl v1.8h, v1.8b, v14.8b\n" + "ldr x26, [x20, #0x100]\n" + "smlal v6.4s, v24.4h, v0.4h\n" + "smlal2 v5.4s, v24.8h, v0.8h\n" + "add x26, x26, x24\n" "smlal v15.4s, v23.4h, v1.4h\n" - "smlal2 v18.4s, v23.8h, v1.8h\n" - "smlal v16.4s, v31.4h, v1.4h\n" - "smlal2 v21.4s, v31.8h, v1.8h\n" - "smlal v7.4s, v24.4h, v1.4h\n" - "smlal2 v17.4s, v24.8h, v1.8h\n" - "tbz x4, #2, 101f\n" - "ld1 { v27.s }[0], [x22], #0x4\n" - "tbz x4, #1, 100f\n" - "ld1 { v27.h }[2], [x22], #0x2\n" - "tbz x4, #0, 103f\n" - "ld1 { v27.b }[6], [x22]\n" + "smlal2 v16.4s, v23.8h, v1.8h\n" + "smlal v17.4s, v31.4h, v1.4h\n" + "smlal2 v8.4s, v31.8h, v1.8h\n" + "smlal v10.4s, v24.4h, v1.4h\n" + "smlal2 v7.4s, v24.8h, v1.8h\n" + "tbz x0, #2, 101f\n" + "ld1 { v27.s }[0], [x26], #0x4\n" + "tbz x0, #1, 100f\n" + "ld1 { v27.h }[2], [x26], #0x2\n" + "tbz x0, #0, 103f\n" + "ld1 { v27.b }[6], [x26]\n" "b 103f\n" "100:" // Oddments: Load (5, 2): Bit 2: Bit 1: Unset - "tbz x4, #0, 103f\n" - "ld1 { v27.b }[4], [x22]\n" + "tbz x0, #0, 103f\n" + "ld1 { v27.b }[4], [x26]\n" "b 103f\n" "101:" // Oddments: Load (5, 2): Bit 2: Unset - "tbz x4, #1, 102f\n" - "ld1 { v27.h }[0], [x22], #0x2\n" - "tbz x4, #0, 103f\n" - "ld1 { v27.b }[2], [x22]\n" + "tbz x0, #1, 102f\n" + "ld1 { v27.h }[0], [x26], #0x2\n" + "tbz x0, #0, 103f\n" + "ld1 { v27.b }[2], [x26]\n" "b 103f\n" "102:" // Oddments: Load (5, 2): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 103f\n" - "ld1 { v27.b }[0], [x22]\n" + "tbz x0, #0, 103f\n" + "ld1 { v27.b }[0], [x26]\n" "103:" // Oddments: Load (5, 2): Bit 2: End + "ldr d2, [x23, #0xb0]\n" "usubl v27.8h, v27.8b, v9.8b\n" - "ldr d2, [x3, #0xb0]\n" - "smlal v8.4s, v27.4h, v1.4h\n" - "ldr x20, [x25, #0x108]\n" - "add x20, x20, x10\n" - "smlal2 v5.4s, v27.8h, v1.8h\n" "ssubl v2.8h, v2.8b, v14.8b\n" + "ldr x12, [x20, #0x108]\n" + "smlal v6.4s, v27.4h, v1.4h\n" + "smlal2 v5.4s, v27.8h, v1.8h\n" + "add x12, x12, x24\n" "smlal v15.4s, v31.4h, v2.4h\n" - "smlal2 v18.4s, v31.8h, v2.8h\n" - "smlal v16.4s, v30.4h, v2.4h\n" - "smlal2 v21.4s, v30.8h, v2.8h\n" - "smlal v7.4s, v27.4h, v2.4h\n" - "smlal2 v17.4s, v27.8h, v2.8h\n" - "tbz x4, #2, 105f\n" - "ld1 { v25.s }[0], [x20], #0x4\n" - "tbz x4, #1, 104f\n" - "ld1 { v25.h }[2], [x20], #0x2\n" - "tbz x4, #0, 107f\n" - "ld1 { v25.b }[6], [x20]\n" + "smlal2 v16.4s, v31.8h, v2.8h\n" + "smlal v17.4s, v30.4h, v2.4h\n" + "smlal2 v8.4s, v30.8h, v2.8h\n" + "smlal v10.4s, v27.4h, v2.4h\n" + "smlal2 v7.4s, v27.8h, v2.8h\n" + "tbz x0, #2, 105f\n" + "ld1 { v25.s }[0], [x12], #0x4\n" + "tbz x0, #1, 104f\n" + "ld1 { v25.h }[2], [x12], #0x2\n" + "tbz x0, #0, 107f\n" + "ld1 { v25.b }[6], [x12]\n" "b 107f\n" "104:" // Oddments: Load (5, 3): Bit 2: Bit 1: Unset - "tbz x4, #0, 107f\n" - "ld1 { v25.b }[4], [x20]\n" + "tbz x0, #0, 107f\n" + "ld1 { v25.b }[4], [x12]\n" "b 107f\n" "105:" // Oddments: Load (5, 3): Bit 2: Unset - "tbz 
x4, #1, 106f\n" - "ld1 { v25.h }[0], [x20], #0x2\n" - "tbz x4, #0, 107f\n" - "ld1 { v25.b }[2], [x20]\n" + "tbz x0, #1, 106f\n" + "ld1 { v25.h }[0], [x12], #0x2\n" + "tbz x0, #0, 107f\n" + "ld1 { v25.b }[2], [x12]\n" "b 107f\n" "106:" // Oddments: Load (5, 3): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 107f\n" - "ld1 { v25.b }[0], [x20]\n" + "tbz x0, #0, 107f\n" + "ld1 { v25.b }[0], [x12]\n" "107:" // Oddments: Load (5, 3): Bit 2: End + "ldr d3, [x23, #0xb8]\n" "usubl v25.8h, v25.8b, v9.8b\n" - "ldr d3, [x3, #0xb8]\n" - "smlal v8.4s, v25.4h, v2.4h\n" - "ldr x13, [x25, #0x110]\n" - "add x13, x13, x10\n" - "smlal2 v5.4s, v25.8h, v2.8h\n" "ssubl v3.8h, v3.8b, v14.8b\n" + "ldr x14, [x20, #0x110]\n" + "smlal v6.4s, v25.4h, v2.4h\n" + "smlal2 v5.4s, v25.8h, v2.8h\n" + "add x14, x14, x24\n" "smlal v15.4s, v30.4h, v3.4h\n" - "smlal2 v18.4s, v30.8h, v3.8h\n" - "smlal v16.4s, v28.4h, v3.4h\n" - "smlal2 v21.4s, v28.8h, v3.8h\n" - "smlal v7.4s, v25.4h, v3.4h\n" - "smlal2 v17.4s, v25.8h, v3.8h\n" - "tbz x4, #2, 109f\n" - "ld1 { v24.s }[0], [x13], #0x4\n" - "tbz x4, #1, 108f\n" - "ld1 { v24.h }[2], [x13], #0x2\n" - "tbz x4, #0, 111f\n" - "ld1 { v24.b }[6], [x13]\n" + "smlal2 v16.4s, v30.8h, v3.8h\n" + "smlal v17.4s, v28.4h, v3.4h\n" + "smlal2 v8.4s, v28.8h, v3.8h\n" + "smlal v10.4s, v25.4h, v3.4h\n" + "smlal2 v7.4s, v25.8h, v3.8h\n" + "tbz x0, #2, 109f\n" + "ld1 { v24.s }[0], [x14], #0x4\n" + "tbz x0, #1, 108f\n" + "ld1 { v24.h }[2], [x14], #0x2\n" + "tbz x0, #0, 111f\n" + "ld1 { v24.b }[6], [x14]\n" "b 111f\n" "108:" // Oddments: Load (5, 4): Bit 2: Bit 1: Unset - "tbz x4, #0, 111f\n" - "ld1 { v24.b }[4], [x13]\n" + "tbz x0, #0, 111f\n" + "ld1 { v24.b }[4], [x14]\n" "b 111f\n" "109:" // Oddments: Load (5, 4): Bit 2: Unset - "tbz x4, #1, 110f\n" - "ld1 { v24.h }[0], [x13], #0x2\n" - "tbz x4, #0, 111f\n" - "ld1 { v24.b }[2], [x13]\n" + "tbz x0, #1, 110f\n" + "ld1 { v24.h }[0], [x14], #0x2\n" + "tbz x0, #0, 111f\n" + "ld1 { v24.b }[2], [x14]\n" "b 111f\n" "110:" // Oddments: Load (5, 4): Bit 2: Unset: Bit 1: Unset - "tbz x4, #0, 111f\n" - "ld1 { v24.b }[0], [x13]\n" + "tbz x0, #0, 111f\n" + "ld1 { v24.b }[0], [x14]\n" "111:" // Oddments: Load (5, 4): Bit 2: End + "ldr d4, [x23, #0xc0]\n" "usubl v24.8h, v24.8b, v9.8b\n" - "ldr d4, [x3, #0xc0]\n" - "smlal v8.4s, v24.4h, v3.4h\n" - "ldr x21, [x25, #0x118]\n" - "add x21, x21, x10\n" - "smlal2 v5.4s, v24.8h, v3.8h\n" "ssubl v4.8h, v4.8b, v14.8b\n" + "ldr x21, [x20, #0x118]\n" + "smlal v6.4s, v24.4h, v3.4h\n" + "smlal2 v5.4s, v24.8h, v3.8h\n" + "add x21, x21, x24\n" "smlal v15.4s, v28.4h, v4.4h\n" - "smlal2 v18.4s, v28.8h, v4.8h\n" - "smlal v16.4s, v26.4h, v4.4h\n" - "smlal2 v21.4s, v26.8h, v4.8h\n" - "smlal v7.4s, v24.4h, v4.4h\n" - "smlal2 v17.4s, v24.8h, v4.8h\n" - "tbz x4, #2, 113f\n" + "smlal2 v16.4s, v28.8h, v4.8h\n" + "smlal v17.4s, v26.4h, v4.4h\n" + "smlal2 v8.4s, v26.8h, v4.8h\n" + "smlal v10.4s, v24.4h, v4.4h\n" + "smlal2 v7.4s, v24.8h, v4.8h\n" + "tbz x0, #2, 113f\n" "ld1 { v27.s }[0], [x21], #0x4\n" - "tbz x4, #1, 112f\n" + "tbz x0, #1, 112f\n" "ld1 { v27.h }[2], [x21], #0x2\n" - "tbz x4, #0, 115f\n" + "tbz x0, #0, 115f\n" "ld1 { v27.b }[6], [x21]\n" "b 115f\n" "112:" // Oddments: Load (5, 5): Bit 2: Bit 1: Unset - "tbz x4, #0, 115f\n" + "tbz x0, #0, 115f\n" "ld1 { v27.b }[4], [x21]\n" "b 115f\n" "113:" // Oddments: Load (5, 5): Bit 2: Unset - "tbz x4, #1, 114f\n" + "tbz x0, #1, 114f\n" "ld1 { v27.h }[0], [x21], #0x2\n" - "tbz x4, #0, 115f\n" + "tbz x0, #0, 115f\n" "ld1 { v27.b }[2], [x21]\n" "b 115f\n" "114:" // Oddments: Load (5, 5): Bit 2: Unset: 
Bit 1: Unset
- "tbz x4, #0, 115f\n"
+ "tbz x0, #0, 115f\n"
"ld1 { v27.b }[0], [x21]\n"
"115:" // Oddments: Load (5, 5): Bit 2: End
"usubl v27.8h, v27.8b, v9.8b\n"
- "smlal v8.4s, v27.4h, v4.4h\n"
+ "smlal v6.4s, v27.4h, v4.4h\n"
"smlal2 v5.4s, v27.8h, v4.8h\n"
- "tbz x4, #2, 117f\n"
- "ld1 { v6.4s }, [x2], #0x10\n"
- "ld1 { v19.4s }, [x5], #0x10\n"
- "tbz x4, #1, 116f\n"
- "ld1 { v20.d }[0], [x2], #0x8\n"
- "ld1 { v12.d }[0], [x5], #0x8\n"
- "tbz x4, #0, 119f\n"
- "ld1 { v20.s }[2], [x2]\n"
- "ld1 { v12.s }[2], [x5]\n"
+ "tbz x0, #2, 117f\n"
+ "ld1 { v12.4s }, [x10], #0x10\n"
+ "ld1 { v19.4s }, [x1], #0x10\n"
+ "tbz x0, #1, 116f\n"
+ "ld1 { v20.d }[0], [x10], #0x8\n"
+ "ld1 { v29.d }[0], [x1], #0x8\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v20.s }[2], [x10]\n"
+ "ld1 { v29.s }[2], [x1]\n"
"b 119f\n"
"116:" // Oddments: Load requant params: Bit 2: Bit 1: Unset
- "tbz x4, #0, 119f\n"
- "ld1 { v20.s }[0], [x2]\n"
- "ld1 { v12.s }[0], [x5]\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v20.s }[0], [x10]\n"
+ "ld1 { v29.s }[0], [x1]\n"
"b 119f\n"
"117:" // Oddments: Load requant params: Bit 2: Unset
- "tbz x4, #1, 118f\n"
- "ld1 { v6.d }[0], [x2], #0x8\n"
- "ld1 { v19.d }[0], [x5], #0x8\n"
- "tbz x4, #0, 119f\n"
- "ld1 { v6.s }[2], [x2]\n"
- "ld1 { v19.s }[2], [x5]\n"
+ "tbz x0, #1, 118f\n"
+ "ld1 { v12.d }[0], [x10], #0x8\n"
+ "ld1 { v19.d }[0], [x1], #0x8\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v12.s }[2], [x10]\n"
+ "ld1 { v19.s }[2], [x1]\n"
"b 119f\n"
"118:" // Oddments: Load requant params: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 119f\n"
- "ld1 { v6.s }[0], [x2]\n"
- "ld1 { v19.s }[0], [x5]\n"
+ "tbz x0, #0, 119f\n"
+ "ld1 { v12.s }[0], [x10]\n"
+ "ld1 { v19.s }[0], [x1]\n"
"119:" // Oddments: Load requant params: Bit 2: End
- "sqrdmulh v15.4s, v15.4s, v6.4s\n"
- "add x17, x17, x1\n"
- "sqrdmulh v18.4s, v18.4s, v20.4s\n"
- "add x16, x16, x1\n"
- "sqrdmulh v16.4s, v16.4s, v6.4s\n"
- "add x6, x6, x1\n"
- "sqrdmulh v21.4s, v21.4s, v20.4s\n"
- "add x8, x8, x1\n"
- "sqrdmulh v7.4s, v7.4s, v6.4s\n"
- "and v28.16b, v15.16b, v19.16b\n"
- "and v26.16b, v18.16b, v12.16b\n"
- "and v29.16b, v16.16b, v19.16b\n"
- "sshr v28.4s, v28.4s, #0x1f\n"
+ "sqrdmulh v15.4s, v15.4s, v12.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v12.4s\n"
+ "add x16, x16, x22\n"
+ "add x8, x8, x22\n"
+ "sqrdmulh v10.4s, v10.4s, v12.4s\n"
+ "sqrdmulh v6.4s, v6.4s, v12.4s\n"
+ "add x4, x4, x22\n"
+ "add x7, x7, x22\n"
+ "and v23.16b, v15.16b, v19.16b\n"
+ "sqrdmulh v16.4s, v16.4s, v20.4s\n"
+ "and v22.16b, v17.16b, v19.16b\n"
+ "sqrdmulh v8.4s, v8.4s, v20.4s\n"
+ "and v21.16b, v10.16b, v19.16b\n"
+ "sqrdmulh v7.4s, v7.4s, v20.4s\n"
+ "and v26.16b, v6.16b, v19.16b\n"
+ "sqrdmulh v5.4s, v5.4s, v20.4s\n"
+ "sshr v23.4s, v23.4s, #0x1f\n"
+ "and v4.16b, v16.16b, v29.16b\n"
+ "sshr v22.4s, v22.4s, #0x1f\n"
+ "and v2.16b, v8.16b, v29.16b\n"
+ "sshr v21.4s, v21.4s, #0x1f\n"
+ "and v3.16b, v7.16b, v29.16b\n"
"sshr v26.4s, v26.4s, #0x1f\n"
- "sshr v29.4s, v29.4s, #0x1f\n"
- "sqadd v15.4s, v15.4s, v28.4s\n"
- "sqadd v18.4s, v18.4s, v26.4s\n"
- "sqadd v16.4s, v16.4s, v29.4s\n"
- "and v4.16b, v21.16b, v12.16b\n"
- "srshl v15.4s, v15.4s, v19.4s\n"
- "srshl v18.4s, v18.4s, v12.4s\n"
- "srshl v16.4s, v16.4s, v19.4s\n"
+ "and v25.16b, v5.16b, v29.16b\n"
+ "sqadd v15.4s, v15.4s, v23.4s\n"
"sshr v4.4s, v4.4s, #0x1f\n"
- "add v15.4s, v15.4s, v10.4s\n"
- "add v18.4s, v18.4s, v10.4s\n"
- "add v16.4s, v16.4s, v10.4s\n"
- "smin v15.4s, v15.4s, v13.4s\n"
- "smin v18.4s, v18.4s, v13.4s\n"
- "smin v16.4s, v16.4s, v13.4s\n"
- "smax v15.4s, v15.4s, v11.4s\n"
- "smax v18.4s, v18.4s, v11.4s\n"
- "smax v16.4s, v16.4s, v11.4s\n"
- "sqadd v21.4s, v21.4s, v4.4s\n"
- "uzp1 v15.16b, v15.16b, v18.16b\n"
- "and v25.16b, v7.16b, v19.16b\n"
- "uzp1 v15.16b, v15.16b, v15.16b\n"
- "srshl v21.4s, v21.4s, v12.4s\n"
+ "sqadd v17.4s, v17.4s, v22.4s\n"
+ "sshr v2.4s, v2.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v21.4s\n"
+ "sshr v3.4s, v3.4s, #0x1f\n"
+ "sqadd v6.4s, v6.4s, v26.4s\n"
"sshr v25.4s, v25.4s, #0x1f\n"
- "sqrdmulh v17.4s, v17.4s, v20.4s\n"
- "sqrdmulh v8.4s, v8.4s, v6.4s\n"
- "add v21.4s, v21.4s, v10.4s\n"
- "sqadd v7.4s, v7.4s, v25.4s\n"
- "and v31.16b, v17.16b, v12.16b\n"
- "smin v21.4s, v21.4s, v13.4s\n"
- "and v24.16b, v8.16b, v19.16b\n"
- "srshl v7.4s, v7.4s, v19.4s\n"
- "smax v21.4s, v21.4s, v11.4s\n"
- "sshr v31.4s, v31.4s, #0x1f\n"
- "sshr v24.4s, v24.4s, #0x1f\n"
- "uzp1 v16.16b, v16.16b, v21.16b\n"
- "add v7.4s, v7.4s, v10.4s\n"
- "uzp1 v16.16b, v16.16b, v16.16b\n"
- "sqadd v17.4s, v17.4s, v31.4s\n"
- "smin v7.4s, v7.4s, v13.4s\n"
- "sqadd v8.4s, v8.4s, v24.4s\n"
- "sqrdmulh v5.4s, v5.4s, v20.4s\n"
- "smax v7.4s, v7.4s, v11.4s\n"
- "srshl v17.4s, v17.4s, v12.4s\n"
- "srshl v8.4s, v8.4s, v19.4s\n"
- "and v1.16b, v5.16b, v12.16b\n"
- "add v17.4s, v17.4s, v10.4s\n"
- "add v8.4s, v8.4s, v10.4s\n"
- "sshr v1.4s, v1.4s, #0x1f\n"
- "smin v17.4s, v17.4s, v13.4s\n"
- "smin v8.4s, v8.4s, v13.4s\n"
- "sqadd v5.4s, v5.4s, v1.4s\n"
- "smax v17.4s, v17.4s, v11.4s\n"
- "smax v8.4s, v8.4s, v11.4s\n"
- "srshl v5.4s, v5.4s, v12.4s\n"
- "uzp1 v7.16b, v7.16b, v17.16b\n"
- "uzp1 v7.16b, v7.16b, v7.16b\n"
- "add v5.4s, v5.4s, v10.4s\n"
- "smin v5.4s, v5.4s, v13.4s\n"
- "smax v5.4s, v5.4s, v11.4s\n"
- "uzp1 v8.16b, v8.16b, v5.16b\n"
- "uzp1 v8.16b, v8.16b, v8.16b\n"
- "tbz x4, #2, 121f\n"
- "st1 { v15.s }[0], [x17], #0x4\n"
- "st1 { v16.s }[0], [x16], #0x4\n"
- "st1 { v7.s }[0], [x6], #0x4\n"
- "st1 { v8.s }[0], [x8], #0x4\n"
- "tbz x4, #1, 120f\n"
- "st1 { v15.h }[2], [x17], #0x2\n"
- "st1 { v16.h }[2], [x16], #0x2\n"
- "st1 { v7.h }[2], [x6], #0x2\n"
- "st1 { v8.h }[2], [x8], #0x2\n"
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[6], [x17], #0x1\n"
- "st1 { v16.b }[6], [x16], #0x1\n"
- "st1 { v7.b }[6], [x6], #0x1\n"
- "st1 { v8.b }[6], [x8], #0x1\n"
+ "srshl v15.4s, v15.4s, v19.4s\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "srshl v17.4s, v17.4s, v19.4s\n"
+ "sqadd v8.4s, v8.4s, v2.4s\n"
+ "srshl v10.4s, v10.4s, v19.4s\n"
+ "sqadd v7.4s, v7.4s, v3.4s\n"
+ "srshl v6.4s, v6.4s, v19.4s\n"
+ "sqadd v5.4s, v5.4s, v25.4s\n"
+ "srshl v16.4s, v16.4s, v29.4s\n"
+ "sqxtn v15.4h, v15.4s\n"
+ "srshl v8.4s, v8.4s, v29.4s\n"
+ "sqxtn v17.4h, v17.4s\n"
+ "srshl v7.4s, v7.4s, v29.4s\n"
+ "sqxtn v10.4h, v10.4s\n"
+ "srshl v5.4s, v5.4s, v29.4s\n"
+ "sqxtn v6.4h, v6.4s\n"
+ "sqxtn2 v15.8h, v16.4s\n"
+ "sqxtn2 v17.8h, v8.4s\n"
+ "sqxtn2 v10.8h, v7.4s\n"
+ "sqxtn2 v6.8h, v5.4s\n"
+ "sqadd v15.8h, v15.8h, v18.8h\n"
+ "sqadd v17.8h, v17.8h, v18.8h\n"
+ "sqadd v10.8h, v10.8h, v18.8h\n"
+ "sqadd v6.8h, v6.8h, v18.8h\n"
+ "smax v15.8h, v15.8h, v11.8h\n"
+ "smax v17.8h, v17.8h, v11.8h\n"
+ "smax v10.8h, v10.8h, v11.8h\n"
+ "smax v6.8h, v6.8h, v11.8h\n"
+ "smin v15.8h, v15.8h, v13.8h\n"
+ "smin v17.8h, v17.8h, v13.8h\n"
+ "smin v10.8h, v10.8h, v13.8h\n"
+ "smin v6.8h, v6.8h, v13.8h\n"
+ "uzp1 v15.16b, v15.16b, v15.16b\n"
+ "uzp1 v17.16b, v17.16b, v17.16b\n"
+ "uzp1 v10.16b, v10.16b, v10.16b\n"
+ "uzp1 v6.16b, v6.16b, v6.16b\n"
+ "tbz x0, #2, 121f\n"
+ "st1 { v15.s }[0], [x16], #0x4\n"
+ "st1 { v17.s }[0], [x8], #0x4\n"
+ "st1 { v10.s }[0], [x4], #0x4\n"
+ "st1 { v6.s }[0], [x7], #0x4\n"
+ "tbz x0, #1, 120f\n"
+ "st1 { v15.h }[2], [x16], #0x2\n"
+ "st1 { v17.h }[2], [x8], #0x2\n"
+ "st1 { v10.h }[2], [x4], #0x2\n"
+ "st1 { v6.h }[2], [x7], #0x2\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[6], [x16], #0x1\n"
+ "st1 { v17.b }[6], [x8], #0x1\n"
+ "st1 { v10.b }[6], [x4], #0x1\n"
+ "st1 { v6.b }[6], [x7], #0x1\n"
"b 123f\n"
"120:" // Oddments: Bit 2: Bit 1: Unset
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[4], [x17], #0x1\n"
- "st1 { v16.b }[4], [x16], #0x1\n"
- "st1 { v7.b }[4], [x6], #0x1\n"
- "st1 { v8.b }[4], [x8], #0x1\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[4], [x16], #0x1\n"
+ "st1 { v17.b }[4], [x8], #0x1\n"
+ "st1 { v10.b }[4], [x4], #0x1\n"
+ "st1 { v6.b }[4], [x7], #0x1\n"
"b 123f\n"
"121:" // Oddments: Bit 2: Unset
- "tbz x4, #1, 122f\n"
- "st1 { v15.h }[0], [x17], #0x2\n"
- "st1 { v16.h }[0], [x16], #0x2\n"
- "st1 { v7.h }[0], [x6], #0x2\n"
- "st1 { v8.h }[0], [x8], #0x2\n"
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[2], [x17], #0x1\n"
- "st1 { v16.b }[2], [x16], #0x1\n"
- "st1 { v7.b }[2], [x6], #0x1\n"
- "st1 { v8.b }[2], [x8], #0x1\n"
+ "tbz x0, #1, 122f\n"
+ "st1 { v15.h }[0], [x16], #0x2\n"
+ "st1 { v17.h }[0], [x8], #0x2\n"
+ "st1 { v10.h }[0], [x4], #0x2\n"
+ "st1 { v6.h }[0], [x7], #0x2\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[2], [x16], #0x1\n"
+ "st1 { v17.b }[2], [x8], #0x1\n"
+ "st1 { v10.b }[2], [x4], #0x1\n"
+ "st1 { v6.b }[2], [x7], #0x1\n"
"b 123f\n"
"122:" // Oddments: Bit 2: Unset: Bit 1: Unset
- "tbz x4, #0, 123f\n"
- "st1 { v15.b }[0], [x17], #0x1\n"
- "st1 { v16.b }[0], [x16], #0x1\n"
- "st1 { v7.b }[0], [x6], #0x1\n"
- "st1 { v8.b }[0], [x8], #0x1\n"
+ "tbz x0, #0, 123f\n"
+ "st1 { v15.b }[0], [x16], #0x1\n"
+ "st1 { v17.b }[0], [x8], #0x1\n"
+ "st1 { v10.b }[0], [x4], #0x1\n"
+ "st1 { v6.b }[0], [x7], #0x1\n"
"123:" // Oddments: Bit 2: End
- "124:" // End
- : : [offsetof_Params_bias] "I" (offsetof(Params, bias)), [offsetof_Params_inptrs] "I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels] "I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs] "I" (offsetof(Params, outptrs)), [offsetof_Params_requant] "I" (offsetof(Params, requant)), [offsetof_Params_requant_muls] "I" (offsetof(Params, requant_muls)), [offsetof_Params_requant_shifts] "I" (offsetof(Params, requant_shifts)), [offsetof_Params_weights] "I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset] "I" (offsetof(arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [params] "r" (&params) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
index 2bfeac0556..6bdcca115c 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,28 +28,23 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
-struct a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst
+class a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef int8_t weight_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, const arm_gemm::Requantize32&, const unsigned int, const unsigned int);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- constexpr static unsigned int n_output_points = 9;
+ KernelType kernel = a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl;
- kern_type kernel = a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst_impl;
+ public:
+ a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy(9, arm_gemm::VLType::None) {}
- a64_u8s8u8q_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+ KernelType get_kernel() const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 802030573e..394df363da 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,31 +28,25 @@
#pragma once
+#if defined(__aarch64__)
+
namespace arm_conv {
namespace depthwise {
void a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
-struct a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+struct a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy
{
- typedef int32_t bias_type;
- typedef uint8_t input_type;
- typedef int8_t weight_type;
- typedef uint8_t return_type;
-
- typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const int8_t *, const int32_t *, const unsigned int, const unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32&);
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::None;
-
- constexpr static unsigned int output_rows(void) { return 2; };
- constexpr static unsigned int output_cols(void) { return 8; };
-
- constexpr static unsigned int output_col_regs(void) { return 2; };
-
- kern_type kernel = a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
-
- a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+ using Parent = GenericDepthfirstMultiplierKernelStrategy;
+ a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+ : Parent(2, 8, arm_gemm::VLType::None)
+ {
+ }
+ Parent::KernelType kernel = a64_u8s8u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+ Parent::KernelType get_kernel(void) const override { return kernel; }
};
} // namespace depthwise
} // namespace arm_conv
+
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index 1cfea9dcbd..1c1fb25e1f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 4;
- constexpr static unsigned int input_cols = 4;
-
- sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+ sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index af8af1840a..d49b14eeaf 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 3;
constexpr static unsigned int output_cols = 3;
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+ sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(3, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index 60234c8a86..ac6ae284fd 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 4;
constexpr static unsigned int output_cols = 4;
- constexpr static unsigned int input_rows = 6;
- constexpr static unsigned int input_cols = 6;
-
- sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+ sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(4, 3, 1) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 5968309927..82173ee71f 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
#pragma once
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
namespace arm_conv {
namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
void sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
-class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>
{
private:
- typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16);
- indirect_kern_type m_indirect_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
-
- typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16);
- direct_kern_type m_direct_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+ using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>;
+ Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+ Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
public:
- typedef __fp16 return_type;
-
- constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+ using return_type = __fp16;
+ constexpr static auto vl_type = arm_gemm::VLType::SVE;
constexpr static unsigned int kernel_rows = 3;
constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirs
constexpr static unsigned int output_rows = 2;
constexpr static unsigned int output_cols = 2;
- constexpr static unsigned int input_rows = 5;
- constexpr static unsigned int input_cols = 5;
-
- sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+ sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+ : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 3, 2) {}
arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
- unsigned int get_kernel_rows(void) const override { return kernel_rows; }
- unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
- unsigned int get_stride_rows(void) const override { return stride_rows; }
- unsigned int get_stride_cols(void) const override { return stride_cols; }
-
- unsigned int get_output_rows(void) const override { return output_rows; }
- unsigned int get_output_cols(void) const override { return output_cols; }
-
- unsigned int get_input_rows(void) const override { return input_rows; }
- unsigned int get_input_cols(void) const override { return input_cols; }
-
- void indirect_kernel(
- const void *const *const input_ptrs,
- void *const *const outptrs,
- const void *params,
- unsigned int n_channels,
- const void *activation_min,
- const void *activation_max
- ) const override
- {
- m_indirect_kernel(
- reinterpret_cast<const __fp16 *const *>(input_ptrs),
- reinterpret_cast<__fp16 *const *>(outptrs),
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
-
- void direct_kernel(
- const unsigned int n_tile_rows, const unsigned int n_tile_cols,
- const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
- void *outptr, int64_t ld_output_row, int64_t ld_output_col,
- const void *params, unsigned int n_channels,
- const void *activation_min, const void *activation_max
- ) const override
- {
- m_direct_kernel(
- n_tile_rows, n_tile_cols,
- static_cast<const __fp16 *>(inptr), ld_input_row, ld_input_col,
- static_cast<__fp16 *>(outptr), ld_output_row, ld_output_col,
- params, n_channels,
- *static_cast<const __fp16 *>(activation_min),
- *static_cast<const __fp16 *>(activation_max)
- );
- }
+ Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+ Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
};
} // namespace depthwise
} // namespace arm_conv
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 4a9bd33a1e..f5d4189a47 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
* * SPDX-License-Identifier: MIT * @@ -28,7 +28,7 @@ #pragma once -#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(__ARM_FP16_ARGS) namespace arm_conv { namespace depthwise { @@ -36,19 +36,16 @@ namespace depthwise { void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); void sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); -class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy +class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16> { private: - typedef void (*indirect_kern_type)(const __fp16 *const *const, __fp16 *const *const, const void *, unsigned int, const __fp16, const __fp16); - indirect_kern_type m_indirect_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl; - - typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const __fp16 *, int64_t, int64_t, __fp16 *, int64_t, int64_t, const void *, unsigned int, const __fp16, const __fp16); - direct_kern_type m_direct_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl; + using Parent = DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>; + Parent::IndirectKernelType m_indirect_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl; + Parent::DirectKernelType m_direct_kernel = sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl; public: - typedef __fp16 return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; + using return_type = __fp16; + constexpr static auto vl_type = arm_gemm::VLType::SVE; constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; @@ -59,63 +56,16 @@ class sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirs constexpr static unsigned int output_rows = 2; constexpr static unsigned int output_cols = 2; - constexpr static unsigned int input_rows = 6; - constexpr static unsigned int input_cols = 6; - - sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) + : DepthwiseDepthfirstStrategy<__fp16, __fp16, __fp16, __fp16>(2, 5, 1) {} arm_gemm::VLType get_vl_type(void) const override { return vl_type; } - unsigned int get_kernel_rows(void) const override { return kernel_rows; } - unsigned int get_kernel_cols(void) const override { return kernel_cols; } - - unsigned int get_stride_rows(void) const override { return stride_rows; } - unsigned int get_stride_cols(void) const override { return stride_cols; } - - unsigned int get_output_rows(void) const override { return output_rows; } - unsigned int get_output_cols(void) const override { return output_cols; } - - unsigned int get_input_rows(void) const override { return input_rows; } - unsigned int get_input_cols(void) const override { return input_cols; } - - void indirect_kernel( - const void *const *const input_ptrs, - void *const *const outptrs, - const void *params, - unsigned int n_channels, - const void *activation_min, - const void *activation_max - ) const override - { - m_indirect_kernel( - reinterpret_cast(input_ptrs), - reinterpret_cast<__fp16 *const 
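Every floating-point strategy header in this patch shrinks the same way: the per-kernel input_rows/input_cols constants and the dozen get_*() overrides disappear, and each class instead hands its geometry (output size, kernel size, stride) to the shared DepthwiseDepthfirstStrategy constructor. A minimal sketch of the relationship this relies on — an assumption about the base-class shape, since the real definition lives in depthwise_depthfirst.hpp and is not shown in these hunks:

    // Hypothetical sketch, not the library source: the geometry accessors are
    // derived once in the base rather than repeated in every kernel header.
    template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
    class DepthwiseDepthfirstStrategy
    {
      unsigned int m_output_size, m_kernel_size, m_stride;

      public:
      DepthwiseDepthfirstStrategy(unsigned int output_size, unsigned int kernel_size, unsigned int stride)
      : m_output_size(output_size), m_kernel_size(kernel_size), m_stride(stride) {}

      // The input tile follows from the convolution geometry; for the
      // 3x3/stride-1/output-4x4 kernel above this gives (4 - 1)*1 + 3 = 6,
      // matching the deleted input_rows = 6 / input_cols = 6 constants.
      unsigned int get_input_rows(void) const { return (m_output_size - 1) * m_stride + m_kernel_size; }
      unsigned int get_input_cols(void) const { return (m_output_size - 1) * m_stride + m_kernel_size; }
    };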
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
index e07e6314f8..d7b1de2062 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -28,7 +28,7 @@
 
 #pragma once
 
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace arm_conv {
 namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
 void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
 void sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
 
-class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
 {
   private:
-  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-  indirect_kern_type m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
-
-  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-  direct_kern_type m_direct_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst_direct_impl;
 
   public:
-  typedef float return_type;
-
-  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;
 
   constexpr static unsigned int kernel_rows = 3;
   constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirs
   constexpr static unsigned int output_rows = 2;
   constexpr static unsigned int output_cols = 2;
 
-  constexpr static unsigned int input_rows = 4;
-  constexpr static unsigned int input_cols = 4;
-
-  sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+  sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *)
+  : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 1) {}
 
   arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
 
-  unsigned int get_kernel_rows(void) const override { return kernel_rows; }
-  unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
-  unsigned int get_stride_rows(void) const override { return stride_rows; }
-  unsigned int get_stride_cols(void) const override { return stride_cols; }
-
-  unsigned int get_output_rows(void) const override { return output_rows; }
-  unsigned int get_output_cols(void) const override { return output_cols; }
-
-  unsigned int get_input_rows(void) const override { return input_rows; }
-  unsigned int get_input_cols(void) const override { return input_cols; }
-
-  void indirect_kernel(
-    const void *const *const input_ptrs,
-    void *const *const outptrs,
-    const void *params,
-    unsigned int n_channels,
-    const void *activation_min,
-    const void *activation_max
-  ) const override
-  {
-    m_indirect_kernel(
-      reinterpret_cast<const float *const *>(input_ptrs),
-      reinterpret_cast<float *const *>(outptrs),
-      params, n_channels,
-      *static_cast<const float *>(activation_min),
-      *static_cast<const float *>(activation_max)
-    );
-  }
-
-  void direct_kernel(
-    const unsigned int n_tile_rows, const unsigned int n_tile_cols,
-    const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
-    void *outptr, int64_t ld_output_row, int64_t ld_output_col,
-    const void *params, unsigned int n_channels,
-    const void *activation_min, const void *activation_max
-  ) const override
-  {
-    m_direct_kernel(
-      n_tile_rows, n_tile_cols,
-      static_cast<const float *>(inptr), ld_input_row, ld_input_col,
-      static_cast<float *>(outptr), ld_output_row, ld_output_col,
-      params, n_channels,
-      *static_cast<const float *>(activation_min),
-      *static_cast<const float *>(activation_max)
-    );
-  }
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
 };
 
 } // namespace depthwise
 } // namespace arm_conv
 
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
index eb9de9fa01..41ad193364 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -28,7 +28,7 @@
 
 #pragma once
 
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace arm_conv {
 namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
 void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
 void sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
 
-class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
 {
   private:
-  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-  indirect_kern_type m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
-
-  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-  direct_kern_type m_direct_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst_direct_impl;
 
   public:
-  typedef float return_type;
-
-  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;
 
   constexpr static unsigned int kernel_rows = 3;
   constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirs
   constexpr static unsigned int output_rows = 3;
   constexpr static unsigned int output_cols = 3;
 
-  constexpr static unsigned int input_rows = 5;
-  constexpr static unsigned int input_cols = 5;
-
-  sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *) {}
+  sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(const CPUInfo *)
+  : DepthwiseDepthfirstStrategy<float, float, float, float>(3, 3, 1) {}
 
   arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
 
-  unsigned int get_kernel_rows(void) const override { return kernel_rows; }
-  unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
-  unsigned int get_stride_rows(void) const override { return stride_rows; }
-  unsigned int get_stride_cols(void) const override { return stride_cols; }
-
-  unsigned int get_output_rows(void) const override { return output_rows; }
-  unsigned int get_output_cols(void) const override { return output_cols; }
-
-  unsigned int get_input_rows(void) const override { return input_rows; }
-  unsigned int get_input_cols(void) const override { return input_cols; }
-
-  void indirect_kernel(
-    const void *const *const input_ptrs,
-    void *const *const outptrs,
-    const void *params,
-    unsigned int n_channels,
-    const void *activation_min,
-    const void *activation_max
-  ) const override
-  {
-    m_indirect_kernel(
-      reinterpret_cast<const float *const *>(input_ptrs),
-      reinterpret_cast<float *const *>(outptrs),
-      params, n_channels,
-      *static_cast<const float *>(activation_min),
-      *static_cast<const float *>(activation_max)
-    );
-  }
-
-  void direct_kernel(
-    const unsigned int n_tile_rows, const unsigned int n_tile_cols,
-    const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
-    void *outptr, int64_t ld_output_row, int64_t ld_output_col,
-    const void *params, unsigned int n_channels,
-    const void *activation_min, const void *activation_max
-  ) const override
-  {
-    m_direct_kernel(
-      n_tile_rows, n_tile_cols,
-      static_cast<const float *>(inptr), ld_input_row, ld_input_col,
-      static_cast<float *>(outptr), ld_output_row, ld_output_col,
-      params, n_channels,
-      *static_cast<const float *>(activation_min),
-      *static_cast<const float *>(activation_max)
-    );
-  }
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
 };
 
 } // namespace depthwise
 } // namespace arm_conv
 
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
index d7be9b122e..6073b2ba7d 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -28,7 +28,7 @@
 
 #pragma once
 
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace arm_conv {
 namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
 void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
 void sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
 
-class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
 {
   private:
-  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-  indirect_kern_type m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
-
-  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-  direct_kern_type m_direct_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst_direct_impl;
 
   public:
-  typedef float return_type;
-
-  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;
 
   constexpr static unsigned int kernel_rows = 3;
   constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirs
   constexpr static unsigned int output_rows = 4;
   constexpr static unsigned int output_cols = 4;
 
-  constexpr static unsigned int input_rows = 6;
-  constexpr static unsigned int input_cols = 6;
-
-  sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *) {}
+  sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(const CPUInfo *)
+  : DepthwiseDepthfirstStrategy<float, float, float, float>(4, 3, 1) {}
 
   arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
 
-  unsigned int get_kernel_rows(void) const override { return kernel_rows; }
-  unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
-  unsigned int get_stride_rows(void) const override { return stride_rows; }
-  unsigned int get_stride_cols(void) const override { return stride_cols; }
-
-  unsigned int get_output_rows(void) const override { return output_rows; }
-  unsigned int get_output_cols(void) const override { return output_cols; }
-
-  unsigned int get_input_rows(void) const override { return input_rows; }
-  unsigned int get_input_cols(void) const override { return input_cols; }
-
-  void indirect_kernel(
-    const void *const *const input_ptrs,
-    void *const *const outptrs,
-    const void *params,
-    unsigned int n_channels,
-    const void *activation_min,
-    const void *activation_max
-  ) const override
-  {
-    m_indirect_kernel(
-      reinterpret_cast<const float *const *>(input_ptrs),
-      reinterpret_cast<float *const *>(outptrs),
-      params, n_channels,
-      *static_cast<const float *>(activation_min),
-      *static_cast<const float *>(activation_max)
-    );
-  }
-
-  void direct_kernel(
-    const unsigned int n_tile_rows, const unsigned int n_tile_cols,
-    const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
-    void *outptr, int64_t ld_output_row, int64_t ld_output_col,
-    const void *params, unsigned int n_channels,
-    const void *activation_min, const void *activation_max
-  ) const override
-  {
-    m_direct_kernel(
-      n_tile_rows, n_tile_cols,
-      static_cast<const float *>(inptr), ld_input_row, ld_input_col,
-      static_cast<float *>(outptr), ld_output_row, ld_output_col,
-      params, n_channels,
-      *static_cast<const float *>(activation_min),
-      *static_cast<const float *>(activation_max)
-    );
-  }
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
 };
 
 } // namespace depthwise
 } // namespace arm_conv
 
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
index 28d44d0b5f..17ac74e223 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -28,7 +28,7 @@
 
 #pragma once
 
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace arm_conv {
 namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
 void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
 void sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
 
-class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
 {
   private:
-  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-  indirect_kern_type m_indirect_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
-
-  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-  direct_kern_type m_direct_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst_direct_impl;
 
   public:
-  typedef float return_type;
-
-  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;
 
   constexpr static unsigned int kernel_rows = 3;
   constexpr static unsigned int kernel_cols = 3;
@@ -59,63 +56,16 @@ class sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirs
   constexpr static unsigned int output_rows = 2;
   constexpr static unsigned int output_cols = 2;
 
-  constexpr static unsigned int input_rows = 5;
-  constexpr static unsigned int input_cols = 5;
-
-  sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {}
+  sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *)
+  : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 3, 2) {}
 
   arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
 
-  unsigned int get_kernel_rows(void) const override { return kernel_rows; }
-  unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
-  unsigned int get_stride_rows(void) const override { return stride_rows; }
-  unsigned int get_stride_cols(void) const override { return stride_cols; }
-
-  unsigned int get_output_rows(void) const override { return output_rows; }
-  unsigned int get_output_cols(void) const override { return output_cols; }
-
-  unsigned int get_input_rows(void) const override { return input_rows; }
-  unsigned int get_input_cols(void) const override { return input_cols; }
-
-  void indirect_kernel(
-    const void *const *const input_ptrs,
-    void *const *const outptrs,
-    const void *params,
-    unsigned int n_channels,
-    const void *activation_min,
-    const void *activation_max
-  ) const override
-  {
-    m_indirect_kernel(
-      reinterpret_cast<const float *const *>(input_ptrs),
-      reinterpret_cast<float *const *>(outptrs),
-      params, n_channels,
-      *static_cast<const float *>(activation_min),
-      *static_cast<const float *>(activation_max)
-    );
-  }
-
-  void direct_kernel(
-    const unsigned int n_tile_rows, const unsigned int n_tile_cols,
-    const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
-    void *outptr, int64_t ld_output_row, int64_t ld_output_col,
-    const void *params, unsigned int n_channels,
-    const void *activation_min, const void *activation_max
-  ) const override
-  {
-    m_direct_kernel(
-      n_tile_rows, n_tile_cols,
-      static_cast<const float *>(inptr), ld_input_row, ld_input_col,
-      static_cast<float *>(outptr), ld_output_row, ld_output_col,
-      params, n_channels,
-      *static_cast<const float *>(activation_min),
-      *static_cast<const float *>(activation_max)
-    );
-  }
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
 };
 
 } // namespace depthwise
 } // namespace arm_conv
 
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
index 751874ffbf..2449c96637 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -28,7 +28,7 @@
 
 #pragma once
 
-#if __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace arm_conv {
 namespace depthwise {
@@ -36,19 +36,16 @@ namespace depthwise {
 void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
 void sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
 
-class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public IDepthwiseDepthfirstStrategy
+class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy<float, float, float, float>
 {
   private:
-  typedef void (*indirect_kern_type)(const float *const *const, float *const *const, const void *, unsigned int, const float, const float);
-  indirect_kern_type m_indirect_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
-
-  typedef void (*direct_kern_type)(const unsigned int, const unsigned int, const float *, int64_t, int64_t, float *, int64_t, int64_t, const void *, unsigned int, const float, const float);
-  direct_kern_type m_direct_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
+  using Parent = DepthwiseDepthfirstStrategy<float, float, float, float>;
+  Parent::IndirectKernelType m_indirect_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_indirect_impl;
+  Parent::DirectKernelType m_direct_kernel = sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst_direct_impl;
 
   public:
-  typedef float return_type;
-
-  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
+  using return_type = float;
+  constexpr static auto vl_type = arm_gemm::VLType::SVE;
 
   constexpr static unsigned int kernel_rows = 5;
   constexpr static unsigned int kernel_cols = 5;
@@ -59,63 +56,16 @@ class sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirs
   constexpr static unsigned int output_rows = 2;
   constexpr static unsigned int output_cols = 2;
 
-  constexpr static unsigned int input_rows = 6;
-  constexpr static unsigned int input_cols = 6;
-
-  sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {}
+  sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *)
+  : DepthwiseDepthfirstStrategy<float, float, float, float>(2, 5, 1) {}
 
   arm_gemm::VLType get_vl_type(void) const override { return vl_type; }
 
-  unsigned int get_kernel_rows(void) const override { return kernel_rows; }
-  unsigned int get_kernel_cols(void) const override { return kernel_cols; }
-
-  unsigned int get_stride_rows(void) const override { return stride_rows; }
-  unsigned int get_stride_cols(void) const override { return stride_cols; }
-
-  unsigned int get_output_rows(void) const override { return output_rows; }
-  unsigned int get_output_cols(void) const override { return output_cols; }
-
-  unsigned int get_input_rows(void) const override { return input_rows; }
-  unsigned int get_input_cols(void) const override { return input_cols; }
-
-  void indirect_kernel(
-    const void *const *const input_ptrs,
-    void *const *const outptrs,
-    const void *params,
-    unsigned int n_channels,
-    const void *activation_min,
-    const void *activation_max
-  ) const override
-  {
-    m_indirect_kernel(
-      reinterpret_cast<const float *const *>(input_ptrs),
-      reinterpret_cast<float *const *>(outptrs),
-      params, n_channels,
-      *static_cast<const float *>(activation_min),
-      *static_cast<const float *>(activation_max)
-    );
-  }
-
-  void direct_kernel(
-    const unsigned int n_tile_rows, const unsigned int n_tile_cols,
-    const void *inptr, int64_t ld_input_row, int64_t ld_input_col,
-    void *outptr, int64_t ld_output_row, int64_t ld_output_col,
-    const void *params, unsigned int n_channels,
-    const void *activation_min, const void *activation_max
-  ) const override
-  {
-    m_direct_kernel(
-      n_tile_rows, n_tile_cols,
-      static_cast<const float *>(inptr), ld_input_row, ld_input_col,
-      static_cast<float *>(outptr), ld_output_row, ld_output_col,
-      params, n_channels,
-      *static_cast<const float *>(activation_min),
-      *static_cast<const float *>(activation_max)
-    );
-  }
+  Parent::IndirectKernelType get_indirect_kernel() const override { return m_indirect_kernel; }
+  Parent::DirectKernelType get_direct_kernel() const override { return m_direct_kernel; }
 };
 
 } // namespace depthwise
 } // namespace arm_conv
 
-#endif // __aarch64__ && defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
index bd071d370c..62faca97a9 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -35,22 +35,14 @@ namespace depthwise {
 
 void sve_fp32_nhwc_generic_output9_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
 
-struct sve_fp32_nhwc_generic_output9_mla_depthfirst
+class sve_fp32_nhwc_generic_output9_mla_depthfirst : public GenericDepthfirstKernelStrategy<float>
 {
-  typedef float bias_type;
-  typedef float input_type;
-  typedef float weight_type;
-  typedef float return_type;
+  KernelType kernel = sve_fp32_nhwc_generic_output9_mla_depthfirst_impl;
 
-  typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const void *, const unsigned int, const unsigned int, const float, const float);
+  public:
+  sve_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) : GenericDepthfirstKernelStrategy<float>(9, arm_gemm::VLType::SVE) {}
 
-  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
-  constexpr static unsigned int n_output_points = 9;
-
-  kern_type kernel = sve_fp32_nhwc_generic_output9_mla_depthfirst_impl;
-
-  sve_fp32_nhwc_generic_output9_mla_depthfirst(const CPUInfo *) {}
+  KernelType get_kernel() const override { return kernel; }
 };
 
 } // namespace depthwise
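The packed-with-multiplier strategies that follow carry the same information in their Parent(...) constructor calls, so the deleted input-tile constants remain recoverable from the kernel geometry. A worked check of that arithmetic, under the same assumed formula as in the sketch above (illustrative only):

    // input = (output - 1) * stride + kernel, checked against the constants
    // deleted from the two multiplier headers below.
    static_assert((3 - 1) * 2 + 3 == 7, "3x3 kernel, stride 2, 3x3 output -> 7x7 input tile");
    static_assert((2 - 1) * 1 + 5 == 6, "5x5 kernel, stride 1, 2 output rows -> 6 input rows");
    static_assert((4 - 1) * 1 + 5 == 8, "5x5 kernel, stride 1, 4 output cols -> 8 input cols");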
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
index 563f0fc59f..8640343747 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -35,33 +35,24 @@ namespace depthwise {
 
 void sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
 
-struct sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst
+struct sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
 {
-  typedef float bias_type;
-  typedef float input_type;
-  typedef float weight_type;
-  typedef float return_type;
-
-  typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
-
-  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
+  using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
   constexpr static unsigned int kernel_rows = 3;
   constexpr static unsigned int kernel_cols = 3;
 
   constexpr static unsigned int stride_rows = 2;
   constexpr static unsigned int stride_cols = 2;
 
-  constexpr static unsigned int output_rows = 3;
-  constexpr static unsigned int output_cols = 3;
-
-  constexpr static unsigned int input_rows = 7;
-  constexpr static unsigned int input_cols = 7;
-  constexpr static unsigned int input_col_quads = 2;
+  sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *)
+  : Parent(3, 3, kernel_rows, kernel_cols, stride_rows, stride_cols)
+  {
+  }
 
-  kern_type kernel = sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+  arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
 
-  sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(const CPUInfo *) {}
+  Parent::KernelType kernel = sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
 };
 
 } // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
index e9378c2a12..a4ee87cce2 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -35,33 +35,24 @@ namespace depthwise {
 
 void sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
 
-struct sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst
+struct sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst : DepthfirstMultiplierStrategy<float, float, float, float>
 {
-  typedef float bias_type;
-  typedef float input_type;
-  typedef float weight_type;
-  typedef float return_type;
-
-  typedef void (*kern_type)(const float *const *const, float *const *const, const void *, const unsigned int, const float, const float);
-
-  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
+  using Parent = DepthfirstMultiplierStrategy<float, float, float, float>;
   constexpr static unsigned int kernel_rows = 5;
   constexpr static unsigned int kernel_cols = 5;
 
   constexpr static unsigned int stride_rows = 1;
   constexpr static unsigned int stride_cols = 1;
 
-  constexpr static unsigned int output_rows = 2;
-  constexpr static unsigned int output_cols = 4;
-
-  constexpr static unsigned int input_rows = 6;
-  constexpr static unsigned int input_cols = 8;
-  constexpr static unsigned int input_col_quads = 2;
+  sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *)
+  : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols)
+  {
+  }
 
-  kern_type kernel = sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+  arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; }
 
-  sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(const CPUInfo *) {}
+  Parent::KernelType kernel = sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
 };
 
 } // namespace depthwise
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
index 6849e562bc..e1f0b50d89 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -28,35 +28,25 @@
 
 #pragma once
 
-#if defined(ARM_COMPUTE_ENABLE_SVE)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
 
 namespace arm_conv {
 namespace depthwise {
 
 void sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
 
-struct sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst
+struct sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst : GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>
 {
-  typedef float bias_type;
-  typedef float input_type;
-  typedef float weight_type;
-  typedef float return_type;
-
-  typedef void (*kern_type)(const float *const *const, float *const *const, const float *, const float *, const unsigned int, const unsigned int, const float, const float);
-
-  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
-  constexpr static unsigned int output_rows(void) { return 2; };
-  constexpr static unsigned int output_cols(void) { return 8; };
-
-  constexpr static unsigned int output_col_regs(void) { return 2; };
-
-  kern_type kernel = sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
-
-  sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *) {}
+  using Parent = GenericDepthfirstMultiplierKernelStrategy<float, float, float, float>;
+  sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(const CPUInfo *)
+  : Parent(2, 8, arm_gemm::VLType::SVE)
+  {
+  }
+  Parent::KernelType kernel = sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
 };
 
 } // namespace depthwise
 } // namespace arm_conv
 
-#endif // defined(ARM_COMPUTE_ENABLE_SVE)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
index 39974fde88..4e2ee43374 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
@@ -34,39 +34,40 @@
 namespace arm_conv {
 namespace depthwise {
 
-void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
+void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *);
 
-struct sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst
+class sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>
 {
-  typedef int32_t bias_type;
-  typedef int8_t input_type;
-  typedef int8_t weight_type;
-  typedef int8_t return_type;
-
-  constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE;
-
-  typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&);
-  typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t);
-  typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &);
+  using Parent = DepthwiseDepthfirstStrategy<int8_t, int8_t, int8_t, int32_t>;
 
+  public:
   constexpr static unsigned int kernel_rows = 3;
   constexpr static unsigned int kernel_cols = 3;
 
   constexpr static unsigned int stride_rows = 1;
   constexpr static unsigned int stride_cols = 1;
 
-  constexpr static unsigned int output_rows = 2;
-  constexpr static unsigned int output_cols = 2;
-
-  constexpr static unsigned int input_rows = 4;
-  constexpr static unsigned int input_cols = 4;
-
-  constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_dot::pack_parameters;
-  constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_dot::get_packed_size;
-
-  kern_type kernel = sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
-
-  sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {}
+  sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {}
+
+  arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; }
+
+  Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl;
+  Parent::KernelType get_kernel(void) const override { return kernel; }
+
+  size_t get_storage_size(const DepthwiseArgs &args) const override
+  {
+    return interleave_sve_s8q_3x3_dot::get_packed_size(args);
+  }
+
+  void pack_parameters(
+    const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp,
+    const void *weights, size_t ld_weight_col, size_t ld_weight_row
+  ) const override
+  {
+    interleave_sve_s8q_3x3_dot::pack_parameters(
+      args.input_channels, buffer, reinterpret_cast<const int32_t *>(biases),
+      reinterpret_cast<const int8_t *>(weights), qp, ld_weight_col, ld_weight_row
+    );
+  }
 };
 
 } // namespace depthwise
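With the quantized dot-product strategy above, parameter packing stops being a pair of constexpr static function pointers and becomes part of the virtual strategy interface, delegating to the existing interleave_sve_s8q_3x3_dot helpers. A driver would now go through those virtuals rather than naming the helper directly — roughly as below, where pack_for is a hypothetical illustration (DepthwiseArgs and arm_gemm::Requantize32 are the library's own types, assumed to be in scope):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Hypothetical helper, not part of this patch: size and fill the packed
    // parameter buffer through the strategy's virtual interface.
    template <typename Strategy>
    std::vector<uint8_t> pack_for(const Strategy &strategy, const DepthwiseArgs &args,
                                  const void *biases, const arm_gemm::Requantize32 &qp,
                                  const void *weights, size_t ld_weight_col, size_t ld_weight_row)
    {
      // The strategy reports how much storage its packed parameters need...
      std::vector<uint8_t> buffer(strategy.get_storage_size(args));
      // ...and then fills the buffer, forwarding to the interleave helper.
      strategy.pack_parameters(args, buffer.data(), biases, qp, weights, ld_weight_col, ld_weight_row);
      return buffer;
    }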
* * SPDX-License-Identifier: MIT * @@ -30,7 +30,15 @@ namespace arm_conv { namespace depthwise { -void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp) +void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl( + const unsigned int n_channels, + const int8_t *const *const inptrs, + const int8_t *params, + const int32_t *, // Bias, should be wrapped into the parameters + const arm_gemm::Requantize32& qp, + const int32_t *, const int32_t *, // Requant parameters, also wrapped + int8_t *const *const outptrs +) { __asm__ __volatile__( "ldp x11, x10, [%x[inptrs], #0x0]\n" @@ -446,7 +454,7 @@ void sve_s8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *cons "b.any 1b\n" "addvl SP, SP, #8\n" : [params] "+&r" (params) - : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp index f788829572..3e9765165c 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,47 +29,35 @@ #pragma once -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); -struct sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst +class sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - - typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } - constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size; + sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} - kern_type kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType kernel = sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; - sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp index 87387960f1..3583308357 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,7 +27,7 @@ #include #include -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { @@ -415,4 +415,4 @@ void sve_s8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp index 5c2b4f6f53..78bcd1407f 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,47 +29,35 @@ #pragma once -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); -struct sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst +class sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - - typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 2; constexpr static unsigned int stride_cols = 2; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 5; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } - constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size; + sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {} - kern_type kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + Parent::KernelType kernel = sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; - sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && 
defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp index b4a1026aaa..ba8c1fdb8d 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,7 +27,7 @@ #include #include -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { @@ -456,4 +456,4 @@ void sve_s8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp index 948c5ad2e7..41ecd520ae 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,47 +29,35 @@ #pragma once -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); -struct sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst +class sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - - typedef void (*kern_type)(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, int8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 6; - constexpr static unsigned int input_cols = 6; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } - constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_5x5_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_5x5_mla::get_packed_size; + sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {} - kern_type kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType kernel = sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; - sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp index 565c145f92..4733c89199 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
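The hunks above all make the same structural change: the old plain structs described each kernel through typedefs and `constexpr` members, while the rewritten classes pass the tile geometry to the strategy base's constructor and expose the remaining metadata through virtual getters, so one driver can hold every kernel behind a common interface. A minimal self-contained C++ sketch of the pattern (the base class here is a simplified stand-in, not the real `DepthwiseDepthfirstStrategy`):

    #include <cstdint>

    // Stand-in for the strategy interface assumed by the new driver; the real
    // base class carries more (padding, packing, working space) than shown.
    class StrategyBase
    {
    public:
      StrategyBase(unsigned int output_rows, unsigned int output_cols,
                   unsigned int kernel_rows, unsigned int kernel_cols,
                   unsigned int stride_rows, unsigned int stride_cols)
      : m_output_rows(output_rows), m_output_cols(output_cols),
        m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
        m_stride_rows(stride_rows), m_stride_cols(stride_cols)
      {
      }
      virtual ~StrategyBase() = default;

      // Geometry is queried at run time rather than read as constexpr members.
      unsigned int get_output_rows() const { return m_output_rows; }
      unsigned int get_output_cols() const { return m_output_cols; }
      unsigned int get_kernel_rows() const { return m_kernel_rows; }
      unsigned int get_kernel_cols() const { return m_kernel_cols; }
      unsigned int get_stride_rows() const { return m_stride_rows; }
      unsigned int get_stride_cols() const { return m_stride_cols; }

      using KernelType = void (*)(unsigned int n_channels);  // simplified signature
      virtual KernelType get_kernel() const = 0;

    private:
      unsigned int m_output_rows, m_output_cols;
      unsigned int m_kernel_rows, m_kernel_cols;
      unsigned int m_stride_rows, m_stride_cols;
    };

    void example_impl(unsigned int /* n_channels */) {}

    // The rewritten shape: the constructor forwards (output, kernel, stride)
    // geometry, and the kernel pointer is reached through a virtual getter.
    class ExampleDepthfirstStrategy : public StrategyBase
    {
    public:
      ExampleDepthfirstStrategy() : StrategyBase(2, 2, 5, 5, 1, 1) {}
      KernelType get_kernel() const override { return example_impl; }
    };

Holding every kernel behind one such interface is presumably what lets the per-kernel driver code collapse into shared templates.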
* * SPDX-License-Identifier: MIT * @@ -27,7 +27,7 @@ #include #include -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { @@ -657,4 +657,4 @@ void sve_s8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp index 176c4f878e..2e8c2019db 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,33 +35,24 @@ namespace depthwise { void sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); -struct sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst +struct sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - + using Parent = DepthfirstMultiplierStrategy; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 2; constexpr static unsigned int stride_cols = 2; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 4; - - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 9; - constexpr static unsigned int input_col_quads = 1; + sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) + : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols) + { + } - kern_type kernel = sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl; + arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; } - sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = sve_s8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp index 10eee34d62..4874fb9a77 100644 --- 
a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,33 +35,24 @@ namespace depthwise { void sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); -struct sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst +struct sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - typedef void (*kern_type)(const int8_t *const *const, int8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - + using Parent = DepthfirstMultiplierStrategy; constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 4; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 8; - constexpr static unsigned int input_cols = 6; - constexpr static unsigned int input_col_quads = 1; + sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) + : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols) + { + } - kern_type kernel = sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl; + arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; } - sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = sve_s8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp index b5c6e983ae..0d185fcafc 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -34,39 +34,40 @@ namespace arm_conv { namespace depthwise { -void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); +void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const int8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, int8_t *const *); -struct sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst +class sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef int8_t input_type; - typedef int8_t weight_type; - typedef int8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - - typedef void (*kern_type)(const int8_t *const *, int8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const int8_t *, const arm_gemm::Requantize32 &, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; - - constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_dot::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_dot::get_packed_size; - - kern_type kernel = sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; - - sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {} + sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} + + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } + + Parent::KernelType kernel = sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + size_t get_storage_size(const DepthwiseArgs &args) const override + { + return interleave_sve_s8q_3x3_dot::get_packed_size(args); + } + + void pack_parameters( + const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, + const void *weights, size_t ld_weight_col, size_t ld_weight_row + ) const override + { + interleave_sve_s8q_3x3_dot::pack_parameters( + args.input_channels, buffer, reinterpret_cast(biases), + reinterpret_cast(weights), qp, ld_weight_col, ld_weight_row + ); + } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp index 095c1de8f2..391e98b561 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
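Parameter packing changes the same way in the `dot` strategies above: `pack_parameters` and `get_packed_size` stop being `constexpr` function pointers and become virtual methods that delegate to the interleave helpers, so a driver can size and fill the packed-parameter buffer without knowing which interleave routine applies. A sketch of the calling sequence under those assumptions (the types below are simplified stand-ins, not the library's):

    #include <cstddef>
    #include <vector>

    struct DepthwiseArgs { unsigned int input_channels; };  // stand-in
    struct Requantize32 {};                                 // stand-in

    // The packing face of the strategy interface, as assumed by this sketch.
    class PackingStrategy
    {
    public:
      virtual ~PackingStrategy() = default;
      virtual size_t get_storage_size(const DepthwiseArgs &args) const = 0;
      virtual void pack_parameters(
        const DepthwiseArgs &args, void *buffer, const void *biases,
        const Requantize32 &qp, const void *weights,
        size_t ld_weight_col, size_t ld_weight_row) const = 0;
    };

    // Driver-side flow: ask the strategy how much space it needs, allocate
    // it, and have the strategy interleave biases and weights into it once.
    std::vector<unsigned char> prepare_parameters(
      const PackingStrategy &strategy, const DepthwiseArgs &args,
      const void *biases, const Requantize32 &qp,
      const void *weights, size_t ld_weight_col, size_t ld_weight_row)
    {
      std::vector<unsigned char> packed(strategy.get_storage_size(args));
      strategy.pack_parameters(args, packed.data(), biases, qp,
                               weights, ld_weight_col, ld_weight_row);
      return packed;
    }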
* * SPDX-License-Identifier: MIT * @@ -30,7 +30,15 @@ namespace arm_conv { namespace depthwise { -void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *const inptrs, int8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp) +void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl( + const unsigned int n_channels, + const int8_t *const *const inptrs, + const int8_t *params, + const int32_t *, // Bias, should be wrapped into the parameters + const arm_gemm::Requantize32& qp, + const int32_t *, const int32_t *, // Requant parameters, also wrapped + int8_t *const *const outptrs +) { __asm__ __volatile__( "ldp x11, x10, [%x[inptrs], #0x0]\n" @@ -377,7 +385,7 @@ void sve_s8qs_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const int8_t *const *con "b.any 1b\n" "addvl SP, SP, #8\n" : [params] "+&r" (params) - : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp index a087e801dc..648b2da163 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -34,39 +34,40 @@ namespace arm_conv { namespace depthwise { -void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); +void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32&, const int32_t *, const int32_t *, uint8_t *const *); -struct sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst +class sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst : public DepthwiseDepthfirstStrategy { - typedef uint32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - - typedef void (*kern_type)(const uint8_t *const *, uint8_t *const *, const void *, uint64_t, const arm_gemm::Requantize32&); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int32_t *, const uint8_t *, const arm_gemm::Requantize32 &, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; - - constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_dot::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_dot::get_packed_size; - - kern_type kernel = sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; - - sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) {} + sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} + + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } + + Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } + size_t get_storage_size(const DepthwiseArgs &args) const override + { + return interleave_sve_u8q_3x3_dot::get_packed_size(args); + } + + void pack_parameters( + const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, + const void *weights, size_t ld_weight_col, size_t ld_weight_row + ) const override + { + interleave_sve_u8q_3x3_dot::pack_parameters( + args.input_channels, buffer, reinterpret_cast(biases), + reinterpret_cast(weights), qp, ld_weight_col, ld_weight_row + ); + } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp index 0d4b9e6687..440f57ed00 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -30,7 +30,15 @@ namespace arm_conv { namespace depthwise { -void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *const inptrs, uint8_t *const *const outptrs, const void *params, const uint64_t n_channels, const arm_gemm::Requantize32& qp) +void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl( + const unsigned int n_channels, + const uint8_t *const *const inptrs, + const uint8_t *params, + const int32_t *, // Bias, should be wrapped into the parameters + const arm_gemm::Requantize32& qp, + const int32_t *, const int32_t *, // Requant parameters, also wrapped + uint8_t *const *const outptrs +) { __asm__ __volatile__( "ldp x11, x10, [%x[inptrs], #0x0]\n" @@ -446,7 +454,7 @@ void sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst_impl(const uint8_t *const *con "b.any 1b\n" "addvl SP, SP, #8\n" : [params] "+&r" (params) - : [inptrs] "r" (inptrs), [n_channels] "r" (n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) + : [inptrs] "r" (inptrs), [n_channels] "r" ((long unsigned int) n_channels), [offsetof_Requantize32_b_offset] "I" (offsetof(arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset] "I" (offsetof(arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval] "I" (offsetof(arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval] "I" (offsetof(arm_gemm::Requantize32, minval)), [outptrs] "r" (outptrs), [qp] "r" (&qp) : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31" ); } diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp index b524fd7c93..1cf20ef721 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
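Note the matching change to the implementation functions in the two `generic.cpp` hunks above: every quantized kernel now takes the channel count first and accepts bias and per-channel requantisation pointers even when, as the comments say, those values are already wrapped into the packed parameters and the arguments are ignored. That gives the driver a single call shape; the `(long unsigned int)` cast in the asm operand list is apparently there because the inline assembly binds `n_channels` to a 64-bit register. A sketch of how a caller can treat all such kernels uniformly (the `Requantize32` stand-in here elides its fields):

    #include <cstdint>

    namespace arm_gemm { struct Requantize32 { /* offsets, clamps, etc. elided */ }; }

    // The shared call shape of the rewritten uint8 quantized kernels.
    using QuantizedKernelFn = void (*)(
      unsigned int n_channels,
      const uint8_t *const *inptrs,
      const uint8_t *params,        // packed weights/biases/requant data
      const int32_t *bias,          // ignored by kernels that wrap it into params
      const arm_gemm::Requantize32 &qp,
      const int32_t *requant_muls,  // likewise possibly wrapped
      const int32_t *requant_shifts,
      uint8_t *const *outptrs);

    void invoke(QuantizedKernelFn kernel, unsigned int n_channels,
                const uint8_t *const *inptrs, const uint8_t *params,
                const arm_gemm::Requantize32 &qp, uint8_t *const *outptrs)
    {
      // For kernels whose params blob already carries the bias and requant
      // vectors, null pointers are acceptable for the wrapped arguments.
      kernel(n_channels, inptrs, params, nullptr, qp, nullptr, nullptr, outptrs);
    }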
* * SPDX-License-Identifier: MIT * @@ -36,37 +36,25 @@ namespace depthwise { void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); -struct sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst +class sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - - typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } - constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_mla::get_packed_size; + sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} - kern_type kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType kernel = sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; - sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp index 52dc4683ad..7bfa5fc4c7 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,7 +27,7 @@ #include #include -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { @@ -415,4 +415,4 @@ void sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp index 981864270d..a794095c6f 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,47 +29,35 @@ #pragma once -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); -struct sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst +class sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - - typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 2; constexpr static unsigned int stride_cols = 2; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 5; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } - constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_3x3_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_3x3_mla::get_packed_size; + sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {} - kern_type kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + Parent::KernelType kernel = sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; - sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // 
defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp index 34ba8ece12..e1b2d257b0 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,7 +27,7 @@ #include #include -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { @@ -456,4 +456,4 @@ void sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp index b1b16c55d3..ac0a00b245 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,47 +29,35 @@ #pragma once -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); -struct sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst +class sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - - typedef void (*kern_type)(unsigned int, const uint8_t *const *, const uint8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const uint8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 6; - constexpr static unsigned int input_cols = 6; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } - constexpr static parameter_packing_fn pack_parameters = interleave_sve_u8q_5x5_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_sve_u8q_5x5_mla::get_packed_size; + sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {} - kern_type kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType kernel = sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; - sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp index 441da6da7a..0b2182f995 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
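The `get_accumulator_depth_vl()` override these `mla` strategies gain reports how many SVE vector lengths of channels the kernel accumulates per pass (2 here). Since SVE vectors are length-agnostic, a driver can only turn that into a channel count at run time. A small sketch of the arithmetic; the function and its use are assumptions for illustration, not the library's API:

    #include <cstddef>

    // Channels covered per pass = accumulator depth (in vector lengths) times
    // the number of lanes one vector holds for the element type in question.
    constexpr size_t channels_per_pass(size_t vector_length_bytes,
                                       unsigned int accumulator_depth_vl,
                                       size_t bytes_per_element)
    {
      return accumulator_depth_vl * (vector_length_bytes / bytes_per_element);
    }

    // e.g. 256-bit SVE vectors (32 bytes), depth 2, 32-bit accumulators:
    static_assert(channels_per_pass(32, 2, 4) == 16, "two vectors of int32 lanes");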
* * SPDX-License-Identifier: MIT * @@ -27,7 +27,7 @@ #include #include -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { @@ -657,4 +657,4 @@ void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl( } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp index dbf70c3f8e..81c954a11b 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,33 +35,24 @@ namespace depthwise { void sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); -struct sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst +struct sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst : DepthfirstMultiplierStrategy { - typedef uint32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - + using Parent = DepthfirstMultiplierStrategy; constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 2; constexpr static unsigned int stride_cols = 2; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 4; - - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 9; - constexpr static unsigned int input_col_quads = 1; + sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) + : Parent(2, 4, kernel_rows, kernel_cols, stride_rows, stride_cols) + { + } - kern_type kernel = sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl; + arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; } - sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp index 90fefdcda3..e7173de65a 100644 --- 
a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,33 +35,24 @@ namespace depthwise { void sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); -struct sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst +struct sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst : DepthfirstMultiplierStrategy { - typedef uint32_t bias_type; - typedef uint8_t input_type; - typedef uint8_t weight_type; - typedef uint8_t return_type; - - typedef void (*kern_type)(const uint8_t *const *const, uint8_t *const *const, const void *, unsigned int, const arm_gemm::Requantize32&); - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - + using Parent = DepthfirstMultiplierStrategy; constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 4; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 8; - constexpr static unsigned int input_cols = 6; - constexpr static unsigned int input_col_quads = 1; + sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) + : Parent(4, 2, kernel_rows, kernel_cols, stride_rows, stride_cols) + { + } - kern_type kernel = sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl; + arm_gemm::VLType get_vl_type() const override { return arm_gemm::VLType::SVE; } - sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(const CPUInfo *) {} + Parent::KernelType kernel = sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst_impl; + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp index 8ab2e5ba2a..3d475daf72 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
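The `input_rows`/`input_cols` constants deleted from the multiplier strategies above are not lost information: each span equals `(output - 1) * stride + kernel`, so the base class can recompute the input patch from the geometry handed to its constructor (`input_col_quads` is a separate packing detail). A standalone C++ check against the values removed in these hunks:

    constexpr unsigned int input_span(unsigned int output, unsigned int stride,
                                      unsigned int kernel)
    {
      return (output - 1) * stride + kernel;
    }

    // 3x3, stride 2, output 2x4: the removed constants were input 5x9.
    static_assert(input_span(2, 2, 3) == 5 && input_span(4, 2, 3) == 9, "3x3/s2");
    // 5x5, stride 1, output 4x2: the removed constants were input 8x6.
    static_assert(input_span(4, 1, 5) == 8 && input_span(2, 1, 5) == 6, "5x5/s1");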
* * SPDX-License-Identifier: MIT * @@ -29,47 +29,35 @@ #pragma once -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); -struct sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst +class sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef int8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - - typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 4; - constexpr static unsigned int input_cols = 4; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } - constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size; + sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 1, 1) {} - kern_type kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl; - sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp index 4b9be8f3e3..dc8fad95fa 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -27,7 +27,7 @@ #include #include -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { @@ -415,4 +415,4 @@ void sve_u8s8u8q_nhwc_3x3_s1_output2x2_mla_depthfirst_impl( } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp index f652e48e42..9a3db20f73 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -29,47 +29,35 @@ #pragma once -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); -struct sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst +class sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef int8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - - typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 3; constexpr static unsigned int kernel_cols = 3; constexpr static unsigned int stride_rows = 2; constexpr static unsigned int stride_cols = 2; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 5; - constexpr static unsigned int input_cols = 5; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } - constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_3x3_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_3x3_mla::get_packed_size; + sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 3, 3, 2, 2) {} - kern_type kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; + Parent::KernelType kernel = sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl; - sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // 
namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp index 400e62d248..9adf100a0f 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -27,7 +27,7 @@ #include #include -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { @@ -456,4 +456,4 @@ void sve_u8s8u8q_nhwc_3x3_s2_output2x2_mla_depthfirst_impl( } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp index f07ea13a03..06ca42eed9 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,47 +29,35 @@ #pragma once -#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) namespace arm_conv { namespace depthwise { void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); -struct sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst +class sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst : public DepthwiseDepthfirstStrategy { - typedef int32_t bias_type; - typedef uint8_t input_type; - typedef int8_t weight_type; - typedef uint8_t return_type; - - constexpr static arm_gemm::VLType vl_type = arm_gemm::VLType::SVE; - - typedef void (*kern_type)(unsigned int, const uint8_t *const *, const int8_t *, const int32_t *, const arm_gemm::Requantize32 &, const int32_t *, const int32_t *, uint8_t *const *); - typedef void (*parameter_packing_fn)(unsigned int, void *, const int8_t *, size_t, size_t); - typedef size_t (*parameter_sizing_fn)(const DepthwiseArgs &); + using Parent = DepthwiseDepthfirstStrategy; + public: constexpr static unsigned int kernel_rows = 5; constexpr static unsigned int kernel_cols = 5; constexpr static unsigned int stride_rows = 1; constexpr static unsigned int stride_cols = 1; - constexpr static unsigned int output_rows = 2; - constexpr static unsigned int output_cols = 2; - - constexpr static unsigned int input_rows = 6; - constexpr static unsigned int input_cols = 6; + arm_gemm::VLType get_vl_type(void) const override { return arm_gemm::VLType::SVE; } + unsigned int get_accumulator_depth_vl(void) const override { return 2; } - constexpr static parameter_packing_fn pack_parameters = interleave_sve_s8q_5x5_mla::pack_parameters; - constexpr static parameter_sizing_fn get_packed_size = interleave_sve_s8q_5x5_mla::get_packed_size; + sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) : Parent(2, 2, 5, 5, 1, 1) {} - kern_type kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; + Parent::KernelType kernel = sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl; - sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(const CPUInfo *) {} + Parent::KernelType get_kernel(void) const override { return kernel; } }; } // namespace depthwise } // namespace arm_conv -#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2) +#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) diff --git a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp index 29582da0f6..9cf95e9588 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/kernels/sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst/generic.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021-2022 Arm Limited. 
 *
 * SPDX-License-Identifier: MIT
 *
@@ -27,7 +27,7 @@
 #include <cstddef>
 #include <cstdint>
-#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#if defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
 namespace arm_conv {
 namespace depthwise {
@@ -657,4 +657,4 @@ void sve_u8s8u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
 } // namespace depthwise
 } // namespace arm_conv
-#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVE2)
+#endif // defined(__aarch64__) && defined(ARM_COMPUTE_ENABLE_SVE)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
new file mode 100644
index 0000000000..e9b29ca877
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/working_space.hpp
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Depthwise kernel drivers commonly require a per-thread blob of working space
+ * in which to store parameters required by the depthwise implementations. The
+ * composition of this working space varies with the driver, kernel, and data
+ * types -- but the tasks of requesting sufficient space, allocating buffer
+ * space, and performing initialisation of the working space are common.
+ *
+ * The classes in this file consist of a number of working space "Elements"
+ * (which are logical units of functionality) and a Workspace type which allows
+ * for compile time composition of elements into a single working space type.
+ *
+ * Creating a workspace
+ * ====================
+ *
+ * A new workspace type can be created by combining Elements as an argument to
+ * the Workspace class. For instance:
+ *
+ *   Workspace<
+ *     depthwise_depthfirst::InputArrayElement<float>,
+ *     InputBufferElement<float>,
+ *     OutputArrayElement<float>
+ *   >
+ *
+ * Creates a new Workspace consisting of the given elements. The workspace type
+ * contained within this class (`Workspace<...>::WorkspaceType`) is equivalent to:
+ *
+ *   struct WorkspaceType
+ *   {
+ *     const float **inptr_array;  // From InputArrayElement
+ *     float *input_buffer;        // From InputBufferElement
+ *     float **outptr_array;       // From OutputArrayElement
+ *     float *output_buffer;       // From OutputArrayElement
+ *   };
+ *
+ * Calling `Workspace<...>::get_sizeof_workspace(...)` will return the amount
+ * of space required to store the above struct and the elements contained
+ * within it.
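To make the composition concrete, a sketch of driver-side usage. The element list follows the example above; `ExampleStrategy` and the empty `Nothing` output-stage type are assumptions standing in for whatever the real driver supplies:

    #include <cstddef>
    #include <vector>

    // Sketch only: assumes the Workspace/WorkspaceArgs templates defined in
    // this file, plus hypothetical ExampleStrategy and Nothing definitions.
    using WS = Workspace<
      depthwise_depthfirst::InputArrayElement<float>,
      InputBufferElement<float>,
      OutputArrayElement<float>
    >;

    void run_with_workspace(const ExampleStrategy *strategy, const DepthwiseArgs &args)
    {
      const WorkspaceArgs<ExampleStrategy, Nothing> ws_args(strategy, args);

      // One blob per thread: size it, allocate it, then initialise in place.
      std::vector<char> buffer(WS::get_sizeof_workspace(ws_args));
      WS::initialise(buffer.data(), ws_args);

      // The WorkspaceType lives at the start of the buffer, and its pointers
      // already address the element storage laid out behind it.
      auto *ws = reinterpret_cast<WS::WorkspaceType *>(buffer.data());
      (void) ws->inptr_array;    // const float **, from InputArrayElement
      (void) ws->output_buffer;  // float *, from OutputArrayElement
    }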
+ * Once this space has been allocated, the workspace can be
+ * initialised by calling `Workspace<...>::initialise` with a pointer to the
+ * buffer and the same arguments. This will place a struct of type
+ * `Workspace<...>::WorkspaceType` at the start of the buffer, and share the
+ * remaining space between the specified elements. As this is all done at
+ * compile time, later code can access elements from the `WorkspaceType` by
+ * name.
+ *
+ * Writing a new element
+ * =====================
+ *
+ * Each Element must provide:
+ *  - A struct called "Workspace" containing the variables contained within
+ *    this portion of the workspace.
+ *  - A static method called `get_element_size` which returns the amount of
+ *    buffer space required by this element of the workspace (NOT including the
+ *    size of the Workspace struct). For example, an element which stores a
+ *    vector of pointers will return the amount of space required to store the
+ *    vector.
+ *  - A static method called `initialise` which accepts a pointer to a struct
+ *    which will be composed of the Element's `Workspace` struct (along with
+ *    other elements), a pointer to the start of the buffer allocated for this
+ *    portion of the workspace, and arguments to be used to initialise the
+ *    workspace. The Element should consume as much of the buffer as it
+ *    requires, initialise the Workspace, and then return the pointer to the
+ *    next free byte of the buffer.
+ *
+ * See the below elements for an example of how this should work.
+ */
+
+#pragma once
+
+#include "depthwise.hpp"
+#include "depthfirst_driver.hpp"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+namespace {  // anonymous because we expect this to appear in several compilation units
+
+/* Arguments to use to size and initialise a workspace.
+ */
+template <typename StratType, typename OutputStage>
+struct WorkspaceArgs
+{
+  const StratType *strategy;
+  const DepthwiseArgs &depthwise_args;
+  const OutputStage &output_stage;
+
+  WorkspaceArgs(const StratType *strat, const DepthwiseArgs &dwargs, const OutputStage &os = {})
+  : strategy(strat), depthwise_args(dwargs), output_stage(os)
+  {
+  }
+};
+
+
+/* Sometimes we use templated structs to fill in workspace types; the Empty
+ * element can be useful when a blank element is required for some sets of
+ * parameters.
+ */
+struct EmptyElement
+{
+  struct Workspace {};
+
+  template <typename StratType, typename OutputStage>
+  static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &) { return 0; }
+
+  template <typename WorkspaceType, typename StratType, typename OutputStage>
+  static void *initialise(WorkspaceType *, void *buffer, const WorkspaceArgs<StratType, OutputStage> &)
+  {
+    return buffer;
+  }
+};
+
+
+/* Store fused activations for a kernel.
+ *
+ * Activations are set based on the DepthwiseArgs.
+ */
+template <typename T, class OutputStage = Nothing>
+class ActivationsElement
+{
+  public:
+  struct Workspace
+  {
+    T activation_min, activation_max;
+  };
+
+  template <typename StratType>
+  static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &)
+  {
+    return 0;
+  }
+
+  template <class WorkspaceType, typename StratType>
+  static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+  {
+    ws->activation_min = static_cast<T>(-std::numeric_limits<float>::infinity());
+    ws->activation_max = static_cast<T>(std::numeric_limits<float>::infinity());
+
+    switch (args.depthwise_args.activation.type)
+    {
+      case arm_gemm::Activation::Type::BoundedReLU:
+        ws->activation_max = static_cast<T>(args.depthwise_args.activation.param1);
+        // Fall through
+      case arm_gemm::Activation::Type::ReLU:
+        ws->activation_min = static_cast<T>(0);
+        break;
+      default:
+        break;
+    }
+
+    return buffer;
+  }
+};
+
+/* Activation clamps are contained within `arm_gemm::Requantize32`, so if the
+ * output stage is one of these we substitute in an empty workspace element.
+ */
+template <typename T>
+class ActivationsElement<T, arm_gemm::Requantize32> : public EmptyElement
+{
+};
+
+
+/* Get the value with which to fill an input buffer. This defaults to `0`
+ * (which we return as a `char` since it gets used by `memset`).
+ */
+template <typename OutputStage>
+char get_input_buffer_fill_value(const OutputStage &)
+{
+  return 0;
+}
+
+/* In the case of kernels operating on quantized data, we need to fill the
+ * input buffer with the zero offset of the input tensor.
+ */
+template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp) __attribute__ ((unused));
+template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp)
+{
+  return qp.a_offset;
+}
+
+
+/* Container for a vector of padding values which can be safely consumed by
+ * the depthwise kernel. The padding values are initialised to either `0` or
+ * the zero offset of the input tensor (if quantized).
+ */
+template <typename T>
+class InputBufferElement
+{
+  public:
+  struct Workspace
+  {
+    T *input_buffer;
+  };
+
+  template <typename StratType, typename OutputStage>
+  static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
+  {
+    return sizeof(T) * args.depthwise_args.input_channels;
+  }
+
+  template <class WorkspaceType, typename StratType, typename OutputStage>
+  static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+  {
+    ws->input_buffer = reinterpret_cast<T *>(buffer);
+    memset(ws->input_buffer, get_input_buffer_fill_value(args.output_stage), get_element_size(args));
+    return reinterpret_cast<char *>(buffer) + get_element_size(args);
+  }
+};
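+
+/* Worked example (values assumed for illustration): with a floating-point
+ * output stage the buffer is zero-filled, whereas for an
+ * arm_gemm::Requantize32 output stage with a_offset = 128 every byte of the
+ * input buffer is set to 128 -- the quantized representation of zero -- so
+ * that reads of padding behave as if the padded values were really zero.
+ */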
+
+
+/* Container for an array of output pointers, and a buffer which can be used
+ * as a destination for unnecessary writes.
+ */
+template <typename T>
+class OutputArrayElement
+{
+  public:
+  struct Workspace
+  {
+    T **outptr_array;
+    T *output_buffer;
+  };
+
+  template <typename StratType, typename OutputStage>
+  static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
+  {
+    return sizeof_outptr_array(args) + sizeof_output_buffer(args);
+  }
+
+  template <class WorkspaceType, typename StratType, typename OutputStage>
+  static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+  {
+    char *buffer_bytes = reinterpret_cast<char *>(buffer);
+
+    ws->outptr_array = reinterpret_cast<T **>(buffer_bytes);
+    buffer_bytes += sizeof_outptr_array(args);
+
+    ws->output_buffer = reinterpret_cast<T *>(buffer_bytes);
+    buffer_bytes += sizeof_output_buffer(args);
+
+    return buffer_bytes;
+  }
+
+  protected:
+  template <typename StratType, typename OutputStage>
+  static size_t sizeof_outptr_array(const WorkspaceArgs<StratType, OutputStage> &args)
+  {
+    return sizeof(T **) * args.strategy->get_output_rows() * args.strategy->get_output_cols();
+  }
+
+  template <typename StratType, typename OutputStage>
+  static size_t sizeof_output_buffer(const WorkspaceArgs<StratType, OutputStage> &args)
+  {
+    return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+  }
+};
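+
+/* For example, for a strategy producing a 2x2 output tile, `outptr_array`
+ * holds four pointers; when a tile overhangs the right or bottom edge of the
+ * output tensor, the out-of-bounds entries can be pointed at `output_buffer`,
+ * which is sized to absorb one full-depth output point
+ * (input_channels * channel_multiplier elements).
+ */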
+
+
+/* Container for requantization parameters.
+ *
+ * This removes the distinction between per-layer and per-channel
+ * requantization parameters by providing a vector of requantization
+ * parameters regardless of whether per-layer or per-channel is selected.
+ */
+class RequantizationParametersElement
+{
+  public:
+  struct Workspace
+  {
+    const int32_t *bias, *requant_muls, *requant_shifts;
+  };
+
+  template <typename StratType>
+  static size_t get_element_size(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+  {
+    return sizeof_bias(args) + sizeof_requant_muls(args) + sizeof_requant_shifts(args);
+  }
+
+  template <class WorkspaceType, typename StratType>
+  static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+  {
+    const auto n_output_channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
+    char *buffer_bytes = reinterpret_cast<char *>(buffer);
+
+    ws->bias = args.output_stage.bias;
+    ws->requant_muls = args.output_stage.per_channel_muls;
+    ws->requant_shifts = args.output_stage.per_channel_right_shifts;
+
+    if (ws->bias == nullptr)
+    {
+      ws->bias = reinterpret_cast<const int32_t *>(buffer_bytes);
+      memset(buffer_bytes, 0, sizeof_bias(args));
+      buffer_bytes += sizeof_bias(args);
+    }
+
+    if (ws->requant_muls == nullptr)
+    {
+      ws->requant_muls = reinterpret_cast<const int32_t *>(buffer_bytes);
+      auto muls = reinterpret_cast<int32_t *>(buffer_bytes);
+      buffer_bytes += sizeof_requant_muls(args);
+
+      for (auto n = 0u; n < n_output_channels; n++)
+      {
+        muls[n] = args.output_stage.per_layer_mul;
+      }
+    }
+
+    if (ws->requant_shifts == nullptr)
+    {
+      ws->requant_shifts = reinterpret_cast<const int32_t *>(buffer_bytes);
+      auto shifts = reinterpret_cast<int32_t *>(buffer_bytes);
+      buffer_bytes += sizeof_requant_shifts(args);
+
+      for (auto n = 0u; n < n_output_channels; n++)
+      {
+        shifts[n] = args.output_stage.per_layer_right_shift;
+      }
+    }
+
+    return buffer_bytes;
+  }
+
+  protected:
+  template <typename StratType>
+  static size_t sizeof_bias(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+  {
+    return args.output_stage.bias != nullptr ?
+      0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
+  }
+
+  template <typename StratType>
+  static size_t sizeof_requant_muls(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+  {
+    return args.output_stage.per_channel_muls != nullptr ?
+      0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
+  }
+
+  template <typename StratType>
+  static size_t sizeof_requant_shifts(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
+  {
+    return args.output_stage.per_channel_right_shifts != nullptr ?
+      0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
+  }
+};
+
+
+template <typename ...Elements>
+class Workspace;
+
+template <typename Element, typename ...Elements>
+class Workspace<Element, Elements...>
+{
+  public:
+  struct WorkspaceType : Element::Workspace, Workspace<Elements...>::WorkspaceType
+  {
+  };
+
+  template <typename StratType, typename OutputStage>
+  static void initialise(void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+  {
+    // Allocate sufficient space for the struct, then initialise each of the
+    // elements in turn.
+    auto ws = reinterpret_cast<WorkspaceType *>(buffer);
+    initialise_elements(ws, ws + 1, args);
+  }
+
+  template <typename StratType, typename OutputStage>
+  static size_t get_sizeof_workspace(const WorkspaceArgs<StratType, OutputStage> &args)
+  {
+    return sizeof(WorkspaceType) + get_element_sizes(args);
+  }
+
+  template <typename StratType, typename OutputStage>
+  static inline size_t get_element_sizes(const WorkspaceArgs<StratType, OutputStage> &args)
+  {
+    return Element::get_element_size(args) + Workspace<Elements...>::get_element_sizes(args);
+  }
+
+  template <typename StratType, typename OutputStage>
+  static void initialise_elements(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
+  {
+    buffer = Element::initialise(ws, buffer, args);  // Get the next free byte of the buffer
+    Workspace<Elements...>::initialise_elements(ws, buffer, args);
+  }
+};
+
+template <>
+class Workspace<>
+{
+  public:
+  struct WorkspaceType
+  {
+  };
+
+  template <typename StratType, typename OutputStage>
+  static inline size_t get_element_sizes(const WorkspaceArgs<StratType, OutputStage> &)
+  {
+    return 0;
+  }
+
+  template <class WorkspaceType, typename StratType, typename OutputStage>
+  static void initialise_elements(WorkspaceType *, void *, const WorkspaceArgs<StratType, OutputStage> &)
+  {
+  }
+};
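+
+/* To illustrate the recursion above with two elements A and B (assumed):
+ *
+ *   Workspace<A, B>::get_element_sizes(args)
+ *     == A::get_element_size(args) + B::get_element_size(args) + 0
+ *
+ * while `Workspace<A, B>::WorkspaceType` inherits from both `A::Workspace`
+ * and `B::Workspace`, which is why every element can name its own members
+ * through the shared `ws` pointer passed to `initialise_elements`.
+ */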
+
+}  // namespace {anonymous}
+}  // namespace depthwise
+}  // namespace arm_conv
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index 9262ea05a4..3998dfbc9a 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -59,6 +59,8 @@ struct DepthwiseArgs
 
   const DepthwiseConfig *config;
 
+  bool fast_mode = false;
+
   DepthwiseArgs(
     const CPUInfo *cpu_info,
     unsigned int kernel_rows, unsigned int kernel_cols,
@@ -83,15 +85,18 @@ private:
   protected:
   const DepthwiseArgs m_args;  // Copy of arguments
 
+  public:
   std::string name() const { return _name; }
+
+  void set_name(const std::string &n) { _name = n; }
 
   DepthwiseCommon(const DepthwiseArgs &args) : m_args(args) {};
   DepthwiseCommon(DepthwiseCommon &) = delete;
@@ -103,7 +108,7 @@ public:
     void *const output, void *const working_space,
     const unsigned int thread_id,
-    const unsigned int n_threads) const override
+    const unsigned int n_threads) const override final
   {
     const size_t ld_input_col = m_args.input_channels;
     const size_t ld_input_row = ld_input_col * m_args.input_cols;
@@ -130,7 +135,7 @@ public:
     size_t ld_output_batch,
     void *const working_space,
     const unsigned int thread_id,
-    const unsigned int n_threads) const override
+    const unsigned int n_threads) const override final
   {
     execute(
       m_args.n_batches, m_args.input_rows, m_args.input_cols,
@@ -142,7 +147,36 @@ public:
       working_space, thread_id, n_threads);
   }
 
-  virtual void execute(
+  void execute(
+    unsigned int batches,
+    unsigned int input_height,
+    unsigned int input_width,
+    unsigned int channels,
+    const PaddingValues &padding,
+    const void *input,
+    size_t ld_input_col,
+    size_t ld_input_row,
+    size_t ld_input_batch,
+    const void *parameters,
+    unsigned int output_height,
+    unsigned int output_width,
+    void *output,
+    size_t ld_output_col,
+    size_t ld_output_row,
+    size_t ld_output_batch,
+    void *working_space,
+    unsigned int thread_id,
+    unsigned int n_threads) const override final
+  {
+    this->execute_internal(
+      batches, input_height, input_width, channels, padding, input,
+      ld_input_col, ld_input_row, ld_input_batch, parameters, output_height,
+      output_width, output, ld_output_col, ld_output_row, ld_output_batch,
+      working_space, thread_id, n_threads);
+  }
+
+  protected:
+  virtual void execute_internal(
     unsigned int batches,
     unsigned int input_height,
     unsigned int input_width,
@@ -161,7 +195,7 @@
     size_t ld_output_batch,
     void *working_space,
     unsigned int thread_id,
-    unsigned int n_threads) const override = 0;
+    unsigned int n_threads) const = 0;
 };
 
 template
-- 
cgit v1.2.1
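Note on the interface change above: `execute` is now a set of `final` public
entry points which funnel into a single protected virtual `execute_internal`,
so each driver overrides exactly one method and the argument marshalling lives
in one place. A minimal sketch of this non-virtual-interface pattern (the
names `Base`, `run` and `Impl` are illustrative, not taken from the patch):

    #include <cstdio>

    struct Base
    {
      virtual ~Base() = default;

      // Public, non-virtual entry point; shared marshalling would live here.
      void run(int n) const { this->run_internal(n); }

      protected:
      // The single override point for subclasses.
      virtual void run_internal(int n) const = 0;
    };

    struct Impl : Base
    {
      protected:
      void run_internal(int n) const override { std::printf("n = %d\n", n); }
    };

    int main()
    {
      Impl impl;
      impl.run(7);  // Dispatches through Base::run to Impl::run_internal.
      return 0;
    }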